#### Importing Libraries

In [84]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


#### Loading Data

In [85]:
df=pd.read_csv("../Dataset/HR_comma_sep.csv.crdownload", encoding = 'utf-8')

In [86]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [87]:
def reading_cleaning(df):
    cols=df.columns.tolist()
    df.columns=[x.lower() for x in cols]
    df.drop_duplicates(inplace=True)
    
    return df
df = reading_cleaning(df)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


#### Provide details regarding Everation employees, using a different approach.

In [88]:
def employee_important_info(df):
    # Average satisfaction level
    average_satisfaction = df['satisfaction_level'].mean()
    # Department-wise average satisfaction level
    department_satisfaction = df.groupby('department')['satisfaction_level'].mean()
    # Salary-wise average satisfaction level
    salary_satisfaction = df.groupby('salary')['satisfaction_level'].mean()

    # Employees who left
    left_employees = len(df[df['left'] == 1])
    # Employees who stayed
    stayed_employees = len(df[df['left'] == 0])
    
    return average_satisfaction,department_satisfaction,salary_satisfaction,left_employees,stayed_employees

In [89]:
average_satisfaction,department_satisfaction,salary_satisfaction,left_employees,stayed_employees = employee_important_info(df)

In [90]:
print("Average Satisfaction Level:", average_satisfaction)
print("Department-wise Average Satisfaction Level:\n", department_satisfaction)
print("Salary-wise Average Satisfaction Level:\n", salary_satisfaction)
print("Employees who left:\n", left_employees)
print("Employees who stayed:\n", stayed_employees)

Average Satisfaction Level: 0.6025083272990586
Department-wise Average Satisfaction Level:
 department
IT             0.596568
RandD          0.606976
accounting     0.578370
hr             0.595282
management     0.599833
marketing      0.605653
product_mng    0.607022
sales          0.601585
support        0.614926
technical      0.601670
Name: satisfaction_level, dtype: float64
Salary-wise Average Satisfaction Level:
 salary
h         0.240000
high      0.627799
low       0.592670
medium    0.610444
Name: satisfaction_level, dtype: float64
Employees who left:
 1991
Employees who stayed:
 4914


#### Pie Plot for Different Features

In [91]:
import plotly.graph_objects as go
import pandas as pd

def plots_3d(df, col):
    values = df[col].unique()
    labels = [f'{value} ({col})' for value in values]
    counts = df[col].value_counts()

    fig = go.Figure(data=[go.Pie(labels=labels, values=counts, hole=0.3)])

    # Set layout
    fig.update_layout(
        title=f"Distribution of {col}",
        scene=dict(
            aspectmode="cube",
            camera=dict(
                eye=dict(x=1.5, y=1.5, z=1.5)
            )
        ),
        height=600,
        width=800
    )

    # Save the plot as HTML file
    fig.write_html("left.html")

    # Show the plot
    fig.show()

# Assuming 'df' is your DataFrame and 'left' is a column in it
plots_3d(df, 'left')

In [92]:
def save_3d_pie_chart(df, col):
    fig = go.Figure(data=[go.Pie(labels=[f'{value} ({col})' for value in df[col].unique()], values=df[col].value_counts(), hole=0.3)])
    fig.update_layout(title=f"Distribution of {col}", scene=dict(aspectmode="cube", camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))), height=600, width=800)
    fig.write_html("salary.html")
    fig.show()

# Assuming 'df' is your DataFrame and 'salary' is a column in it
save_3d_pie_chart(df, 'salary')

In [93]:
def save_3d_pie_chart(df, col):
    fig = go.Figure(data=[go.Pie(labels=[f'{value} ({col})' for value in df[col].unique()], values=df[col].value_counts(), hole=0.3)])
    fig.update_layout(title=f"Distribution of {col}", scene=dict(aspectmode="cube", camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))), height=600, width=800)
    fig.write_html("number_project.html")
    fig.show()

# Assuming 'df' is your DataFrame and 'number_project' is a column in it
save_3d_pie_chart(df, 'number_project')

In [94]:
def save_3d_pie_chart(df, col):
    fig = go.Figure(data=[go.Pie(labels=[f'{value} ({col})' for value in df[col].unique()], values=df[col].value_counts(), hole=0.3)])
    fig.update_layout(title=f"Distribution of {col}", scene=dict(aspectmode="cube", camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))), height=600, width=800)
    fig.write_html(f"{col}.html")
    fig.show()

# Assuming 'df' is your DataFrame and 'department' is a column in it
save_3d_pie_chart(df, 'department')

#### Employee Distribution

In [95]:
import plotly.express as px

def distribution_3d(df, col):
    fig = px.scatter_3d(df, x=col, y='satisfaction_level', z='last_evaluation', color='left',
                         symbol='left', opacity=0.7, size_max=10)

    # Update layout
    fig.update_layout(
        title=f"Distribution of {col}",
        scene=dict(
            xaxis_title=col.capitalize(),
            yaxis_title='Satisfaction Level',
            zaxis_title='Last Evaluation'
        ),
        showlegend=True,
        height=600,
        width=800
    )

    # Save the plot as an HTML file with the desired name
    file_name = f"Distribution of {col}.html"
    fig.write_html(file_name)
    
    # Show the plot
    fig.show()

# Assuming 'df' is your DataFrame
distribution_3d(df, 'salary')

In [96]:
def distribution_3d(df, col):
    fig = px.scatter_3d(df, x=col, y='satisfaction_level', z='last_evaluation', color='left',
                         symbol='left', opacity=0.7, size_max=10)

    # Update layout
    fig.update_layout(
        title=f"Distribution of {col}",
        scene=dict(
            xaxis_title=col.capitalize(),
            yaxis_title='Satisfaction Level',
            zaxis_title='Last Evaluation'
        ),
        showlegend=True,
        height=600,
        width=800
    )

    # Save the plot as an HTML file with the desired name
    file_name = f"Distribution of {col}.html"
    fig.write_html(file_name)
    
    # Show the plot
    fig.show()

# Assuming 'df' is your DataFrame
distribution_3d(df, 'department')

#### Comparison

In [97]:
def comparison_3d(df, x, y):
    fig = px.scatter_3d(df, x=x, y=y, z='last_evaluation', color='left',
                         symbol='left', opacity=0.7, size_max=10)

    # Update layout
    fig.update_layout(
        title=f'{x.capitalize()} vs {y.capitalize()} vs Last Evaluation Comparison',
        scene=dict(
            xaxis_title=x.capitalize(),
            yaxis_title=y.capitalize(),
            zaxis_title='Last Evaluation'
        ),
        showlegend=True,
        height=600,
        width=800
    )

    # Save the plot as an HTML file with the desired name
    file_name = "Deptvsslvslec.html"
    fig.write_html(file_name)
    
    # Show the plot
    fig.show()

# Assuming 'df' is your DataFrame
comparison_3d(df, 'department', 'satisfaction_level')

#### Feature Correlation (Left)

In [98]:
def corr_with_left_3d(df):
    # One-hot encode categorical variables
    df_encoded = pd.get_dummies(df)
    
    # Calculate correlations with the 'left' column
    correlations = df_encoded.corr()['left'].sort_values(ascending=False)[:-1]
    
    # Determine colors based on correlation values
    colors = ['skyblue' if corr >= 0 else 'salmon' for corr in correlations]
    
    # Create the figure
    fig = go.Figure(data=[go.Bar(x=correlations.values, y=correlations.index, marker=dict(color=colors), orientation='h')])
    
    # Update layout
    fig.update_layout(
        title='Correlation with Left',
        xaxis_title='Correlation',
        yaxis_title='Features',
        height=800,
        width=1000
    )
    
    # Save the plot as an HTML file with the desired name
    file_name = "feature_correlation.html"
    fig.write_html(file_name)
    
    # Show the plot
    fig.show()

# Assuming 'df' is your DataFrame
corr_with_left_3d(df)

#### Churn Analysis

This code presents two histograms side by side for better visualization. The first histogram shows how data in a specific column ('col') is distributed across different ranges. Each bar's color represents whether the corresponding data point led to an employee leaving or staying ('left' column).

The second histogram is a smoother representation of the relationship between 'satisfaction_level' and 'last_evaluation'. This plot helps to understand if there's any correlation between these two factors and employee turnover. Again, the colors indicate whether the employees left or stayed.

In [99]:
def histogram_3d(df, col):
    # Create the 3D histogram for Satisfaction vs Evaluation
    fig = go.Figure(data=[go.Histogram2d(x=df['satisfaction_level'], y=df['last_evaluation'], 
                                          colorscale='viridis')])
    
    # Update layout for Satisfaction vs Evaluation plot
    fig.update_layout(title="Satisfaction vs Evaluation",
                      scene=dict(xaxis_title="Satisfaction Level",
                                 yaxis_title="Last Evaluation"))
    
    # Save the Satisfaction vs Evaluation plot as an HTML file
    fig.write_html("satisvseval.html")
    
    # Show the Satisfaction vs Evaluation plot
    fig.show()
    
    # Create a 2D histogram for Distribution of satisfaction_level
    fig2 = go.Figure(data=[go.Histogram(x=df[col], 
                                        marker_color='rgba(140, 240, 140, 0.7)')])

    # Update layout for Distribution of satisfaction_level plot
    fig2.update_layout(title=f"Distribution of {col}",
                       xaxis_title=col.capitalize(),
                       yaxis_title="Count")
    
    # Save the Distribution of satisfaction_level plot as an HTML file
    fig2.write_html("Satisfaction_level.html")
    
    # Show the Distribution of satisfaction_level plot
    fig2.show()

# Call the function with 'satisfaction_level' as the column of interest
histogram_3d(df, 'satisfaction_level')

#### K-Means Clustering

In [100]:
from sklearn.cluster import KMeans

def leftKmeans_3d(df):
    # Filter DataFrame for employees who left
    df_left = df[df['left'] == 1].copy()

    # Define features for clustering
    features = ['satisfaction_level', 'last_evaluation']

    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=3, random_state=2)
    df_left['kmean_label'] = kmeans.fit_predict(df_left[features])

    # Create 3D scatter plot for clusters
    fig = go.Figure()

    # Add traces for each cluster
    for cluster in df_left['kmean_label'].unique():
        cluster_data = df_left[df_left['kmean_label'] == cluster]
        fig.add_trace(go.Scatter3d(x=cluster_data['satisfaction_level'],
                                   y=cluster_data['last_evaluation'],
                                   z=[cluster] * len(cluster_data),
                                   mode='markers',
                                   marker=dict(color=cluster,
                                               size=5),
                                   name=f'Cluster {cluster}'))

    # Set layout
    fig.update_layout(scene=dict(xaxis_title='Satisfaction Level',
                                 yaxis_title='Evaluation',
                                 zaxis_title='Cluster'),
                      title='Clusters of Employee Churn',
                      showlegend=True,
                      legend=dict(title='Cluster'),
                      margin=dict(l=0, r=0, t=40, b=0),  # Adjust margin for better appearance
                      scene_camera=dict(eye=dict(x=1.25, y=-1.25, z=1.25)))  # Adjust camera position

    # Add annotation for cluster centers
    for cluster, center in enumerate(kmeans.cluster_centers_):
        fig.add_trace(go.Scatter3d(x=[center[0]], y=[center[1]], z=[cluster],
                                    mode='markers',
                                    marker=dict(color='black',
                                                size=10,
                                                symbol='x'),
                                    showlegend=False,
                                    name=f'Cluster {cluster} Center'))

    # Save the plot as an HTML file
    fig.write_html("kmeans.html")

    # Show the plot
    fig.show()

    return df_left

df1 = leftKmeans_3d(df)

In [101]:
# Filter non-numeric columns
numeric_columns = df1.select_dtypes(include=['number']).columns

# Calculate the mean for numeric columns
df1_mean = df1.groupby('kmean_label')[numeric_columns].mean()

In [102]:
df1_mean.head()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,kmean_label
kmean_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.410133,0.517195,2.172949,149.640798,3.06541,0.053215,1.0,0.005543,0.0
1,0.805874,0.912577,4.511712,242.003604,5.012613,0.052252,1.0,0.003604,1.0
2,0.111199,0.869064,6.11985,271.840824,4.08427,0.052434,1.0,0.001873,2.0
