In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy import stats
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from scipy.stats import ttest_ind


In [None]:
# Load the employee-attrition dataset from a CSV file (path relative to the working directory).
data = pd.read_csv('Employee_Attrition_Data_Set.csv')  # TODO: set the dataset path

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Descriptive summary statistics of the frame.
data.describe()

In [None]:
# Column names, non-null counts and dtypes.
data.info()

In [None]:
# Detect columns that hold date strings (YYYY-MM-DD or YYYY/MM/DD) and convert them to datetime.
# A column qualifies only when EVERY non-null value contains a date; requiring a single
# match would let one stray date-like value turn a whole text column into NaT via
# errors='coerce'. Numeric and already-converted columns are skipped.
date_pattern = r'\b\d{4}[-/]\d{2}[-/]\d{2}\b'
date_columns = []
for col in data.columns:
    if pd.api.types.is_numeric_dtype(data[col]) or pd.api.types.is_datetime64_any_dtype(data[col]):
        continue
    non_null_text = data[col].dropna().astype(str)
    # An all-null column carries no evidence of being a date column, so leave it alone.
    if not non_null_text.empty and non_null_text.str.contains(date_pattern).all():
        date_columns.append(col)
# Convert date-formatted columns to datetime types
for col in date_columns:
    data[col] = pd.to_datetime(data[col], errors='coerce')
# TODO: either delete everything above and write data['A'] = pd.to_datetime(data['A']), or delete this TODO
# Print the converted data types
print(data.dtypes)

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Visualise where the missing values sit: one cell per (row, column) of the frame.
fig, ax = plt.subplots(figsize=(12, 6))
missing_mask = data.isnull()
sns.heatmap(missing_mask, cbar=True, cmap='viridis', ax=ax)
ax.set_title('Heat map of missing values')
plt.show()

In [None]:
# Share of missing values in each column
missing_ratio = data.isnull().mean()
# Drop columns with more than 50% missing values
cols_to_drop = missing_ratio[missing_ratio > 0.5].index
data = data.drop(columns=cols_to_drop)
# Drop identifier columns that carry no predictive meaning.
# errors='ignore' keeps the cell re-runnable and also covers the case where the
# column was already removed by the missing-ratio rule above.
data = data.drop(columns=['Employee ID'], errors='ignore')  # TODO: list meaningless attributes such as the employee ID, columns=['A', 'B', ...]

In [None]:
# Fill in missing values column by column:
#   string (object) columns -> most frequent value
#   numeric columns         -> mean
#   datetime columns        -> median
# The result is assigned back to the frame instead of calling fillna(inplace=True)
# on data[column]: that chained in-place form is deprecated and has no effect
# under pandas Copy-on-Write.
for column in data.columns:
    series = data[column]
    if not series.isnull().any():
        continue  # nothing to fill; also leaves integer dtypes untouched
    if series.dtype == 'object':  # For string columns
        modes = series.mode()
        if modes.empty:
            continue  # column is entirely missing, so there is no mode to fill with
        fill_value = modes[0]
    elif pd.api.types.is_numeric_dtype(series):  # For numeric columns
        fill_value = series.mean()
    elif pd.api.types.is_datetime64_any_dtype(series):  # For the time column (tz-aware included)
        fill_value = series.median()
    else:
        continue  # other dtypes are left as they are
    data[column] = series.fillna(fill_value)

In [None]:
# One histogram per numeric column.
num_features = data.select_dtypes(include=[np.number]).columns
hist_options = {'figsize': (12, 8), 'bins': 15, 'edgecolor': 'black'}
data[num_features].hist(**hist_options)
plt.suptitle('Histogram of numerical characteristics')
plt.show()

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated heat map.
num_features = data.select_dtypes(include=[np.number])
correlation = num_features.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('correlation heat map')
plt.show()

In [None]:
# Count plot for every non-numeric (categorical) column, stacked vertically in one figure.
object_columns = data.select_dtypes(include=['object']).columns
n_panels = len(object_columns)
fig = plt.figure(figsize=(15, 10))
for position, col in enumerate(object_columns, start=1):
    # One row per categorical column
    panel = fig.add_subplot(n_panels, 1, position)
    sns.countplot(y=data[col], palette='viridis', ax=panel)
    panel.set_title(f'Distribution of {col}')
    panel.set_xlabel('Count')
    panel.set_ylabel(col)
# Keep titles and tick labels from overlapping
fig.tight_layout()
plt.show()


In [None]:
# Box plot of the monthly income values.
sns.boxplot(data['Monthly Income'])  # TODO: numeric column

In [None]:
# Total monthly income per department, ordered by department name.
income_by_department = data.groupby('Department')['Monthly Income'].sum()  # TODO: label column, numeric column
income_by_department = income_by_department.sort_index()
income_by_department.plot()

In [None]:
# Average last raise percentage per job role, ordered by role name.
mean_raise_by_role = data.groupby('Job Role')['Last Raise Percentage'].mean()  # TODO: label column, numeric column
mean_raise_by_role = mean_raise_by_role.sort_index()
mean_raise_by_role.plot()

In [None]:
# Monthly income distribution for each value of the target variable.
sns.boxplot(x='Attrition', y='Monthly Income', data=data)  # TODO: target variable, numeric column

In [None]:
# Histogram of the commute distance.
plt.hist(data['Distance from Office'], alpha=0.7, color='blue')  # TODO: numeric column

In [None]:
# Compare the column against a normal distribution: Q-Q plot, then skewness and excess kurtosis.
column = 'Monthly Income'  # TODO: numeric column
column_values = data[column]
stats.probplot(column_values, dist='norm', plot=plt)
skewness = stats.skew(column_values)
excess_kurtosis = stats.kurtosis(column_values, fisher=True)
print(skewness)
print(excess_kurtosis)

In [None]:
# Q-Q plot of the column against an exponential distribution.
column = 'Monthly Income'  # TODO: numeric column
stats.probplot(data[column], dist='expon', plot=plt)

In [None]:
# Stacked bar chart: count of non-null Age values per job role, split by department.
data.groupby(['Job Role', 'Department'])['Age'].count().unstack().plot(kind='bar', stacked=True)  # TODO: label column, label column, numeric column

In [None]:
# Three side-by-side box plots of numeric columns grouped by department.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 5))
by_department = dict(x='Department', data=data, palette='tab10')  # TODO: label column A
sns.boxplot(y='Monthly Income', ax=ax1, **by_department)  # TODO: numeric column B
sns.boxplot(y='Years at Company', ax=ax2, **by_department)  # TODO: numeric column C
sns.boxplot(y='Number of Promotions', ax=ax3, **by_department)  # TODO: numeric column D

In [None]:
# Three side-by-side box plots of numeric columns per job role, split by the target variable.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(25, 10))
by_role_and_target = dict(x='Job Role', hue='Attrition', data=data, palette='tab10')  # TODO: label column A, target variable
sns.boxplot(y='Last Raise Percentage', ax=ax1, **by_role_and_target)  # TODO: numeric column B
sns.boxplot(y='Distance from Office', ax=ax2, **by_role_and_target)  # TODO: numeric column C
sns.boxplot(y='Job Satisfaction', ax=ax3, **by_role_and_target)  # TODO: numeric column D

In [None]:
# Two-sample t-test on every numeric column: employees who left vs. employees who stayed.
attrition_flag = data['Attrition']  # TODO: target variable (here Attrition is either Yes or No)
left_employees = data[attrition_flag == 'Yes']  # TODO: target value 1
stayed_employees = data[attrition_flag == 'No']  # TODO: target value 2
numerical_cols = data.select_dtypes(include='number').columns
ttest_results = {}
for col in numerical_cols:
    # nan_policy='omit' ignores missing values instead of returning NaN
    outcome = ttest_ind(left_employees[col], stayed_employees[col], nan_policy='omit')
    t_stat, p_value = outcome
    ttest_results[col] = {'t_stat': t_stat, 'p_value': p_value}
# One row per numeric column, with t_stat / p_value as the columns
ttest_results_df = pd.DataFrame(ttest_results).T
ttest_results_df

In [None]:
# Separate features from the target and encode the target as 0/1.
X, y = data.drop(columns=['Attrition']), data['Attrition']  # TODO: target variable
y = y.map({'Yes': 1, 'No': 0})  # TODO: map the labels to numbers
# Series.map turns any label missing from the mapping into NaN; fail loudly here
# rather than letting the classifiers choke on it later.
assert y.notna().all(), "Attrition contains labels other than 'Yes'/'No'"
# Stratify so that train and test keep the same class ratio as the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Categorical (object-dtype) feature columns
categorical_features = X.select_dtypes(include=['object']).columns

# Categorical pipeline: a single one-hot encoding step that returns a dense array
# and ignores categories not seen during fitting.
categorical_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [None]:
# Numeric feature columns
numerical_features = X.select_dtypes(include=['number']).columns
# Numeric pipeline: a single standard-scaling step
numerical_steps = [
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)


In [None]:
# Route each group of columns to its own transformer inside one preprocessor.
column_routes = [
    ('num', numerical_transformer, numerical_features),      # scaling for numeric columns
    ('cat', categorical_transformer, categorical_features),  # one-hot encoding for categorical columns
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Candidate classifiers and the hyper-parameter grid searched for each.
# Grid keys carry the 'classifier__' prefix because every model is used as the
# 'classifier' step of a Pipeline.
# Estimators with internal randomness get a fixed random_state so that a
# Restart & Run All reproduces the same scores.
models = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42),
        'params': {
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__solver': ['liblinear']  # liblinear supports both l1 and l2
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'classifier__n_neighbors': [5, 7, 9, 15, 35, 45, 55],
            'classifier__weights': ['uniform', 'distance'],
            # 'minkowski' with the default p=2 is the same distance as 'euclidean',
            # so listing it only repeated a third of the KNN grid.
            'classifier__metric': ['euclidean', 'manhattan']
        }
    },
    'Naive Bayes': {
        'model': GaussianNB(),
        'params': {
            'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_features': ['sqrt', 'log2'],
            'classifier__max_depth': [None, 10, 20, 30]
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },
    'Support Vector Machine': {  # TODO: the SVM is very time-consuming; drop it if short on time
        'model': SVC(),
        'params': {  # TODO: some combinations can also be removed; of the three parameter rows only one needs more than 2 values ([0.1, 1, 10, 100] is four values)
            'classifier__C': [0.1, 1, 10, 100],
            'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'classifier__gamma': ['scale', 'auto']
        }
    }
}

In [None]:
# Best fitted pipeline per model name
best_models = {}

# Evaluation metrics collected for every model
metrics = ['accuracy', 'precision', 'recall', 'f1']

# metric name -> list of scores, in the order the models are trained
results = {}
for metric in metrics:
    results[metric] = []

# Model names, in the same order as the score lists
model_names = []


In [None]:
# Tune every candidate with a 5-fold grid search, then score the tuned pipeline on the test set.
# The accumulators are emptied first so that re-running this cell replaces the
# previous results instead of appending duplicate entries.
best_models.clear()
model_names.clear()
for score_list in results.values():
    score_list.clear()

for name, model_params in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model_params['model'])])
    param_grid = model_params['params']
    # Hyper-parameters are selected on positive-class F1 ('f1'); n_jobs=-1 runs the
    # independent fits in parallel on all cores.
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    y_pred = grid_search.predict(X_test)

    # Store the model name
    model_names.append(name)

    # Calculate and store indicators (macro-averaged over both classes).
    # zero_division=0 keeps the score at 0 without a warning when a class is never predicted.
    results['accuracy'].append(accuracy_score(y_test, y_pred))
    results['precision'].append(precision_score(y_test, y_pred, average='macro', zero_division=0))
    results['recall'].append(recall_score(y_test, y_pred, average='macro', zero_division=0))
    results['f1'].append(f1_score(y_test, y_pred, average='macro', zero_division=0))

    print(f'Best parameters for {name}: {grid_search.best_params_}')

In [None]:
# Scores as a table: one row per model, one column per metric
results_data = pd.DataFrame(results, index=model_names)
# Flip it so that metrics are rows and models are columns
results_data_transposed = results_data.transpose()
# One line per model across the evaluation metrics
fig, ax = plt.subplots(figsize=(10, 6))
metric_labels = results_data_transposed.index
for model in results_data_transposed.columns:
    ax.plot(metric_labels, results_data_transposed[model], marker='o', label=model)
ax.set_xlabel('Evaluation Metrics')
ax.set_ylabel('Score')
ax.set_title('Model Performance Metrics')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Draw one confusion matrix per tuned model.
# The grid is sized from the number of models: a fixed 2x2 grid zipped against
# six models silently drops the last two.
n_models = len(best_models)
n_cols = 2
n_rows = max(1, (n_models + n_cols - 1) // n_cols)  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
# Flatten the axes array for easy iteration
axes = axes.flatten()
for ax, (name, model) in zip(axes, best_models.items()):
    # Predict the target values using the model
    y_pred = model.predict(X_test)
    # Compute the confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    # Plot the confusion matrix as a heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    # Set the title and labels for the subplot
    ax.set_title(f'Confusion Matrix for {name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
# Hide any leftover panel when the number of models does not fill the grid
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)
# Adjust layout to prevent overlap
plt.tight_layout()
# Display the plot
plt.show()


In [None]:
# Pick the model with the highest test-set macro F1 and plot its ROC curve.
best_model_index = results['f1'].index(max(results['f1']))
best_model_name = model_names[best_model_index]
best_model = best_models[best_model_name]
# Scores for the positive class. Not every classifier exposes predict_proba
# (SVC without probability=True does not); roc_curve only needs a ranking score,
# so fall back to the signed distance from decision_function.
if hasattr(best_model, 'predict_proba'):
    y_prob = best_model.predict_proba(X_test)[:, 1]
else:
    y_prob = best_model.decision_function(X_test)
# Calculate ROC curves and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
# Plotting ROC curves
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'Receiver Operating Characteristic (ROC) for {best_model_name}')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
print(f'The best model is: {best_model_name} with an AUC of {roc_auc:.2f}')
