# Employee Turnover Prediction Analysis - Kaggle Approach

## Step 1 - Problem Definition and Target Variable

**Problem to solve:**
We want to predict whether an employee will leave the company (event = 1) or not (event = 0) based on their personal and work characteristics.

**Target variable:** `event` (binary: 0 = no turnover, 1 = turnover)

**Research question:** 
What employee characteristics (anxiety, extraversion, independence, self-control, age, tenure, etc.) are the best predictors to determine if an employee will leave the company?

**Approach:** We will use machine learning techniques to create a predictive model that allows the company to identify employees at risk of leaving the organization.

In [None]:
# Step 2 - Load and display data

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the raw CSV into a DataFrame, decoding the file as latin1
df = pd.read_csv('turnover.csv', encoding='latin1')

# Emit the banner, the (rows, columns) shape and the preview header in one call;
# sep="\n" reproduces the line breaks of three separate print statements.
print(
    "=== BASIC DATASET INFORMATION ===",
    f"Dataset dimensions: {df.shape}",
    "\n=== FIRST 5 ROWS ===",
    sep="\n",
)

# Last expression of the cell, so the notebook renders the first five rows
df.head()

## Step 2 (continued) - Detailed dataset information

In [None]:
# Detailed dataset information
print("DATASET INFORMATION")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()

# value_counts() orders by descending frequency, so the counts below are listed
# most-common class first rather than by class label.
print(f"\nTurnover distribution: {df['event'].value_counts().values}")
# 'event' is coded 0/1, so its sum is the number of turnover cases
print(f"Turnover percentage: {(df['event'].sum() / len(df)) * 100:.1f}%")

# Column counts by dtype family: numeric vs. object (text) columns
print(f"\nNumeric columns: {len(df.select_dtypes(include=[np.number]).columns)}")
print(f"Categorical columns: {len(df.select_dtypes(include=['object']).columns)}")

In [None]:
# Step 3 - Exploratory Data Analysis (EDA)

# Summary statistics followed by the total number of missing cells in the frame
print("STATISTICAL SUMMARY")
print(df.describe())
print(f"\nNull values: {df.isnull().sum().sum()}")

# One histogram per numeric column, laid out on a grid three panels wide
numeric_cols = df.select_dtypes(include=[np.number]).columns
n_cols = len(numeric_cols)
n_rows = -(-n_cols // 3)  # ceiling division: enough rows to hold every panel

plt.figure(figsize=(15, 5 * n_rows))
for position, column in enumerate(numeric_cols, start=1):
    # subplot positions are 1-based, hence enumerate(start=1)
    panel = plt.subplot(n_rows, 3, position)
    panel.hist(df[column], bins=20, alpha=0.7, edgecolor='black')
    panel.set_title(f'Distribution of {column}')
    panel.set_xlabel(column)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Step 3 (continued) - Correlations and categorical analysis

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
correlation_matrix = df.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Correlation Matrix - Numeric Variables')
plt.tight_layout()
plt.show()

# Correlation of every numeric column with the target, largest value first;
# taken from the 'event' column of the matrix computed above.
correlations_with_target = correlation_matrix['event'].sort_values(ascending=False)
print("CORRELATION WITH TURNOVER")
print(correlations_with_target.round(3))

# Mean of 'event' (i.e. the turnover rate) per category, limited to the
# first three text columns to keep the output short
categorical_cols = df.select_dtypes(include=['object']).columns
for column in categorical_cols[:3]:
    turnover_rate = df.groupby(column)['event'].mean().round(3)
    print(f"\nTurnover by {column}:")
    print(turnover_rate)

## Step 4 - Data cleaning and preparation

In [None]:
# Step 4 - Data cleaning and preparation

# Report the total number of missing cells before any transformation
print(f"Null values: {df.isnull().sum().sum()}")

# Encode categorical variables on a copy so the raw DataFrame stays untouched
df_processed = df.copy()
categorical_cols = df_processed.select_dtypes(include=['object']).columns

# Bound unconditionally so later cells can always reference it. It used to be
# created only when at least one text column existed, which left the name
# undefined otherwise. Looping over an empty column index is simply a no-op.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Replace each text column with integer codes
    df_processed[col] = le.fit_transform(df_processed[col])
    # Keep the fitted encoder so the code -> label mapping can be inspected
    # or reversed later (LabelEncoder.inverse_transform)
    label_encoders[col] = le

# Separate features and target variable
X = df_processed.drop('event', axis=1)
y = df_processed['event']

print(f"Data prepared - X: {X.shape}, y: {y.shape}")
X.head()

## Step 5 - Data splitting

In [None]:
# Step 5 - Data splitting
# Hold out 20% of the rows for testing; stratifying on y keeps the turnover
# proportion aligned between the two splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Share of positive (turnover) labels in each split, as a percentage
train_turnover_pct = (y_train.sum() / len(y_train)) * 100
test_turnover_pct = (y_test.sum() / len(y_test)) * 100

print("DATA SPLITTING")
print(f"Training: {len(X_train)} samples")
print(f"Testing: {len(X_test)} samples")
print(f"Turnover in training: {train_turnover_pct:.1f}%")
print(f"Turnover in testing: {test_turnover_pct:.1f}%")

## Step 6 - Base model (Random Forest)

In [None]:
# Step 6 - Base model (Random Forest)
# Fit a 100-tree forest with a fixed seed, then predict the held-out rows
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Test-set metrics, evaluated in the order accuracy, precision, recall, F1
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    scorer(y_test, y_pred_rf)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("RANDOM FOREST - METRICS")
print(f"Accuracy: {rf_accuracy:.3f}")
print(f"Precision: {rf_precision:.3f}")
print(f"Recall: {rf_recall:.3f}")
print(f"F1-Score: {rf_f1:.3f}")

# Pair each feature name with the forest's importance score and rank them,
# largest score first; later cells reuse this table.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_model.feature_importances_}
).sort_values(by='Importance', ascending=False)

print("\nTOP 5 MOST IMPORTANT FEATURES")
print(feature_importance.head(5).to_string(index=False))

## Step 7 - Second model and comparison (Logistic Regression)

In [None]:
# Step 7 - Second model (Logistic Regression) and comparison
# Same train/test split as the Random Forest so the scores are comparable
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Test-set metrics, evaluated in the order accuracy, precision, recall, F1
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    scorer(y_test, y_pred_lr)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Scores per model, in the same order as the metric names below
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
rf_scores = [rf_accuracy, rf_precision, rf_recall, rf_f1]
lr_scores = [lr_accuracy, lr_precision, lr_recall, lr_f1]

# Model comparison: one row per model, one column per metric
comparison_df = pd.DataFrame({
    'Model': ['Random Forest', 'Logistic Regression'],
    **{
        metric_name: [rf_value, lr_value]
        for metric_name, rf_value, lr_value in zip(metrics, rf_scores, lr_scores)
    },
})

print("MODEL COMPARISON")
print(comparison_df.round(3).to_string(index=False))

# The winner is the row with the highest F1-Score
best_model_idx = comparison_df['F1-Score'].idxmax()
best_model = comparison_df.at[best_model_idx, 'Model']
print(f"\nBest model: {best_model}")

# Simplified visualization: grouped bars, one pair per metric
plt.figure(figsize=(10, 6))
x = np.arange(len(metrics))
width = 0.35

# Each model's bars are shifted half a bar width left/right of the tick
for offset, model_label, scores in (
    (-width / 2, 'Random Forest', rf_scores),
    (width / 2, 'Logistic Regression', lr_scores),
):
    plt.bar(x + offset, scores, width, label=model_label, alpha=0.8)

plt.xlabel('Metrics')
plt.ylabel('Score')
plt.title('Model Comparison')
plt.xticks(x, metrics)
plt.legend()
plt.ylim(0, 1)
plt.tight_layout()
plt.show()

## Analysis Conclusions

### Summary of the 7 completed steps:

1. **Problem definition**: Predict employee turnover (variable `event`)
2. **Loading and exploration**: Dataset of 1,129 employees with 16 columns (15 predictor characteristics plus the target `event`)
3. **EDA**: Statistical analysis, correlations and distributions
4. **Preparation**: Encoding of 8 categorical variables
5. **Splitting**: 80% training, 20% testing with stratification
6. **Base model**: Random Forest with F1-Score of 0.723
7. **Comparison**: Random Forest outperforms Logistic Regression

### Main findings:
- Most important variables: tenure (stag), age, industry
- Random Forest: 70.8% accuracy, F1-Score: 0.723
- Logistic Regression: 52.7% accuracy, F1-Score: 0.524
- Balanced dataset (50.6% turnover) facilitates training

### Recommendations:
- Use Random Forest as production model
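- Monitor employees with high tenure (tenure is the top-ranked feature; note that feature importance alone does not show the direction of the effect, so confirm which tenure range carries the higher risk before acting)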
- Implement alert system based on model probabilities

## Practical Example - Prediction for New Employees

Let's create examples of fictional employees and use our best model (Random Forest) to predict if they are at risk of turnover.

In [None]:
# Create 5 example employees with different profiles.
# Categorical fields hold integer codes, because the model was trained on
# label-encoded columns. Column order mirrors the feature frame used for training.
example_employees = pd.DataFrame({
    'stag': [5.5, 45.0, 12.3, 78.2, 2.1],
    'gender': [0, 1, 0, 1, 0],  # author's note: 0=female, 1=male
    'age': [28, 45, 35, 52, 24],
    'industry': [5, 2, 8, 1, 12],
    'profession': [3, 7, 1, 9, 4],
    'traffic': [2, 5, 1, 3, 4],
    'coach': [1, 0, 2, 1, 0],
    'head_gender': [0, 1, 1, 0, 1],
    'greywage': [0, 1, 0, 1, 0],
    'way': [1, 2, 0, 1, 2],
    'extraversion': [6.2, 4.8, 7.1, 3.5, 8.2],
    'independ': [5.8, 6.9, 4.3, 7.2, 5.1],
    'selfcontrol': [6.1, 7.8, 5.2, 8.1, 4.9],
    'anxiety': [4.5, 7.2, 5.8, 8.5, 3.2],
    'novator': [6.8, 5.1, 7.3, 4.2, 8.1],
})

# Make predictions with Random Forest model: hard labels plus class probabilities
predictions = rf_model.predict(example_employees)
probabilities = rf_model.predict_proba(example_employees)

# Second probability column is read as the probability of turnover (class 1)
turnover_probs = [class_probs[1] for class_probs in probabilities]

# Bucket each probability: above 0.7 -> High, above 0.4 -> Medium, otherwise Low
risk_levels = []
for turnover_prob in turnover_probs:
    if turnover_prob > 0.7:
        risk_levels.append('High')
    elif turnover_prob > 0.4:
        risk_levels.append('Medium')
    else:
        risk_levels.append('Low')

# Create DataFrame with results: a few descriptive fields next to the model output
results = pd.DataFrame({
    'Employee': [f'Employee_{number}' for number in range(1, len(example_employees) + 1)],
    'Age': example_employees['age'].values,
    'Tenure': example_employees['stag'].values,
    'Anxiety': example_employees['anxiety'].values,
    'Prediction': ['Turnover' if label == 1 else 'Stays' for label in predictions],
    'Turnover_Probability': [round(turnover_prob, 3) for turnover_prob in turnover_probs],
    'Risk': risk_levels,
})

print("Predictions for example employees:")
print(results.to_string(index=False))

In [None]:
# Function to predict turnover for an individual employee
def predict_turnover(model, stag, age, anxiety, extraversion, independ,
                     selfcontrol, novator, gender=0, industry=5, profession=3,
                     traffic=2, coach=1, head_gender=0, greywage=0, way=1):
    """
    Score a single employee and summarise the turnover risk.

    The arguments are assembled into a one-row DataFrame and passed to
    ``model.predict`` and ``model.predict_proba``.

    Parameters:
    - model: fitted classifier exposing ``predict`` and ``predict_proba``
    - stag: Tenure in months
    - age: Age
    - anxiety, extraversion, independ, selfcontrol, novator: personality
      scores (documented scale: 1-10)
    - gender, industry, profession, traffic, coach, head_gender, greywage,
      way: encoded categorical features; the defaults are placeholder codes

    Returns a dict with:
    - 'prediction': "Turnover" when the predicted label is 1, else "Stays"
    - 'turnover_probability': second column of ``predict_proba``, rounded
      to 3 decimals
    - 'risk_level': "High" above 0.7, "Medium" above 0.4, otherwise "Low"
    """
    # Insertion order fixes the column order of the frame handed to the model
    feature_values = {
        'stag': stag,
        'gender': gender,
        'age': age,
        'industry': industry,
        'profession': profession,
        'traffic': traffic,
        'coach': coach,
        'head_gender': head_gender,
        'greywage': greywage,
        'way': way,
        'extraversion': extraversion,
        'independ': independ,
        'selfcontrol': selfcontrol,
        'anxiety': anxiety,
        'novator': novator,
    }
    employee = pd.DataFrame({name: [value] for name, value in feature_values.items()})

    # Single-row input, so take the first (only) result of each call
    predicted_label = model.predict(employee)[0]
    probability = model.predict_proba(employee)[0][1]

    if probability > 0.7:
        risk = "High"
    elif probability > 0.4:
        risk = "Medium"
    else:
        risk = "Low"

    return {
        'prediction': "Turnover" if predicted_label == 1 else "Stays",
        'turnover_probability': round(probability, 3),
        'risk_level': risk,
    }

# Individual usage example: score one hypothetical employee with the Random
# Forest. Only the main traits are supplied; every other feature keeps the
# default value declared by predict_turnover.
employee_profile = {
    'stag': 84.5,          # tenure; stag is documented in months, so roughly 7 years
    'age': 32,             # 32 years old
    'anxiety': 7.2,        # High anxiety
    'extraversion': 5.5,
    'independ': 6.0,
    'selfcontrol': 4.8,
    'novator': 6.5,
}
example_employee = predict_turnover(model=rf_model, **employee_profile)

# Print each entry of the returned summary on its own indented line
print("Prediction for individual employee:")
for field, outcome in example_employee.items():
    print(f"  {field}: {outcome}")

In [None]:
# Instead of predicting for 5 employees, predict only for John Smith.
# The trait descriptions below are the author's own labels for these values.
john_profile = {
    # NOTE(review): the value passed is 1 (stag is documented in months), while
    # the original note said John has 8 months in the company - confirm which
    # one is intended.
    'stag': 1,
    'age': 36,             # John is 36 years old
    'anxiety': 6.2,        # John has high anxiety
    'extraversion': 8.5,   # John has high extraversion
    'independ': 7.0,       # John has high independence
    'selfcontrol': 7.8,    # John has high self-control
    'novator': 9.5,        # John has high innovation
}
john_smith = predict_turnover(model=rf_model, **john_profile)

print("\nPrediction for John Smith:")
for field, outcome in john_smith.items():
    print(f"  {field}: {outcome}")

# Result noted by the author for this cell (John only):
# prediction: Turnover
# turnover_probability: 0.653
# risk_level: Medium

In [None]:
# Feature importance analysis - Simple code

# 1. Random Forest importance (reuses the ranking table built after training)
print("TOP 10 MOST IMPORTANT VARIABLES:")
print("-" * 40)
top_features = feature_importance.head(10)
for feature_name, score in zip(top_features['Feature'], top_features['Importance']):
    print(f"{feature_name:15s}: {score:.3f}")

# 2. Simple importance visualization: horizontal bars, one per feature
top_10 = feature_importance.head(10)
bar_positions = range(len(top_10))

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_10['Importance'])
plt.yticks(bar_positions, top_10['Feature'])
plt.xlabel('Importance')
plt.title('Variables that most influence prediction')
plt.gca().invert_yaxis()  # flip the axis so the first (highest-ranked) row is on top
plt.tight_layout()
plt.show()

# 3. Simple correlation with turnover: absolute values, largest first
print("\nCORRELATION WITH TURNOVER:")
print("-" * 30)
numeric_frame = df.select_dtypes(include=[np.number])
correlations = numeric_frame.corr()['event'].abs().sort_values(ascending=False)
for var, corr in correlations.head(8).items():
    if var == 'event':
        continue  # skip the target's correlation with itself
    print(f"{var:15s}: {corr:.3f}")

# 4. Quick analysis by groups (categorical variables): spread between the
#    highest and lowest per-group turnover rate
print("\nTURNOVER DIFFERENCE BY GROUPS:")
print("-" * 35)
categorical_vars = ['gender', 'industry', 'profession', 'traffic']
for var in (name for name in categorical_vars if name in df.columns):
    turnover_by_group = df.groupby(var)['event'].mean()
    max_diff = turnover_by_group.max() - turnover_by_group.min()
    print(f"{var:15s}: maximum difference {max_diff:.3f}")