In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the training split and the held-out test split, in that order
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/employee_data_train.csv', '../data/employee_data_test.csv')
)

In [5]:
# Identifier and date columns that are removed before any preprocessing
drop_cols = ['Employee Name', 'Employee ID', 'Start Date', 'End Date']

# Stack train on top of test so both splits go through identical preprocessing,
# then strip the columns listed above in the same expression
combined_df = pd.concat([train_df, test_df], sort=False).drop(columns=drop_cols)

In [7]:
def feature_engineering(df, epsilon=1e-6):
    """
    Derive salary-growth and promotion-rate features from raw employee columns.

    The input frame is left untouched; all new columns are written to the frame
    returned by ``DataFrame.drop``.

    Parameters:
        df (DataFrame): Employee data. Must contain the columns 'Current Salary',
            'Starting Salary', 'Tenure' and 'Promotion History'.
        epsilon (float): Strictly positive stand-in divisor used for rows whose
            'Tenure' is zero or negative. Defaults to 1e-6.

    Returns:
        DataFrame: ``df`` without the identifier/date columns (when present) and
        with three added columns:
            - 'Salary Percentage Change': (current - starting) / starting, as a
              fraction (0.5 means +50%). NaN where 'Starting Salary' is zero.
            - 'Salary Raise Per Year': (current - starting) / 'Tenure'
              (presumably tenure is measured in years -- confirm against the data).
            - 'Promotion Frequency': 'Promotion History' / 'Tenure'.
        Rows with a missing 'Tenure' get NaN in the two tenure-based columns.

    Raises:
        ValueError: If ``epsilon`` is not strictly positive.
        KeyError: If one of the required columns is missing.
    """
    if not epsilon > 0:
        raise ValueError("epsilon must be a positive number")

    # Identifier/date columns carry no predictive signal. Both naming variants of
    # the identifier columns are listed so the function also cleans raw frames
    # that still use 'Employee Name' / 'Employee ID'; absent columns are skipped.
    df = df.drop(
        ['Full Name', 'ID', 'Employee Name', 'Employee ID', 'Start Date', 'End Date'],
        axis=1,
        errors='ignore',
    )

    salary_delta = df['Current Salary'] - df['Starting Salary']

    # Relative change is undefined for a zero starting salary: blank the divisor
    # so the result is NaN rather than +/-inf.
    starting_salary = df['Starting Salary'].where(df['Starting Salary'] != 0)
    df['Salary Percentage Change'] = salary_delta / starting_salary

    # Replace non-positive tenure with epsilon to avoid dividing by zero.
    # A missing tenure is deliberately left as NaN (NaN <= 0 is False) so that
    # missingness propagates instead of turning into a huge bogus ratio.
    tenure = df['Tenure']
    adjusted_tenure = tenure.mask(tenure <= 0, epsilon)

    df['Salary Raise Per Year'] = salary_delta / adjusted_tenure
    df['Promotion Frequency'] = df['Promotion History'] / adjusted_tenure

    return df

In [8]:
# Add the engineered salary / tenure / promotion columns to the combined frame
combined_df = feature_engineering(combined_df)

# Categorical input columns
categorical_features = ['Gender', 'Role', 'Department', 'Location', 'Contract']

# Numerical input columns, including the three engineered ones at the end
numerical_features = ['Age', 'Tenure', 'Starting Salary', 'Current Salary',
                      'Years of Experience', 'Average Monthly Working Hours',
                      'Months in Role', 'Promotion History', 'Last Performance Review Score',
                      'Salary Percentage Change', 'Salary Raise Per Year', 'Promotion Frequency']

# Target column on one side, everything else on the other
y = combined_df['Turnover']
X = combined_df.drop(columns='Turnover')

# Undo the earlier concatenation: the first len(train_df) rows are the training
# split, the remaining rows are the test split
X_train, X_test = X.iloc[:len(train_df)], X.iloc[len(train_df):]
y_train, y_test = y.iloc[:len(train_df)], y.iloc[len(train_df):]