In [108]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [110]:
# Location and text encoding of the training data
data_path = 'employee_Data.csv'
data_encoding = 'ISO-8859-1'

# Read the main training dataset into a DataFrame
df = pd.read_csv(data_path, encoding=data_encoding)

# Sanity check: print (rows, columns) of the freshly loaded frame
print(df.shape)

# Preview the first five rows as the cell's rich output
df.head(5)

(20340, 7)


Unnamed: 0,Over_time,No_of_workers,Department,SMV,Idle_time,Idle_men,Tenure
0,960,8.0,Legal and Compliance,3.94,0.0,0,2
1,7080,59.0,Sales and Marketing,30.1,0.0,0,3
2,1440,7.0,Product Management,4.15,0.0,0,4
3,5040,42.0,Research and Development,22.53,0.0,0,1
4,3300,57.0,Quality Assurance,30.1,0.0,0,3


In [112]:
# Count missing values per column.
# Only the last expression of a cell is rendered automatically, so this
# intermediate result is printed explicitly; otherwise it is silently discarded.
missing_counts = df.isnull().sum()
print(missing_counts)

# Basic statistics of the numeric columns (rendered as the cell's output)
df.describe()


Unnamed: 0,Over_time,No_of_workers,SMV,Idle_time,Idle_men,Tenure
count,20340.0,20340.0,20340.0,20340.0,20340.0,20340.0
mean,4532.94002,34.846116,15.150492,0.564405,0.39823,2.413963
std,3274.466813,22.174927,10.940982,10.089015,3.350146,1.16098
min,0.0,2.0,2.9,0.0,0.0,1.0
25%,1440.0,9.0,3.94,0.0,0.0,1.0
50%,4080.0,34.0,15.26,0.0,0.0,2.0
75%,6900.0,57.0,24.26,0.0,0.0,4.0
max,15120.0,89.0,54.56,270.0,45.0,4.0


In [118]:
# Express every 'Over_time' value as a percentage of the largest value
# observed in the dataset, so the result lies on a 0-100 scale.
max_over_time = df['Over_time'].max()
df['performance_percentage'] = df['Over_time'].div(max_over_time).mul(100)


In [120]:
# Column groups handled by the preprocessing step
numeric_features = ['No_of_workers', 'SMV', 'Idle_time', 'Idle_men', 'Tenure']
categorical_features = ['Department']

# Standardize the numeric columns and one-hot encode 'Department'.
# handle_unknown='ignore' keeps transform() from raising on a category
# that was not present when the encoder was fitted.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])


In [122]:
# Build the feature matrix and target explicitly in this cell so the notebook
# survives "Restart Kernel -> Run All": no other visible cell defines X or y.
# NOTE(review): the feature list is reconstructed from the columns the
# preprocessor consumes (6 columns, consistent with the recorded split shape)
# and the target from the 'performance_percentage' column - confirm this
# matches the original definition.
feature_columns = ['No_of_workers', 'SMV', 'Idle_time', 'Idle_men', 'Tenure', 'Department']
X = df[feature_columns]
y = df['performance_percentage']

# Split the data into training and testing sets (80-20 split); the fixed
# random_state keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the sizes of the training and testing sets
print(f"Training set size: {X_train.shape}, Testing set size: {X_test.shape}")


Training set size: (16272, 6), Testing set size: (4068, 6)


In [124]:
# Random forest regressor with a fixed seed so repeated fits are reproducible
regressor = RandomForestRegressor(random_state=42)

# Chain preprocessing and regression so both are fitted (and later applied)
# together as a single estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# Fit the whole pipeline on the training split; the fitted pipeline is the
# cell's displayed result
model.fit(X_train, y_train)


In [126]:
# Predict on the held-out test set. No other visible cell defines y_pred, so
# it is computed here to keep the notebook runnable from a fresh kernel.
y_pred = model.predict(X_test)

# Root Mean Squared Error, in the same units as the target (percentage points)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (Percentage): {rmse}")

# R-squared (coefficient of determination). The printed label calls it
# "Accuracy", but R-squared is not a classification accuracy.
r2 = r2_score(y_test, y_pred)
print(f"R-squared (Accuracy for Percentage Prediction): {r2}")


Root Mean Squared Error (Percentage): 8.55247125187027
R-squared (Accuracy for Percentage Prediction): 0.8420077926287042


In [128]:
# File the fitted pipeline is serialized to
model_filename = 'employee_performance_model.pkl'

# Persist the trained pipeline with pickle
with open(model_filename, 'wb') as out_handle:
    pickle.dump(model, out_handle)
print(f"Model saved as {model_filename}")

# Read the pipeline back from disk to confirm the round trip works.
# pickle.load can execute arbitrary code, so only load files you trust.
with open(model_filename, 'rb') as in_handle:
    loaded_model = pickle.load(in_handle)


Model saved as employee_performance_model.pkl


In [130]:
# Take the first 100 rows of the test features and score them with the
# pipeline that was reloaded from disk
sample_data = X_test.head(100)
predicted_performance = loaded_model.predict(sample_data)

# Put the true targets next to the predictions for the same 100 rows
# (positional alignment: both sides use the first 100 test rows)
actual_performance = y_test.head(100).to_numpy()
results_df = pd.DataFrame({
    'Actual Performance Percentage': actual_performance,
    'Predicted Performance Percentage': predicted_performance,
})

# Print the comparison table
print(results_df)


    Actual Performance Percentage  Predicted Performance Percentage
0                       30.158730                         28.779649
1                       67.261905                         54.510258
2                        9.523810                         25.283633
3                        9.523810                          8.331296
4                        7.936508                         11.085345
..                            ...                               ...
95                       3.968254                         41.700738
96                       9.523810                          7.671140
97                      46.825397                         57.269227
98                      46.825397                         46.833153
99                       7.936508                          7.928571

[100 rows x 2 columns]


In [132]:
# Relies on `pd` and `pickle` from the notebook's import cell and on
# X_test / y_test from the train/test split cell.

# Step 1: Load the saved model (Pickle file).
# pickle.load can execute arbitrary code, so only load files you trust.
model_filename = 'employee_performance_model.pkl'  # Replace with your actual file path
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Step 2: Predict the performance for the first 100 rows of the test set
sample_data = X_test.iloc[0:100]
predicted_performance = loaded_model.predict(sample_data)

# Step 3: Create a table with actual and predicted performance percentages
# for those same 100 rows
results_df = pd.DataFrame({
    'Actual Performance Percentage': y_test.iloc[0:100].values,  # Actual values
    'Predicted Performance Percentage': predicted_performance    # Predicted values
})

# Step 4: Display every row of the results table. option_context lifts the
# row limit only inside the block and then restores the previous setting,
# even if printing raises, instead of resetting to the pandas default.
with pd.option_context('display.max_rows', None):
    print(results_df)


    Actual Performance Percentage  Predicted Performance Percentage
0                       30.158730                         28.779649
1                       67.261905                         54.510258
2                        9.523810                         25.283633
3                        9.523810                          8.331296
4                        7.936508                         11.085345
5                       44.444444                         47.976234
6                       26.984127                         18.008753
7                       45.634921                         52.673564
8                        9.523810                         11.693502
9                        9.523810                          7.747223
10                      38.095238                         38.095238
11                      38.888889                         39.280291
12                      12.698413                          7.885714
13                      11.904762               