In [357]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [358]:
df = pd.read_csv('employee_productivity.csv')
df.head()

Unnamed: 0,employee_id,login_time,logout_time,total_tasks_completed,weekly_absences,productivity_score
0,1,9,16,65,2,56
1,2,9,17,82,1,54
2,3,10,17,76,2,62
3,4,10,17,110,2,74
4,5,8,14,97,1,58


In [359]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   employee_id            3000 non-null   int64
 1   login_time             3000 non-null   int64
 2   logout_time            3000 non-null   int64
 3   total_tasks_completed  3000 non-null   int64
 4   weekly_absences        3000 non-null   int64
 5   productivity_score     3000 non-null   int64
dtypes: int64(6)
memory usage: 140.8 KB


In [360]:
print("Duplicated rows:", df.duplicated().sum())

Duplicated rows: 0


In [361]:
print("Missing Values:")

print(df.isnull().sum())

Missing Values:
employee_id              0
login_time               0
logout_time              0
total_tasks_completed    0
weekly_absences          0
productivity_score       0
dtype: int64


The Data Provided is already Clean , Like all the Values feel like they were generated using random function and Have no issues now As the assignment Says we have to generate new Features using the Existing columns

### I dont think the employe ID is helping at all so we are Removing it 

In [362]:
# remove the employee_id column as it is not useful for analysis

df = df.drop(columns=['employee_id'])

## Let's Train the Base Model

In [363]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor, Ridge

scaler = StandardScaler()

X = df.drop(columns=['productivity_score'])
y = df['productivity_score']

In [364]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [365]:
pipe = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('sgd_regressor', SGDRegressor(max_iter=1000, tol=1e-3))])

ridge_pipe = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('ridge_regressor', Ridge(alpha=1.0))])

In [366]:
pipe.fit(X_train_scaled, y_train)
y_pred = pipe.predict(X_test_scaled)

ridge_pipe.fit(X_train_scaled, y_train)
y_ridge_pred = ridge_pipe.predict(X_test_scaled)

In [367]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")


print("\nRidge Regression Results:")

mse_ridge = mean_squared_error(y_test, y_ridge_pred)
rmse_ridge = np.sqrt(mse_ridge)
mae_ridge = mean_absolute_error(y_test, y_ridge_pred)
r2_ridge = r2_score(y_test, y_ridge_pred)
print(f"Mean Squared Error: {mse_ridge:.2f}")
print(f"Root Mean Squared Error: {rmse_ridge:.2f}")
print(f"Mean Absolute Error: {mae_ridge:.2f}")
print(f"R^2 Score: {r2_ridge:.2f}")

Mean Squared Error: 8.66
Root Mean Squared Error: 2.94
Mean Absolute Error: 2.46
R^2 Score: 0.91

Ridge Regression Results:
Mean Squared Error: 8.63
Root Mean Squared Error: 2.94
Mean Absolute Error: 2.45
R^2 Score: 0.91


### SO Yeah now Let's Focus on creating new Features see the above result's it is Mandatory to do so

In [368]:
from sklearn.base import BaseEstimator, TransformerMixin

In [369]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   login_time             3000 non-null   int64
 1   logout_time            3000 non-null   int64
 2   total_tasks_completed  3000 non-null   int64
 3   weekly_absences        3000 non-null   int64
 4   productivity_score     3000 non-null   int64
dtypes: int64(5)
memory usage: 117.3 KB


In [370]:
X = df.drop(columns=['productivity_score'])
y = df['productivity_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [371]:
X_train.head()

Unnamed: 0,login_time,logout_time,total_tasks_completed,weekly_absences
642,10,17,63,1
700,9,15,60,3
226,9,19,108,2
1697,8,15,78,1
1010,9,16,83,3


In [372]:
class FeatureCreator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        X_transformed = X.copy()
        
        # 1. Feature: Daily Work Hours
        X_transformed['daily_work_hours'] = X_transformed['logout_time'] - X_transformed['login_time']
        X_transformed['daily_work_hours'] = np.maximum(X_transformed['daily_work_hours'], 1) 
        
        # 2. Feature: Tasks Per Hour (Efficiency)
        X_transformed['tasks_per_hour'] = X_transformed['total_tasks_completed'] / X_transformed['daily_work_hours']
        
        # 3. Feature: Absenteeism Rate
        X_transformed['absenteeism_rate'] = X_transformed['weekly_absences'] / 5.0 # Assuming 5-day week
        
        return X_transformed[['daily_work_hours', 'tasks_per_hour', 'absenteeism_rate', 'total_tasks_completed', 'weekly_absences']]

In [373]:
pipe = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('sgd_regressor', SGDRegressor(max_iter=1000, tol=1e-3))])



ridge_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', SGDRegressor(alpha=0.1, max_iter=1000, tol=1e-3))           # Step 2: Use a regularized linear model
])

X_train = FeatureCreator().fit_transform(X_train)
X_test = FeatureCreator().fit_transform(X_test)


ridge_pipeline.fit(X_train, y_train)
y_ridge_final_pred = ridge_pipeline.predict(X_test)


sgd_model = pipe.fit(X_train, y_train)
y_sgd_final_pred = sgd_model.predict(X_test)

In [374]:
mse = mean_squared_error(y_test, y_ridge_final_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_ridge_final_pred)
r2 = r2_score(y_test, y_ridge_final_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")


print("SGDRegressor with Feature Engineering, Scaling, and PCA")
mse = mean_squared_error(y_test, y_sgd_final_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_sgd_final_pred)
r2 = r2_score(y_test, y_sgd_final_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R^2 Score: {r2:.2f}")


Mean Squared Error: 9.10
Root Mean Squared Error: 3.02
Mean Absolute Error: 2.51
R^2 Score: 0.91
SGDRegressor with Feature Engineering, Scaling, and PCA
Mean Squared Error: 8.61
Root Mean Squared Error: 2.94
Mean Absolute Error: 2.44
R^2 Score: 0.91
