In [16]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt 
# import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import joblib 


In [2]:
df = pd.read_csv('D:\Code\Project\employee_attrition_data.csv')
df.head()

Unnamed: 0,Employee_ID,Age,Gender,Department,Job_Title,Years_at_Company,Satisfaction_Level,Average_Monthly_Hours,Promotion_Last_5Years,Salary,Attrition
0,0,27,Male,Marketing,Manager,9,0.586251,151,0,60132,0
1,1,53,Female,Sales,Engineer,10,0.261161,221,1,79947,0
2,2,59,Female,Marketing,Analyst,8,0.304382,184,0,46958,1
3,3,42,Female,Engineering,Manager,1,0.480779,242,0,40662,0
4,4,44,Female,Sales,Engineer,10,0.636244,229,1,74307,0


In [3]:
df.isnull().sum()

Employee_ID              0
Age                      0
Gender                   0
Department               0
Job_Title                0
Years_at_Company         0
Satisfaction_Level       0
Average_Monthly_Hours    0
Promotion_Last_5Years    0
Salary                   0
Attrition                0
dtype: int64

In [4]:
df.info()
print("-" * 60)
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Employee_ID            1000 non-null   int64  
 1   Age                    1000 non-null   int64  
 2   Gender                 1000 non-null   object 
 3   Department             1000 non-null   object 
 4   Job_Title              1000 non-null   object 
 5   Years_at_Company       1000 non-null   int64  
 6   Satisfaction_Level     1000 non-null   float64
 7   Average_Monthly_Hours  1000 non-null   int64  
 8   Promotion_Last_5Years  1000 non-null   int64  
 9   Salary                 1000 non-null   int64  
 10  Attrition              1000 non-null   int64  
dtypes: float64(1), int64(7), object(3)
memory usage: 86.1+ KB
------------------------------------------------------------


Unnamed: 0,Employee_ID,Age,Years_at_Company,Satisfaction_Level,Average_Monthly_Hours,Promotion_Last_5Years,Salary,Attrition
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,499.5,42.205,5.605,0.505995,199.493,0.486,64624.98,0.495
std,288.819436,10.016452,2.822223,0.289797,29.631908,0.500054,20262.984333,0.500225
min,0.0,25.0,1.0,0.001376,150.0,0.0,30099.0,0.0
25%,249.75,33.0,3.0,0.258866,173.0,0.0,47613.5,0.0
50%,499.5,43.0,6.0,0.505675,201.0,0.0,64525.0,0.0
75%,749.25,51.0,8.0,0.761135,225.0,1.0,81921.0,1.0
max,999.0,59.0,10.0,0.999979,249.0,1.0,99991.0,1.0


In [5]:
df.duplicated().sum()

0

In [6]:
df.drop(columns = 'Employee_ID', inplace = True)

In [7]:
df.columns

Index(['Age', 'Gender', 'Department', 'Job_Title', 'Years_at_Company',
       'Satisfaction_Level', 'Average_Monthly_Hours', 'Promotion_Last_5Years',
       'Salary', 'Attrition'],
      dtype='object')

In [8]:
columns_to_count = ['Department', 'Gender', 'Job_Title','Promotion_Last_5Years','Attrition']

for column in columns_to_count:
    if column in df.columns:  
        print(f"Value counts for column '{column}':")
        print(df[column].value_counts())
        print()  # Add a blank line for readability
    else:
        print(f"Column '{column}' does not exist in the DataFrame.")

Value counts for column 'Department':
Department
Sales          209
Finance        206
Engineering    204
HR             191
Marketing      190
Name: count, dtype: int64

Value counts for column 'Gender':
Gender
Male      506
Female    494
Name: count, dtype: int64

Value counts for column 'Job_Title':
Job_Title
Engineer         214
Manager          206
Accountant       206
Analyst          195
HR Specialist    179
Name: count, dtype: int64

Value counts for column 'Promotion_Last_5Years':
Promotion_Last_5Years
0    514
1    486
Name: count, dtype: int64

Value counts for column 'Attrition':
Attrition
0    505
1    495
Name: count, dtype: int64



In [9]:
X = df[['Years_at_Company','Average_Monthly_Hours','Satisfaction_Level']]
y = df[['Salary']]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

In [10]:
scaler = StandardScaler() 
X_train = scaler.fit_transform(X_train)

In [11]:
X_test = scaler.fit_transform(X_test)

In [12]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [13]:
def results(predictions):
    mae = mean_absolute_error(y_test, predictions)
    print("Mean absolute error on model is {}".format(mae))
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print("Root mean squared error on model is {}".format(rmse))

In [17]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Support Vector Machine": SVR()
}


for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)  
    predictions = model.predict(X_test)  
    print(f"Results for {name}:")
    results(predictions)  

Training Linear Regression...
Results for Linear Regression:
Mean absolute error on model is 18465.74543095977
Root mean squared error on model is 21136.75693788642
Training Random Forest...


  return fit_method(estimator, *args, **kwargs)


Results for Random Forest:
Mean absolute error on model is 19583.270700000005
Root mean squared error on model is 23086.74331808653
Training Decision Tree...
Results for Decision Tree:
Mean absolute error on model is 25951.035
Root mean squared error on model is 31340.938039328048
Training Support Vector Machine...
Results for Support Vector Machine:
Mean absolute error on model is 18443.659975735474
Root mean squared error on model is 21097.975164069063


  y = column_or_1d(y, warn=True)


In [None]:
svm_model = SVR()
svm_model.fit(X_train, y_train)

# Train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Save the models
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(linear_model, 'linear_regression_model.pkl')

print("Models trained and saved successfully!")

Models saved successfully!
