In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle


In [2]:
file_path = 'C:/Users/aarti/Downloads/Salary Prediction of Data Professions.csv'
data = pd.read_csv(file_path)

In [3]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2639 entries, 0 to 2638
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   FIRST NAME        2639 non-null   object 
 1   LAST NAME         2637 non-null   object 
 2   SEX               2639 non-null   object 
 3   DOJ               2638 non-null   object 
 4   CURRENT DATE      2639 non-null   object 
 5   DESIGNATION       2639 non-null   object 
 6   AGE               2636 non-null   float64
 7   SALARY            2639 non-null   int64  
 8   UNIT              2639 non-null   object 
 9   LEAVES USED       2636 non-null   float64
 10  LEAVES REMAINING  2637 non-null   float64
 11  RATINGS           2637 non-null   float64
 12  PAST EXP          2639 non-null   int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 268.2+ KB
None


In [4]:
print(data.describe())

               AGE         SALARY  LEAVES USED  LEAVES REMAINING      RATINGS  \
count  2636.000000    2639.000000  2636.000000       2637.000000  2637.000000   
mean     24.756449   58136.678287    22.501517          7.503223     3.486159   
std       3.908228   36876.956944     4.604469          4.603193     1.114933   
min      21.000000   40001.000000    15.000000          0.000000     2.000000   
25%      22.000000   43418.000000    19.000000          4.000000     2.000000   
50%      24.000000   46781.000000    22.000000          8.000000     3.000000   
75%      25.000000   51401.500000    26.000000         11.000000     4.000000   
max      45.000000  388112.000000    30.000000         15.000000     5.000000   

          PAST EXP  
count  2639.000000  
mean      1.566881  
std       2.728416  
min       0.000000  
25%       0.000000  
50%       1.000000  
75%       2.000000  
max      23.000000  


In [None]:
#handle missing values



In [5]:
imputer = SimpleImputer(strategy='mean')
data['AGE'] = imputer.fit_transform(data[['AGE']])
data['LEAVES USED'] = imputer.fit_transform(data[['LEAVES USED']])
data['RATINGS'] = imputer.fit_transform(data[['RATINGS']])

In [6]:
data = data.dropna()

In [None]:
#encode categorical variables

In [8]:
label_encoder = LabelEncoder()
data['SEX'] = label_encoder.fit_transform(data['SEX'])
data['DESIGNATION'] = label_encoder.fit_transform(data['DESIGNATION'])
data['UNIT'] = label_encoder.fit_transform(data['UNIT'])

In [None]:
#extract years of experience from DOJ

In [10]:
data['DOJ'] = pd.to_datetime(data['DOJ'], errors='coerce')
data['CURRENT DATE'] = pd.to_datetime(data['CURRENT DATE'], errors='coerce')
data['EXPERIENCE'] = (data['CURRENT DATE'] - data['DOJ']).dt.days / 365

In [11]:
data = data.drop(columns=['FIRST NAME', 'LAST NAME', 'DOJ', 'CURRENT DATE'])

In [None]:
#define features and target variable

In [12]:
X = data.drop(columns=['SALARY'])
y = data['SALARY']

In [None]:
#split the data 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#initializw models

In [15]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

In [None]:
#train and evaluate models

In [16]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} Performance:")
    print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred)}")
    print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred)}")
    print(f"Root Mean Squared Error (RMSE): {np.sqrt(mean_squared_error(y_test, y_pred))}")
    print(f"R-squared (R2): {r2_score(y_test, y_pred)}\n")


Linear Regression Performance:
Mean Absolute Error (MAE): 11143.528027190285
Mean Squared Error (MSE): 359466072.32799834
Root Mean Squared Error (RMSE): 18959.590510556875
R-squared (R2): 0.8157081503462168

Decision Tree Performance:
Mean Absolute Error (MAE): 5213.299810246679
Mean Squared Error (MSE): 109344383.4516129
Root Mean Squared Error (RMSE): 10456.78647824526
R-squared (R2): 0.943941083104046

Random Forest Performance:
Mean Absolute Error (MAE): 4338.528178368121
Mean Squared Error (MSE): 76199043.26424882
Root Mean Squared Error (RMSE): 8729.20633644599
R-squared (R2): 0.9609341083733668

Gradient Boosting Performance:
Mean Absolute Error (MAE): 4762.92225596887
Mean Squared Error (MSE): 106354338.43250021
Root Mean Squared Error (RMSE): 10312.823979516968
R-squared (R2): 0.9454740259032138



In [None]:
#save the best model

In [24]:
best_model = GradientBoostingRegressor()
best_model.fit(X_train, y_train)
pickle.dump(best_model, open('salary_prediction_model.pkl', 'wb'))


In [25]:
# Load the model and make predictions
loaded_model = pickle.load(open('salary_prediction_model.pkl', 'rb'))
sample_data = X_test[0].reshape(1, -1)
predicted_salary = loaded_model.predict(sample_data)
print(f"Predicted Salary: {predicted_salary[0]}")


Predicted Salary: 112559.04793527926
