In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r"D:\Data Science\iNeuron\ML\Regression\Data Science Salaries\data\Latest_Data_Science_Salaries.csv")

In [3]:
df.head()

Unnamed: 0,Job Title,Employment Type,Experience Level,Expertise Level,Salary,Salary Currency,Company Location,Salary in USD,Employee Residence,Company Size,Year
0,Data Engineer,Full-Time,Senior,Expert,210000,United States Dollar,United States,210000,United States,Medium,2023
1,Data Engineer,Full-Time,Senior,Expert,165000,United States Dollar,United States,165000,United States,Medium,2023
2,Data Engineer,Full-Time,Senior,Expert,185900,United States Dollar,United States,185900,United States,Medium,2023
3,Data Engineer,Full-Time,Senior,Expert,129300,United States Dollar,United States,129300,United States,Medium,2023
4,Data Scientist,Full-Time,Senior,Expert,140000,United States Dollar,United States,140000,United States,Medium,2023


In [4]:
df = df.drop(['Job Title', 'Salary', 'Salary Currency'], axis=1)
df['Working Site'] = ['Yes' if i else 'No' for i in df['Company Location'] == df['Employee Residence']]
df['Working Site'].unique()
df = df.drop(['Company Location', 'Employee Residence'], axis=1)

In [5]:
X = df.drop('Salary in USD', axis=1)
y = df['Salary in USD']

In [6]:
categorical_cols = X.columns[X.dtypes == "object"]
numerical_cols = X.columns[X.dtypes != "object"]

In [7]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
Employment_Type_cat = ['Part-Time', 'Freelance', 'Contract', 'Full-Time']
Experience_Level_cat = ['Entry', 'Mid', 'Senior', 'Executive']
Expertise_Level_cat = ['Junior', 'Intermediate', 'Expert', 'Director'] 
Company_Size_cat = ['Small', 'Large', 'Medium']
Working_Site_cat = ['No', 'Yes']

In [9]:
# Numeric Pipeline
num_pipeline = Pipeline(
    steps=[
        # ('imputer', SimpleImputer(strategy='median')), # All values are Non Null
        ('scaler', StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        # ('imputer', SimpleImputer(strategy='most frequent')), # All values are Non Null
        ('OrdinalEncoder', OrdinalEncoder()),
        ('scaler', StandardScaler())

    ]
)

preprocessor = ColumnTransformer([
    ("numeric", num_pipeline, numerical_cols),  # apply the numerical pipeline to these columns.
    ("categorical", cat_pipeline, categorical_cols)   # Apply the Categorical Encoder and then scale
])

In [10]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [11]:
X_train.head()

Unnamed: 0,Employment Type,Experience Level,Expertise Level,Company Size,Year,Working Site
3224,Full-Time,Mid,Intermediate,Medium,2021,No
1117,Full-Time,Senior,Expert,Medium,2023,Yes
2611,Full-Time,Mid,Intermediate,Medium,2022,Yes
2097,Full-Time,Senior,Expert,Medium,2022,Yes
662,Full-Time,Mid,Intermediate,Large,2023,Yes


In [12]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [13]:
X_train.head()

Unnamed: 0,numeric__Year,categorical__Employment Type,categorical__Experience Level,categorical__Expertise Level,categorical__Company Size,categorical__Working Site
0,-2.041735,0.042909,-0.414017,0.862404,0.207499,-5.535687
1,0.716398,0.042909,0.649809,-0.535598,0.207499,0.180646
2,-0.662668,0.042909,-0.414017,0.862404,0.207499,0.180646
3,-0.662668,0.042909,0.649809,-0.535598,0.207499,0.180646
4,0.716398,0.042909,-0.414017,0.862404,-2.225615,0.180646


In [14]:
X_test.head()

Unnamed: 0,numeric__Year,categorical__Employment Type,categorical__Experience Level,categorical__Expertise Level,categorical__Company Size,categorical__Working Site
0,0.716398,0.042909,0.649809,-0.535598,0.207499,0.180646
1,0.716398,0.042909,0.649809,-0.535598,0.207499,0.180646
2,0.716398,0.042909,-2.54167,2.260407,-2.225615,0.180646
3,-0.662668,0.042909,-0.414017,0.862404,0.207499,0.180646
4,0.716398,0.042909,0.649809,-0.535598,0.207499,0.180646


In [15]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [16]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [17]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    # 'NaiveBias':MultinomialNB(),
    'KNNR':KNeighborsRegressor(n_neighbors=5),
    'DecisionTree':DecisionTreeRegressor(random_state=42),
    'SVR linear':SVR(kernel='linear'),
    'SVR rbf':SVR(kernel='rbf'),
    'RandomForest':RandomForestRegressor(random_state=42),
    'AdaBoost':AdaBoostRegressor(),
    'Gradient Boosting':GradientBoostingRegressor(),
    'XGB':xgb.XGBRegressor(),
    'BaggingSVR':BaggingRegressor(estimator=SVR())
}

trained_model_list = {'Model_Name':[], 'Model': [], 'R2_Score': []}

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    trained_model_list['Model_Name'].append(list(models.keys())[i])
    trained_model_list['Model'].append(model)
    trained_model_list['R2_Score'].append(r2_square*100)

    # trained_model_list.update({'Model_Name': list(models.keys())[i], 'Model': model, 'R2_Score': r2_square*100})

    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 61410.44559383913
MAE: 47564.51510960421
R2 score 19.89908421066723


Lasso
Model Training Performance
RMSE: 61410.27304081396
MAE: 47564.45997792245
R2 score 19.899534350241588


Ridge
Model Training Performance
RMSE: 61409.22527336859
MAE: 47563.927748689835
R2 score 19.902267637040126


Elasticnet
Model Training Performance
RMSE: 61750.75259137162
MAE: 48053.16728390521
R2 score 19.00886334626639


KNNR
Model Training Performance
RMSE: 67425.66694782928
MAE: 53438.5107070707
R2 score 3.4386150373531876


DecisionTree
Model Training Performance
RMSE: 62768.393811546724
MAE: 48143.898411266484
R2 score 16.317428883971985


SVR linear
Model Training Performance
RMSE: 68235.79253127197
MAE: 53403.75259855658
R2 score 1.1042873761413885


SVR rbf
Model Training Performance
RMSE: 68869.26732739345
MAE: 53971.17144434608
R2 score -0.740455352767122


RandomForest
Model Training Performance
RMSE: 61940.61508699161
MAE: 47804.65873427545
R2 s

In [18]:
pd.DataFrame(trained_model_list)

Unnamed: 0,Model_Name,Model,R2_Score
0,LinearRegression,LinearRegression(),19.899084
1,Lasso,Lasso(),19.899534
2,Ridge,Ridge(),19.902268
3,Elasticnet,ElasticNet(),19.008863
4,KNNR,KNeighborsRegressor(),3.438615
5,DecisionTree,DecisionTreeRegressor(random_state=42),16.317429
6,SVR linear,SVR(kernel='linear'),1.104287
7,SVR rbf,SVR(),-0.740455
8,RandomForest,"(DecisionTreeRegressor(max_features=1.0, rando...",18.510057
9,AdaBoost,"(DecisionTreeRegressor(max_depth=3, random_sta...",12.121658
