## Import Libraries 

In [273]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy.stats import pointbiserialr, spearmanr
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

##  Load Data

In [274]:
data = pd.read_excel('Salaries.xlsx')
data.columns = ['Year','Experience','Type','Title','Residence','Salary','Currency',
                'Remote work','Company location','Company size','Salary in USD']

In [275]:
data   

Unnamed: 0,Year,Experience,Type,Title,Residence,Salary,Currency,Remote work,Company location,Company size,Salary in USD
0,2020,Intermediate,Full-time,Data Scientist,DE,70000,EUR,No,DE,L,79833
1,2020,Senior,Full-time,Machine Learning Scientist,JP,260000,USD,No,JP,S,260000
2,2020,Senior,Full-time,Big Data Engineer,GB,85000,GBP,Partially,GB,M,109024
3,2020,Intermediate,Full-time,Product Data Analyst,HN,20000,USD,No,HN,S,20000
4,2020,Senior,Full-time,Machine Learning Engineer,US,150000,USD,Partially,US,L,150000
...,...,...,...,...,...,...,...,...,...,...,...
602,2022,Senior,Full-time,Data Engineer,US,154000,USD,Yes,US,M,154000
603,2022,Senior,Full-time,Data Engineer,US,126000,USD,Yes,US,M,126000
604,2022,Senior,Full-time,Data Analyst,US,129000,USD,No,US,M,129000
605,2022,Senior,Full-time,Data Analyst,US,150000,USD,Yes,US,M,150000


## Data Analysis 

In [276]:
data.isnull().sum()

Year                0
Experience          0
Type                0
Title               0
Residence           0
Salary              0
Currency            0
Remote work         0
Company location    0
Company size        0
Salary in USD       0
dtype: int64

In [277]:
average_sal = df["Salary in USD"].mean()
print('Total number of records', n_records)
print('Average salary is', average_sal)

Total number of records 607
Average salary is 112297.86985172982


In [278]:
data.shape

(607, 11)

## One Hot Encoding 

In [279]:
data.describe()

Unnamed: 0,Year,Salary,Salary in USD
count,607.0,607.0,607.0
mean,2021.405272,324000.1,112297.869852
std,0.692133,1544357.0,70957.259411
min,2020.0,4000.0,2859.0
25%,2021.0,70000.0,62726.0
50%,2022.0,115000.0,101570.0
75%,2022.0,165000.0,150000.0
max,2022.0,30400000.0,600000.0


In [280]:
category_col = ['Experience','Type','Title','Residence','Currency','Remote work','Company location','Company size']

In [281]:
for c in category_col:
    print(c)
    print(df[c].value_counts())

Experience
Senior          280
Intermediate    213
Junior           88
Executive        26
Name: Experience, dtype: int64
Type
Full-time    588
Part-time     10
Temporary      5
Freelance      4
Name: Type, dtype: int64
Title
Data Scientist                              143
Data Engineer                               132
Data Analyst                                 97
Machine Learning Engineer                    41
Research Scientist                           16
Data Science Manager                         12
Data Architect                               11
Big Data Engineer                             8
Machine Learning Scientist                    8
Principal Data Scientist                      7
AI Scientist                                  7
Data Science Consultant                       7
Director of Data Science                      7
Data Analytics Manager                        7
ML Engineer                                   6
Computer Vision Engineer                      6
BI Dat

## Normalization

In [282]:
# Split the data into features ans target label
salary_raw = data['Salary in USD'] 
features_raw = data.drop('Salary in USD', axis = 1)

In [283]:
# Log transformation 
features_log_transformed = pd.DataFrame(data = features_raw)

In [284]:
from sklearn.preprocessing import MinMaxScaler 
scaler = MinMaxScaler()
numerical = ['Year', 'Salary']
features_log_minmax_transform = pd.DataFrame(data = features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])
display(features_log_minmax_transform.head(n=5))

Unnamed: 0,Year,Experience,Type,Title,Residence,Salary,Currency,Remote work,Company location,Company size
0,0.0,Intermediate,Full-time,Data Scientist,DE,0.002171,EUR,No,DE,L
1,0.0,Senior,Full-time,Machine Learning Scientist,JP,0.008422,USD,No,JP,S
2,0.0,Senior,Full-time,Big Data Engineer,GB,0.002665,GBP,Partially,GB,M
3,0.0,Intermediate,Full-time,Product Data Analyst,HN,0.000526,USD,No,HN,S
4,0.0,Senior,Full-time,Machine Learning Engineer,US,0.004803,USD,Partially,US,L


In [285]:
features_final = pd.get_dummies(features_log_minmax_transform)
encoded = list(features_final.columns) 
print('{} total features after one hot encoding'. format(len(encoded)))
encoded

190 total features after one hot encoding


['Year',
 'Salary',
 'Experience_Executive',
 'Experience_Intermediate',
 'Experience_Junior',
 'Experience_Senior',
 'Type_Freelance',
 'Type_Full-time',
 'Type_Part-time',
 'Type_Temporary',
 'Title_3D Computer Vision Researcher',
 'Title_AI Scientist',
 'Title_Analytics Engineer',
 'Title_Applied Data Scientist',
 'Title_Applied Machine Learning Scientist',
 'Title_BI Data Analyst',
 'Title_Big Data Architect',
 'Title_Big Data Engineer',
 'Title_Business Data Analyst',
 'Title_Cloud Data Engineer',
 'Title_Computer Vision Engineer',
 'Title_Computer Vision Software Engineer',
 'Title_Data Analyst',
 'Title_Data Analytics Engineer',
 'Title_Data Analytics Lead',
 'Title_Data Analytics Manager',
 'Title_Data Architect',
 'Title_Data Engineer',
 'Title_Data Engineering Manager',
 'Title_Data Science Consultant',
 'Title_Data Science Engineer',
 'Title_Data Science Manager',
 'Title_Data Scientist',
 'Title_Data Specialist',
 'Title_Director of Data Engineering',
 'Title_Director of Da

In [286]:
for col in category_col:
    b, c = np.unique(data[col], return_inverse=True)
    data[col] = c
data.head()    

Unnamed: 0,Year,Experience,Type,Title,Residence,Salary,Currency,Remote work,Company location,Company size,Salary in USD
0,2020,1,1,22,14,70000,7,0,12,0,79833
1,2020,3,1,41,32,260000,16,0,29,2,260000
2,2020,3,1,7,20,85000,8,1,18,1,109024
3,2020,1,1,47,23,20000,16,0,20,2,20000
4,2020,3,1,38,55,150000,16,1,48,0,150000


## Univariate Feature Selection 

In [388]:
col_names = data.columns
param = []
correlation = []
abs_corr = []
for c in col_names:
    if c != 'Salary in USD':
        if len(data[c].unique()) <=2:
               corr = spearmanr(data['Salary in USD'], data[c])[0]  
        else:
               corr = pointbiserialr(data['Salary in USD'], data[c])[0]
               param.append(c)
               correlation.append(corr)
               abs_corr.append(abs(corr))
param_df = pd.DataFrame({'correlation':correlation, 'parameter':param, 'abs_corr':abs_corr})
param_df = param_df.sort_values(by=['abs_corr'], ascending=False)
param_df = param_df.set_index('parameter')
param_df       
                              

Unnamed: 0_level_0,correlation,abs_corr
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1
Residence,0.452501,0.452501
Company location,0.428994,0.428994
Currency,0.42055,0.42055
Experience,0.187896,0.187896
Year,0.170493,0.170493
Company size,-0.151205,0.151205
Remote work,0.132122,0.132122
Title,0.131016,0.131016
Salary,-0.083906,0.083906
Type,0.018543,0.018543


In [417]:
x = data.drop(columns=['Salary in USD'], axis=1)
y = data['Salary in USD']

In [418]:
x

Unnamed: 0,Year,Experience,Type,Title,Residence,Salary,Currency,Remote work,Company location,Company size
0,2020,1,1,22,14,70000,7,0,12,0
1,2020,3,1,41,32,260000,16,0,29,2
2,2020,3,1,7,20,85000,8,1,18,1
3,2020,1,1,47,23,20000,16,0,20,2
4,2020,3,1,38,55,150000,16,1,48,0
...,...,...,...,...,...,...,...,...,...,...
602,2022,3,1,17,55,154000,16,2,48,1
603,2022,3,1,17,55,126000,16,2,48,1
604,2022,3,1,12,55,129000,16,0,48,1
605,2022,3,1,12,55,150000,16,2,48,1


In [419]:
top_features = param_df.index[0:4]
print('top features:\t', top_features)

top features:	 Index(['Residence', 'Company location', 'Currency', 'Experience'], dtype='object', name='parameter')


## Train Test Split

In [420]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.20, random_state = 42)
#xtrain, xtest, ytrain, ytest = train_test_split(features_final, y, test_size = 0.20, random_state = 42)
print('Training set has {} samples.'.format(xtrain.shape[0]))
print('Test set has {} samples.'.format(xtest.shape[0]))

Training set has 485 samples.
Test set has 122 samples.


## Logistic Regression

In [421]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logreg = LogisticRegression(solver='newton-cg')
logreg.fit(xtrain, ytrain)

ypred = logreg.predict(xtest)
ytrain_score = logreg.predict(xtrain)



In [422]:
#print('Accuracy of the model is:\nTest:', accuracy_score(ytest, ypred, normalize=False, sample_weight=None))
#print('Train:', accuracy_score(ytrain, ytrain_score, normalize=False, sample_weight=None))

## MAE

In [423]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

print('Mean Absolute Error:', mean_absolute_error(ytest, ypred))  
print('Mean Squared Error:', mean_squared_error(ytest, ypred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(ytest, ypred)))

Mean Absolute Error: 20263.44262295082
Mean Squared Error: 1427276558.557377
Root Mean Squared Error: 37779.31389738804


## Linear Regression

In [428]:
from sklearn.linear_model import LinearRegression 
linreg = LinearRegression()
linreg.fit(xtrain, ytrain)

LinearRegression()

In [429]:
ypred = linreg.predict(xtest)
ytrain_score = linreg.predict(xtrain)

## MAE

In [430]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

print('Mean Absolute Error:', mean_absolute_error(ytest, ypred))  
print('Mean Squared Error:', mean_squared_error(ytest, ypred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(ytest, ypred)))

Mean Absolute Error: 37724.59505448927
Mean Squared Error: 2848198330.764059
Root Mean Squared Error: 53368.51441406308
