In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Loading and preprocessing

In [2]:
# Read the salary dataset from a CSV file located in the working directory.
data = pd.read_csv('Salary.csv')
# Bare last expression: the notebook renders the (truncated) frame as output.
data

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0
...,...,...,...,...,...,...,...,...,...
6679,49.0,Female,3,Director of Marketing,20.0,200000.0,UK,Mixed,0
6680,32.0,Male,0,Sales Associate,3.0,50000.0,Australia,Australian,0
6681,30.0,Female,1,Financial Manager,4.0,55000.0,China,Chinese,0
6682,46.0,Male,2,Marketing Manager,14.0,140000.0,China,Korean,0


In [3]:
# Remove fully duplicated rows in place. The surviving rows keep their
# original index labels (no reset), so the index has gaps afterwards.
data.drop_duplicates(inplace = True)
# (rows, columns) remaining after de-duplication.
data.shape

(5148, 9)

In [4]:
# Count missing values per column (isna is the canonical name of isnull).
data.isna().sum()

Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
Country                0
Race                   0
Senior                 0
dtype: int64

In [5]:
# Print column dtypes, non-null counts and the frame's memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5148 entries, 0 to 6682
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  5148 non-null   float64
 1   Gender               5148 non-null   object 
 2   Education Level      5148 non-null   int64  
 3   Job Title            5148 non-null   object 
 4   Years of Experience  5148 non-null   float64
 5   Salary               5148 non-null   float64
 6   Country              5148 non-null   object 
 7   Race                 5148 non-null   object 
 8   Senior               5148 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 402.2+ KB


In [6]:
# Frequency of each distinct value in the Gender column.
data['Gender'].value_counts()

Gender
Male      2808
Female    2340
Name: count, dtype: int64

In [7]:
# Encode gender as 1 (Male) / 0 (Female).
# The replacement is scoped to the Gender column only: a frame-wide
# `data.replace(...)` would also rewrite any 'Male'/'Female' string that
# happened to appear in another column (e.g. a job title or free-text field).
gender_codes = {'Male': 1, 'Female': 0}
data['Gender'] = data['Gender'].replace(gender_codes)

In [8]:
# Convert the encoded Gender column to a numeric dtype.
gender_numeric = pd.to_numeric(data['Gender'])
data['Gender'] = gender_numeric

In [9]:
# Separate the regression target from the features: pop() returns the
# Salary column and removes it from `data` in the same step.
y = data.pop('Salary')

In [10]:
# Keep only the integer and floating-point feature columns.
numeric_dtypes = ['int64', 'float64']
data_numeric = data.select_dtypes(include=numeric_dtypes)
data_numeric

Unnamed: 0,Age,Gender,Education Level,Years of Experience,Senior
0,32.0,1,1,5.0,0
1,28.0,0,2,3.0,0
2,45.0,1,3,15.0,1
3,36.0,0,1,7.0,0
4,52.0,1,2,20.0,0
...,...,...,...,...,...
6678,37.0,1,1,6.0,0
6679,49.0,0,3,20.0,0
6680,32.0,1,0,3.0,0
6681,30.0,0,1,4.0,0


In [11]:
# Keep only the string-valued (object dtype) feature columns.
data_object = data.select_dtypes(include='object')
data_object

Unnamed: 0,Job Title,Country,Race
0,Software Engineer,UK,White
1,Data Analyst,USA,Hispanic
2,Manager,Canada,White
3,Sales Associate,USA,Hispanic
4,Director,USA,Asian
...,...,...,...
6678,Sales Representative,Canada,Asian
6679,Director of Marketing,UK,Mixed
6680,Sales Associate,Australia,Australian
6681,Financial Manager,China,Chinese


In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Integer-encode every categorical column.
# A separate LabelEncoder is fitted and kept per column. Re-fitting one shared
# encoder on each column in turn would leave only the last column's mapping
# available, making it impossible to decode the others later.
encoders = {
    column: LabelEncoder().fit(data_object[column])
    for column in data_object.columns
}
# apply() hands each column over as a Series whose .name is the column label,
# which selects the matching fitted encoder.
data_object = data_object.apply(lambda col: encoders[col.name].transform(col))
data_object

Unnamed: 0,Job Title,Country,Race
0,112,3,9
1,24,4,5
2,72,1,9
3,100,4,5
4,34,4,1
...,...,...,...
6678,105,1,1
6679,42,3,7
6680,100,0,2
6681,51,2,4


In [14]:
# Place the numeric columns and the encoded categorical columns side by side
# in a single 2-D array (column labels are not carried over).
feature_blocks = (data_numeric.to_numpy(), data_object.to_numpy())
X = np.hstack(feature_blocks)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,32.0,1.0,1.0,5.0,0.0,112.0,3.0,9.0
1,28.0,0.0,2.0,3.0,0.0,24.0,4.0,5.0
2,45.0,1.0,3.0,15.0,1.0,72.0,1.0,9.0
3,36.0,0.0,1.0,7.0,0.0,100.0,4.0,5.0
4,52.0,1.0,2.0,20.0,0.0,34.0,4.0,1.0
...,...,...,...,...,...,...,...,...
5143,37.0,1.0,1.0,6.0,0.0,105.0,1.0,1.0
5144,49.0,0.0,3.0,20.0,0.0,42.0,3.0,7.0
5145,32.0,1.0,0.0,3.0,0.0,100.0,0.0,2.0
5146,30.0,0.0,1.0,4.0,0.0,51.0,2.0,4.0


In [15]:
from sklearn.preprocessing import MinMaxScaler

In [16]:
# Min-max scaler with its default target range spelled out.
scaler = MinMaxScaler(feature_range=(0, 1))

In [17]:
# Rescale every feature column with the min-max scaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so test-set minima/maxima influence the scaling.
X = scaler.fit(X).transform(X)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.268293,1.0,0.333333,0.147059,0.0,0.875000,0.75,1.000000
1,0.170732,0.0,0.666667,0.088235,0.0,0.187500,1.00,0.555556
2,0.585366,1.0,1.000000,0.441176,1.0,0.562500,0.25,1.000000
3,0.365854,0.0,0.333333,0.205882,0.0,0.781250,1.00,0.555556
4,0.756098,1.0,0.666667,0.588235,0.0,0.265625,1.00,0.111111
...,...,...,...,...,...,...,...,...
5143,0.390244,1.0,0.333333,0.176471,0.0,0.820312,0.25,0.111111
5144,0.682927,0.0,1.000000,0.588235,0.0,0.328125,0.75,0.777778
5145,0.268293,1.0,0.000000,0.088235,0.0,0.781250,0.00,0.222222
5146,0.219512,0.0,0.333333,0.117647,0.0,0.398438,0.50,0.444444


# Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [19]:
# Hold out 30% of the rows for testing; the seed fixes the shuffle.
linear_split = train_test_split(X, y, test_size=0.3, random_state=30)
X_train, X_test, y_train, y_test = linear_split

In [20]:
# Fit ordinary least squares on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
lin = LinearRegression().fit(X_train, y_train)
lin

In [21]:
# R^2 of the linear model on the training split.
metrics.r2_score(y_train, lin.predict(X_train))

0.711431423210663

In [22]:
# R^2 of the linear model on the held-out test split.
metrics.r2_score(y_test, lin.predict(X_test))

0.7386380555936436

In [23]:
# Linear-model salary predictions for the held-out test rows.
y_pred = lin.predict(X_test)

In [24]:
# Mean absolute error on the test split.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric,
# so the value is unchanged, but the correct order keeps this cell right if
# the metric is ever swapped for an asymmetric one (e.g. R^2 or MAPE).
metrics.mean_absolute_error(y_test, y_pred)

20888.28026520387

# Nonlinear (Polynomial) Regression

In [25]:
from sklearn.preprocessing import PolynomialFeatures

In [47]:
# Expand the scaled features into all polynomial terms up to degree 4.
poly = PolynomialFeatures(degree=4).fit(X)
X_poly = poly.transform(X)

In [48]:
# Same 70/30 split and seed as before, now on the polynomial feature matrix.
# This rebinds X_train / X_test / y_train / y_test.
poly_split = train_test_split(X_poly, y, test_size=0.3, random_state=30)
X_train, X_test, y_train, y_test = poly_split

In [49]:
# Least-squares regressor that will be trained on the polynomial features;
# the model stays linear in its coefficients.
nonlin = LinearRegression()

In [50]:
# Train on the polynomial-feature training split.
nonlin.fit(X_train, y_train)

In [51]:
# R^2 of the polynomial model on the training split.
metrics.r2_score(y_train, nonlin.predict(X_train))

0.8779868100064432

In [52]:
# R^2 of the polynomial model on the held-out test split.
metrics.r2_score(y_test, nonlin.predict(X_test))

0.8456454829036475

In [53]:
# Polynomial-model salary predictions for the held-out test rows
# (rebinds y_pred from the previous section).
y_pred = nonlin.predict(X_test)

In [54]:
# Mean absolute error of the polynomial model on the test split.
# Arguments follow scikit-learn's (y_true, y_pred) order; MAE is symmetric,
# so the reported value is unchanged.
metrics.mean_absolute_error(y_test, y_pred)

14987.257356121017

# Random Forest Regression

In [34]:
from sklearn.ensemble import RandomForestRegressor

In [35]:
# Re-create the 70/30 split on the plain scaled features (the polynomial
# section rebound the split variables to X_poly).
forest_split = train_test_split(X, y, test_size=0.3, random_state=30)
X_train, X_test, y_train, y_test = forest_split

In [36]:
# Random forest with a fixed seed. Without random_state the bootstrap samples
# and feature sub-sampling differ on every run, so the scores reported below
# could not be reproduced. 30 matches the seed used for the splits.
rf = RandomForestRegressor(random_state=30)

In [37]:
# Train the random forest on the training split.
rf.fit(X_train, y_train)

In [38]:
# R^2 of the random forest on the training split.
metrics.r2_score(y_train, rf.predict(X_train))

0.991821992511171

In [39]:
# R^2 of the random forest on the held-out test split.
metrics.r2_score(y_test, rf.predict(X_test))

0.9571492065429524

In [40]:
# Random-forest salary predictions for the held-out test rows
# (rebinds y_pred from the previous section).
y_pred = rf.predict(X_test)

In [41]:
# Mean absolute error of the random forest on the test split.
# Arguments follow scikit-learn's (y_true, y_pred) order; MAE is symmetric,
# so the reported value is unchanged.
metrics.mean_absolute_error(y_test, y_pred)

5306.380296840807

# Cross-Validation on Random Forest

In [42]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [43]:
# 4-fold splitter; rows are shuffled with a fixed seed before being folded.
folds = KFold(n_splits=4, shuffle=True, random_state=30)

In [44]:
# Score the random forest on every fold of the full (scaled) feature matrix,
# recording training scores alongside the test scores.
cv = cross_validate(
    estimator=rf,
    X=X,
    y=y,
    cv=folds,
    return_train_score=True,
)

In [45]:
# Average training score across the folds.
np.mean(cv['train_score'])

0.9923237922995976

In [46]:
# Average held-out score across the folds.
np.mean(cv['test_score'])

0.9540739079992484