In [54]:
import pandas as pd
import numpy as np

In [55]:
df = pd.read_csv("Salary_Data.csv")
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


Dropping duplicates if rows have all the same data in all columns

In [57]:
df.drop_duplicates(inplace=True)

Dropping NaN (not a number)

In [58]:
df.isna()
df.dropna()
print(df)

      Age  Gender Education Level                            Job Title  \
0    32.0    Male      Bachelor's                    Software Engineer   
1    28.0  Female        Master's                         Data Analyst   
2    45.0    Male             PhD                       Senior Manager   
3    36.0  Female      Bachelor's                      Sales Associate   
4    52.0    Male        Master's                             Director   
..    ...     ...             ...                                  ...   
348  28.0  Female      Bachelor's            Junior Operations Manager   
349  36.0    Male      Bachelor's  Senior Business Development Manager   
350  44.0  Female             PhD                Senior Data Scientist   
351  31.0    Male      Bachelor's         Junior Marketing Coordinator   
371  43.0    Male        Master's               Director of Operations   

     Years of Experience    Salary  
0                    5.0   90000.0  
1                    3.0   65000.0  


In [59]:
# dropping salaries below 0
# keeps the salaries that are greater than or equal to 0
df = df[df['Salary'] >= 0]
# dropping age above 150
df = df[df['Age'] < 150]
print(df)

      Age  Gender Education Level                            Job Title  \
0    32.0    Male      Bachelor's                    Software Engineer   
1    28.0  Female        Master's                         Data Analyst   
2    45.0    Male             PhD                       Senior Manager   
3    36.0  Female      Bachelor's                      Sales Associate   
4    52.0    Male        Master's                             Director   
..    ...     ...             ...                                  ...   
348  28.0  Female      Bachelor's            Junior Operations Manager   
349  36.0    Male      Bachelor's  Senior Business Development Manager   
350  44.0  Female             PhD                Senior Data Scientist   
351  31.0    Male      Bachelor's         Junior Marketing Coordinator   
371  43.0    Male        Master's               Director of Operations   

     Years of Experience    Salary  
0                    5.0   90000.0  
1                    3.0   65000.0  


<b> Adding one hot encoding </b> and displaying the first 5 rows of results

In [60]:
one_hot = pd.get_dummies(df)
one_hot.head()
print(one_hot)

      Age  Years of Experience    Salary  Gender_Female  Gender_Male  \
0    32.0                  5.0   90000.0          False         True   
1    28.0                  3.0   65000.0           True        False   
2    45.0                 15.0  150000.0          False         True   
3    36.0                  7.0   60000.0           True        False   
4    52.0                 20.0  200000.0          False         True   
..    ...                  ...       ...            ...          ...   
348  28.0                  1.0   35000.0           True        False   
349  36.0                  8.0  110000.0          False         True   
350  44.0                 16.0  160000.0           True        False   
351  31.0                  3.0   55000.0          False         True   
371  43.0                 19.0  170000.0          False         True   

     Education Level_Bachelor's  Education Level_Master's  \
0                          True                     False   
1            

<b> Plotly </b> <br>
Using plotly to visualize the target data with the different features

In [61]:
import plotly.express as px

# see overall distribution
distribution = px.histogram(df, x ="Salary", color_discrete_sequence=['lightskyblue'])
distribution.show()

In [62]:
ageGen = px.box(df, x="Salary", y="Gender", color_discrete_sequence=['pink'])
ageGen.show()

In [63]:
# use target data (salary) corresponding to each feature
salGen = px.scatter(df, x="Age", y="Salary", color_discrete_sequence=['plum'])
salGen.show()

<b>Moving onto creating the linear models</b><br>
The first step we can take is sorting out our features vs our target data

In [64]:
one_hot.columns
x = one_hot.drop(columns='Salary')
#x = one_hot.drop("Salary", axis = 1)
y = one_hot['Salary']

<b>Splitting up the training and testing data</b>

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size=0.2,random_state=21)

Further split the training data into training set + validation set

In [66]:
X_train, X_val, Y_train, Y_val = train_test_split(X_test, Y_test, test_size=.5, random_state=21)
# splitting the testing data in half so half is testing and half is validation set

<b>Time to actually <i>train</i> the model!</b>

In [67]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression() # Creating an Instance of LinearRegression model

lm.fit(X_train,Y_train) # Train/fit on the trainingdata

print(lm.coef_)

[ 1.10951883e+03  4.86702705e+03 -9.65255119e+03  9.65255119e+03
 -1.57521746e+04  2.35616714e+03  1.33960074e+04  1.45519152e-11
  0.00000000e+00 -1.81898940e-12 -9.32232069e-12 -6.36646291e-12
  9.09494702e-12  1.36424205e-12  3.63797881e-12  3.63797881e-12
 -4.54747351e-12  5.45696821e-12  7.27595761e-12  4.54747351e-13
 -2.72848411e-12  0.00000000e+00 -1.81898940e-12 -5.68434189e-13
  0.00000000e+00 -1.81898940e-12  0.00000000e+00 -1.81898940e-12
 -1.81898940e-12 -9.09494702e-13  4.54747351e-13  0.00000000e+00
  1.98660474e+04  0.00000000e+00  1.61085392e+04  0.00000000e+00
  0.00000000e+00 -5.00097966e+03  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  4.61580387e+03  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000

<b>Predicting the data</b>

In [68]:
prediction = lm.predict(X_test)

In [69]:
import plotly.express as px

px.scatter(x = Y_test,y= prediction, color_discrete_sequence=['hotpink'])



<b>Evaluating the model</b>

In [70]:
from sklearn import metrics
print('MAE = ', metrics.mean_absolute_error(Y_test,prediction) )
print('MSE = ', metrics.mean_squared_error(Y_test,prediction))
print('RMSE = ', np.sqrt(metrics.mean_squared_error(Y_test, prediction)))

MAE =  5973.501787254443
MSE =  120252487.6453343
RMSE =  10965.969526007917
