# Import required libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Helper Function

In [3]:
def display_dataset_info(dataframe):
  print('----------------------------- Salary Predictions General Information ---------------------------------')
  print(dataframe.info())

# Obtain the csv data

Use the pandas read_csv() function to convert the csv data into a dataframe

In [13]:
dataset_link = './Salary Data.csv'
df = pd.read_csv(dataset_link)

# Display the Dataset Information

In [14]:
display_dataset_info(df)

----------------------------- Salary Predictions General Information ---------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB
None


# Perform Data Preprocessing Steps.

Drop rows with empty values, drop rows with NaN values

In [15]:
# Drop duplicate values except for the first instance
df = df.drop_duplicates()
# Drop NaN values from the dataset, and check all columns
df = df.dropna(subset=["Age","Gender","Education Level","Job Title","Years of Experience","Salary"])
# Only use columns with a salary greater than 0
df = df[df['Salary'] > 0]

# Display the Dataset Information After Preprocessing

In [8]:
display_dataset_info(df)

----------------------------- Salary Predictions General Information ---------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 324 entries, 0 to 371
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  324 non-null    float64
 1   Gender               324 non-null    object 
 2   Education Level      324 non-null    object 
 3   Job Title            324 non-null    object 
 4   Years of Experience  324 non-null    float64
 5   Salary               324 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB
None


# One Hot Encode the dataset

get_dummies() will automatically one hot encode the object, string, and category data types. This will convert the Gender, Education Level, and Job Title columns in the dataset.

Produce a higher mean-squared-error than label encoding

In [9]:
df = pd.get_dummies(df)
display_dataset_info(df)

----------------------------- Salary Predictions General Information ---------------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 324 entries, 0 to 371
Columns: 182 entries, Age to Job Title_Web Developer
dtypes: bool(179), float64(3)
memory usage: 66.8 KB
None


In [10]:
# Display the first 5 rows of the one hot encoded dataframe
df.head()

Unnamed: 0,Age,Years of Experience,Salary,Gender_Female,Gender_Male,Education Level_Bachelor's,Education Level_Master's,Education Level_PhD,Job Title_Account Manager,Job Title_Accountant,...,Job Title_Supply Chain Manager,Job Title_Technical Recruiter,Job Title_Technical Support Specialist,Job Title_Technical Writer,Job Title_Training Specialist,Job Title_UX Designer,Job Title_UX Researcher,Job Title_VP of Finance,Job Title_VP of Operations,Job Title_Web Developer
0,32.0,5.0,90000.0,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,28.0,3.0,65000.0,True,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,45.0,15.0,150000.0,False,True,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,36.0,7.0,60000.0,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,52.0,20.0,200000.0,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# Display the last 5 rows of the one hot encoded dataframe
df.tail()

Unnamed: 0,Age,Years of Experience,Salary,Gender_Female,Gender_Male,Education Level_Bachelor's,Education Level_Master's,Education Level_PhD,Job Title_Account Manager,Job Title_Accountant,...,Job Title_Supply Chain Manager,Job Title_Technical Recruiter,Job Title_Technical Support Specialist,Job Title_Technical Writer,Job Title_Training Specialist,Job Title_UX Designer,Job Title_UX Researcher,Job Title_VP of Finance,Job Title_VP of Operations,Job Title_Web Developer
348,28.0,1.0,35000.0,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
349,36.0,8.0,110000.0,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
350,44.0,16.0,160000.0,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
351,31.0,3.0,55000.0,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
371,43.0,19.0,170000.0,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Label Encode the Dataset

Label encode all categorical data columns

Produces a lower mean-square-error than one hot encoding

In [16]:
le = LabelEncoder()

# Encode labels in column 'Gender'.
df['Gender'] = le.fit_transform(df['Gender'])
df['Gender'].unique()

# Encode labels in column 'Gender'.
df['Education Level'] = le.fit_transform(df['Education Level'])
df['Education Level'].unique()

# Encode labels in column 'Gender'.
df['Job Title'] = le.fit_transform(df['Job Title'])
df['Job Title'].unique()

array([159,  17, 130, 101,  22,  81,  93, 104,  82, 150, 158,  40,  36,
        96,  13,  89,  83, 116,  18, 102,   3, 172,  44,  98,  37, 157,
       160,  57, 112,  92,   6,   1,  19,  84, 167,  39,  95,  16, 103,
       169,  88,  85,   2, 162,  10,   0,  29,  41,  12,   5,  34, 171,
        38, 170, 156,  30, 115,  47,  21,  43,  14,   4, 118, 173,  99,
       166,  11, 153,  42,   9, 165, 106,   8,  56,  35,  46, 144,  91,
       164, 134, 168, 100,  76,  97,  87,  94, 122,  80, 143,   7,  20,
        45, 131,  15, 120, 161, 163, 110,  62,  86,  90,  61, 141,  66,
       121, 105, 152,  79, 154, 146,  73,  64,  54, 142,  49, 148,  65,
        70, 109,  32,  71, 111, 139,  53, 129,  59, 138,  28,  77, 149,
        31,  52, 132, 125,  50, 108, 147,  60,  25,  63,  55, 136, 124,
       155,  69, 135, 128, 145,  33, 107,  23,  74, 126, 113,  27,  48,
        78, 133, 127, 117,  51,  75, 140,  68, 151,  72, 119, 123, 114,
        67,  26, 137,  58,  24])

In [198]:
# Display the first 5 rows of the label encoded dataframe
df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,1,0,159,5.0,90000.0
1,28.0,0,1,17,3.0,65000.0
2,45.0,1,2,130,15.0,150000.0
3,36.0,0,0,101,7.0,60000.0
4,52.0,1,1,22,20.0,200000.0


In [17]:
# Display the last 5 rows of the label encoded dataframe
df.tail()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
348,28.0,0,0,68,1.0,35000.0
349,36.0,1,0,111,8.0,110000.0
350,44.0,0,2,115,16.0,160000.0
351,31.0,1,0,63,3.0,55000.0
371,43.0,1,1,30,19.0,170000.0


# Create the samples and target vectors

Store the salary column as the target vector (Y), and the remaning columns as the sample vector (X)

In [18]:
X = np.array(df.loc[:, df.columns != 'Salary'])
Y = np.array(df['Salary'])

Display information about the sample and target vectors

In [19]:
print("X shape: ", X.shape)
print("Y shape: ", Y.shape)

X shape:  (324, 5)
Y shape:  (324,)


# Split the sample and target vectors into a train and test dataset

Use a 80% train set, 10% validation set, and 10% test dataset split

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
X_test, X_cv, Y_test, Y_cv = train_test_split(X_train, Y_train, test_size=0.5)

Display information about the vectors

In [21]:
print("X_train shape: ", X_train.shape)
print("Y_train shape: ", Y_train.shape)

X_train shape:  (259, 5)
Y_train shape:  (259,)


In [22]:
print("X_test shape: ", X_test.shape)
print("Y_test shape: ", Y_test.shape)

X_test shape:  (129, 5)
Y_test shape:  (129,)


In [23]:
print("X_cv shape: ", X_cv.shape)
print("Y_cv shape: ", Y_cv.shape)

X_cv shape:  (130, 5)
Y_cv shape:  (130,)


# Run a linear regression on the data

Use the scikit-learn library

In [24]:
reg = linear_model.LinearRegression()
reg.fit(X_train, Y_train)

Make predictions using the testing set

In [25]:
Y_pred = reg.predict(X_test)

Display information about the prediction vector

In [26]:
print("Y_pred shape: ", Y_pred.shape)

Y_pred shape:  (129,)


Evaulate the model's performance

In [27]:
# The coefficients
print("Coefficients: \n", reg.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(Y_test, Y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(Y_test, Y_pred))

Coefficients: 
 [2.91388069e+03 8.90653368e+03 1.50592886e+04 9.52172557e+00
 2.77641296e+03]
Mean squared error: 214489068.95
Coefficient of determination: 0.91
