In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier

**Data Preprocessing**

-- Drop Nan, Duplicates, Non-sensical entries

-- One-hot encoding

-- Split train/valid/test sets

In [4]:
salary_df = pd.read_csv('Salary Data.csv')
salary_df = salary_df.dropna()
salary_df = salary_df.drop_duplicates()
# Age / Gender / Education Level / Job Title / Years of Experience / Salary
print(pd.unique(salary_df['Age']))
print(pd.unique(salary_df['Gender']))
print(pd.unique(salary_df['Education Level']))
# print(pd.unique(salary_df['Job Title']))
print(pd.unique(salary_df['Years of Experience']))
print(pd.unique(salary_df['Salary']))
# remove 350
salary_df = salary_df[salary_df.Salary != 350]
print(pd.unique(salary_df['Salary']))

[32. 28. 45. 36. 52. 29. 42. 31. 26. 38. 48. 35. 40. 27. 44. 33. 39. 25.
 51. 34. 47. 30. 41. 37. 24. 43. 50. 46. 49. 23. 53.]
['Male' 'Female']
["Bachelor's" "Master's" 'PhD']
[ 5.   3.  15.   7.  20.   2.  12.   4.   1.  10.  18.   6.  14.  16.
  0.  22.  19.   9.  13.  11.  25.  21.   8.  17.  23.  24.   0.5  1.5]
[ 90000.  65000. 150000.  60000. 200000.  55000. 120000.  80000.  45000.
 110000.  75000. 140000. 130000.  40000. 125000. 115000.  35000. 180000.
 190000.  50000. 250000. 170000. 160000.  85000.  95000. 105000.  70000.
 100000.  30000. 135000. 220000. 175000. 185000. 145000. 155000.    350.]
[ 90000.  65000. 150000.  60000. 200000.  55000. 120000.  80000.  45000.
 110000.  75000. 140000. 130000.  40000. 125000. 115000.  35000. 180000.
 190000.  50000. 250000. 170000. 160000.  85000.  95000. 105000.  70000.
 100000.  30000. 135000. 220000. 175000. 185000. 145000. 155000.]


In [5]:
genderEncoder = OneHotEncoder(handle_unknown='error')
eduEncoder = OneHotEncoder(handle_unknown='error')
jobEncoder = OneHotEncoder(handle_unknown='error')

gender_onehot = genderEncoder.fit_transform(salary_df[['Gender']]).toarray()
# edu_onehot = pd.DataFrame(eduEncoder.fit_transform(salary_df[['Education Level']]).toarray(), columns=eduEncoder.categories_[0])
edu_onehot = eduEncoder.fit_transform(salary_df[['Education Level']]).toarray()
job_onehot = jobEncoder.fit_transform(salary_df[['Job Title']]).toarray()

# x = salary_df['Age'].astype(str) + salary_df['Years of Experience'] + gender_onehot + edu_onehot + job_onehot
age = salary_df['Age'].to_numpy().reshape(-1, 1)
years = salary_df['Years of Experience'].to_numpy().reshape(-1,1)

npArray = np.append(age, years, axis=1)
ageYearArray = npArray.copy()
npArray = np.append(npArray, gender_onehot, axis=1)
npArray = np.append(npArray, edu_onehot, axis=1)
npArray = np.append(npArray, job_onehot, axis=1)

salary = salary_df['Salary']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(npArray, salary, test_size=0.2, random_state=42)
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [7]:
model = LinearRegression()
reg = model.fit(x_train, y_train)
print(reg.score(x_train, y_train))

y_trainPred = reg.predict(x_train)
y_valPred = reg.predict(x_valid)
y_testPred = reg.predict(x_test)

print(mean_squared_error(y_trainPred, y_train))
print(mean_squared_error(y_valPred, y_valid))
print(mean_squared_error(y_testPred, y_test))


0.9880001588217816
29184276.61749031
3.9938938250295484e+28
1.5901264454995665e+30


In [8]:
# Using only Age and Years of Experience
x_train, x_test, y_train, y_test = train_test_split(ageYearArray, salary, test_size=0.2, random_state=42)
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

model2 = LinearRegression()
reg2 = model2.fit(x_train, y_train)
print(reg2.score(x_train, y_train))

y_trainPred = reg2.predict(x_train)
y_valPred = reg2.predict(x_valid)
y_testPred = reg2.predict(x_test)

print(mean_squared_error(y_trainPred, y_train))
print(mean_squared_error(y_valPred, y_valid))
print(mean_squared_error(y_testPred, y_test))


0.8606369262524062
338938693.7800567
339508980.21290255
267099120.00576642


In [16]:
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(2, 2), random_state=1)
mlp.fit(x_train, y_train)

y_trainPred = mlp.predict(x_train)
y_valPred = mlp.predict(x_valid)
y_testPred = mlp.predict(x_test)

print(mean_squared_error(y_trainPred, y_train))
print(mean_squared_error(y_valPred, y_valid))
print(mean_squared_error(y_testPred, y_test))

6048352713.178294
5057031250.0
6258333333.333333
