In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Load the Levels.fyi salary dataset and keep only the columns used below.
df = pd.read_csv('https://raw.githubusercontent.com/mattbal/SalaryPredictor/main/Levels_Fyi_Salary_Data.csv')
df = df[['timestamp', 'title', 'basesalary', 'company', 'location', 'yearsofexperience']]

# Restrict dataset to only specific attributes
# NOTE(review): 'Cupertino, WA', 'San Francisco, WA' and 'Sunnyvale, WA' look like typos for
# the CA cities; the encoded feature matrix has no location_* columns for them, so they
# apparently match no rows. Correcting them adds feature columns, so any code that builds
# feature vectors by position must be updated in the same change.
validLocations = ['Seattle, WA', 'Redmond, WA', 'Cupertino, WA', 'San Francisco, WA', 'New York, NY', 'Sunnyvale, WA', 'Boston, MA', 'Mountain View, CA', 'Austin, TX']
validTitles = ["Software Engineer", "Data Scientist", "Product Manager"]
validCompanies = ['Amazon', 'Apple', 'Microsoft', 'Facebook', 'Cisco', 'Google']

# Filter for specific locations, titles, and companies
df = df[df.location.isin(validLocations) & df.title.isin(validTitles) & df.company.isin(validCompanies)]

# Filter out any salaries from pre-2020. Salaries have gone up so much since 2020,
# that anything from before then hurts our model's performance.
# The parsed timestamps are only used to build the mask (nothing is assigned back onto the
# filtered slice, which avoids pandas' SettingWithCopyWarning), and the column is dropped
# afterwards since it is no longer needed. The negated `<` keeps rows with a missing
# timestamp, exactly as before -- NOTE(review): confirm that is intended.
is_pre_2020 = pd.to_datetime(df['timestamp']) < '2020-01-01'
df = df[~is_pre_2020].drop('timestamp', axis=1)

# Set X to input parameters
X = df.drop('basesalary', axis=1)
# Set y to output parameter
y = df['basesalary']

# Convert categorical variables (one-hot, dropping the first level of each as the baseline)
converted_categories = pd.get_dummies(X, drop_first=True)
# Drop the raw categorical columns, leaving only yearsofexperience
X = X.drop(['title', 'location', 'company'], axis=1)
# Generate new columns for each categorical value
# NOTE(review): converted_categories still contains the numeric yearsofexperience column,
# so the concatenation yields that column twice. It is kept here because removing it changes
# the number and order of model features; remove it together with any positional feature code.
X = pd.concat([X, converted_categories], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Create and fit the LinearRegression model
LR = LinearRegression()
LR.fit(X_train, y_train)

# Generate predictions
y_prediction = LR.predict(X_test)

# Evaluate effectiveness (the MSE is computed once and reused for the RMSE)
score = r2_score(y_test, y_prediction)
mse = mean_squared_error(y_test, y_prediction)
print('r2 score =', score)
print('mean squared error = ', mse)
print('root mean squared error =', np.sqrt(mse))

y_n = y_test.to_numpy()

# Print some examples of predictions vs their actual values
# (at most 15, fewer if the test split is smaller).
for inc in range(min(15, len(y_n))):
  print(y_prediction[inc], "----", y_n[inc], "----", round((y_prediction[inc] - y_n[inc]) / y_n[inc] * 100, 2), "% off")

# plot: salary against experience, overall and faceted by location
fig = px.scatter(df, x="yearsofexperience", y="basesalary", opacity=0.65, title='STEM Salaries', color='company', hover_data=['title'])
fig2 = px.scatter(df, x="yearsofexperience", y="basesalary", opacity=0.65, title='STEM Salaries', color='company', hover_data=['title'], facet_col='location')
fig.show()
fig2.show()

r2 score = 0.5267843350249033
mean squared error =  459272196.2666296
root mean squared error = 21430.636860966813
160785.1403796602 ---- 190000.0 ---- -15.38 % off
164639.68110536822 ---- 167000.0 ---- -1.41 % off
145798.1188952852 ---- 155000.0 ---- -5.94 % off
131564.20629762896 ---- 125000.0 ---- 5.25 % off
135126.6188952852 ---- 155000.0 ---- -12.82 % off
152295.60981630572 ---- 147000.0 ---- 3.6 % off
140756.3249499727 ---- 170000.0 ---- -17.2 % off
139185.1804187227 ---- 155000.0 ---- -10.2 % off
146854.68110536822 ---- 138000.0 ---- 6.42 % off
131782.1188952852 ---- 150000.0 ---- -12.15 % off
184747.03637575396 ---- 200000.0 ---- -7.63 % off
141999.03613161333 ---- 170000.0 ---- -16.47 % off
163583.1188952852 ---- 149000.0 ---- 9.79 % off
202531.03637575396 ---- 265000.0 ---- -23.57 % off
126528.32494997271 ---- 120000.0 ---- 5.44 % off


In [None]:
# Options offered at the prompts below. They are read from the filtered training frame `df`
# so that a user can only pick values the fitted model has actually seen. The previous
# hard-coded lists also offered 'Cupertino, WA', 'San Francisco, WA' and 'Sunnyvale, WA',
# for which the model has no location_* feature column.
validLocations = sorted(df['location'].unique())
validTitles = sorted(df['title'].unique())
validCompanies = sorted(df['company'].unique())

def PredictSal(experience, title, company, location):
  """Print the base salary predicted by the fitted model `LR` for one profile.

  The feature row is built from the columns of the training matrix `X` rather
  than from hard-coded positions, so it stays aligned with the model whenever
  the set or order of encoded columns changes.

  Parameters:
    experience: years of experience; written into every column named
      'yearsofexperience'.
    title, company, location: category values. The matching one-hot column
      ('title_<title>', 'company_<company>', 'location_<location>') is set
      to 1. A value with no matching column (the dropped baseline level, or a
      value absent from the training data) leaves that group's indicators at 0.

  Returns:
    None. The prediction is printed.
  """
  active_columns = {'title_' + title, 'company_' + company, 'location_' + location}

  row = []
  for column in X.columns:
    if column == 'yearsofexperience':
      row.append(experience)
    else:
      row.append(1 if column in active_columns else 0)

  # Pass a DataFrame carrying the training column names so the input's feature
  # names match the ones the model was fitted with.
  features = pd.DataFrame([row], columns=X.columns)

  print()
  print('Your predicted base salary: ', LR.predict(features)[0])

# Collect the four model inputs interactively, starting from sentinel values that
# fail validation so every prompt is shown at least once.
title = 'invalid'
company = 'invalid'
location = 'invalid'
experience = -1

while(title not in validTitles):
  print('Valid titles: ', validTitles)
  title = input('Please insert job title:  ').strip()

while(company not in validCompanies):
  print('Valid companies: ', validCompanies)
  company = input('Please insert company name:  ').strip()

while(location not in validLocations):
  print('Valid locations: ', validLocations)
  location = input('Please insert location:  ').strip()

while(experience < 0):
  try:
    experience = int(input('Please insert years of experience:  '))
  except ValueError:
    # Non-numeric entry: keep the sentinel so the loop asks again instead of crashing.
    experience = -1

PredictSal(experience, title, company, location)

Valid titles:  ['Software Engineer', 'Data Scientist', 'Product Manager']
Please insert job title:  Software Engineer
Valid companies:  ['Amazon', 'Apple', 'Microsoft', 'Facebook', 'Cisco', 'Google']
Please insert company name:  Apple
Valid companies:  ['Seattle, WA', 'Redmond, WA', 'Cupertino, WA', 'San Francisco, WA', 'New York, NY', 'Sunnyvale, WA', 'Boston, MA', 'Mountain View, CA', 'Austin, TX']
Please insert location:  Seattle, WA
Please insert years of experience:  4
[(4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)]
[(4, 4, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1)]
Your predicted base salary:  160785.1989734102


In [None]:
# Display the held-out feature matrix; its header shows the column order of the encoded features.
X_test

Unnamed: 0,yearsofexperience,yearsofexperience.1,title_Product Manager,title_Software Engineer,company_Apple,company_Cisco,company_Facebook,company_Google,company_Microsoft,"location_Boston, MA","location_Mountain View, CA","location_New York, NY","location_Redmond, WA","location_Seattle, WA"
46950,4.0,4.0,0,1,1,0,0,0,0,0,0,0,0,1
24026,6.0,6.0,0,1,0,0,0,1,0,0,0,1,0,0
27613,7.0,7.0,0,1,0,0,0,0,0,0,0,0,0,1
33327,2.0,2.0,0,1,0,0,0,0,1,0,0,0,0,1
51737,4.0,4.0,0,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36803,15.0,15.0,0,1,0,0,0,0,1,0,0,0,1,0
19460,9.0,9.0,0,1,0,1,0,0,0,0,0,0,0,0
23705,0.0,0.0,0,1,0,0,0,0,0,0,0,1,0,0
19655,8.0,8.0,1,0,0,0,1,0,0,0,0,0,1,0


In [None]:
# Scatter of job title against years of experience, coloured by company,
# with one facet column per location.
scatter_options = dict(
  x="yearsofexperience",
  y="title",
  opacity=0.65,
  title='STEM Salaries',
  color='company',
  hover_data=['title'],
  facet_col='location',
)
fig3 = px.scatter(df, **scatter_options)
fig3.show()