In [22]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('glassdoor_data_eda.csv')

In [3]:
df.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Size', 'Founded', 'Type of ownership',
       'Industry', 'Sector', 'Revenue', 'hourly_salary', 'min_salary',
       'max_salary', 'avg_salary', 'job_type', 'seniority',
       'cleaned_description', 'sas_flag', 'xgboost_flag', 'scikit_flag',
       'pandas_flag', 'aws_flag', 'redshift_flag', 'apache_flag', 'spark_flag',
       'hadoop_flag', 'numpy_flag', 'scipy_flag', 'pyspark_flag', 'linux_flag',
       'jupyter_flag', 'sql_flag', 'cloud_flag', 'python_flag', 'phd_flag',
       'tableau_flag', 'len_description', 'state', 'city'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,...,linux_flag,jupyter_flag,sql_flag,cloud_flag,python_flag,phd_flag,tableau_flag,len_description,state,city
0,Sr. Data Scientist,71-118\r\n,Medifast is seeking a full-time Senior Data Sc...,3.0,"Medifast, Inc.","Baltimore, MD",501 to 1000 Employees,1980.0,Company - Public,"Health, Beauty, & Fitness",...,1,1,1,1,1,1,1,3149,MD,Baltimore
1,Scientist II,49-106\r\n,Scientist II*\r\nAs a Scientist II (SII) at In...,3.3,"Invivoscribe, Inc.","San Diego, CA",51 to 200 Employees,1995.0,Company - Private,Biotech & Pharmaceuticals,...,0,0,0,0,0,1,0,2330,CA,San Diego
2,Medical Laboratory Scientist,17-23 \r\n,Job Summary:\r\n\r\nPerforms and interprets te...,3.8,Cleveland Clinic,"Akron, OH",10000+ Employees,1921.0,Nonprofit Organization,Health Care Services & Hospitals,...,0,0,0,0,0,0,0,6212,OH,Akron
3,IT Data Analyst,44-81\r\n,Federated Insurance is looking for a passionat...,3.8,Federated Mutual Insurance Company,"Owatonna, MN",1001 to 5000 Employees,1904.0,Company - Private,Insurance Carriers,...,0,0,1,0,0,0,0,2679,MN,Owatonna
4,Business Intelligence Analyst I,38-76\r\n,GreenState Credit Union\r\n\r\nBusiness Intell...,3.6,GreenState Credit Union,"North Liberty, IA",501 to 1000 Employees,1938.0,Nonprofit Organization,Banks & Credit Unions,...,0,0,1,0,1,0,0,4429,IA,North Liberty


In [5]:
# Dropping Unnecessary columns
df.drop(['Job Title', 'Salary Estimate', 'Job Description', 'Company Name', 'Location', 'Founded', 'hourly_salary', \
         'min_salary', 'max_salary', 'cleaned_description', 'len_description'], axis=1, inplace=True)

In [6]:
# Dropping rows with null numeric column
df = df.dropna(subset=['Rating'])

In [7]:
df.columns

Index(['Rating', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'avg_salary', 'job_type', 'seniority', 'sas_flag', 'xgboost_flag',
       'scikit_flag', 'pandas_flag', 'aws_flag', 'redshift_flag',
       'apache_flag', 'spark_flag', 'hadoop_flag', 'numpy_flag', 'scipy_flag',
       'pyspark_flag', 'linux_flag', 'jupyter_flag', 'sql_flag', 'cloud_flag',
       'python_flag', 'phd_flag', 'tableau_flag', 'state', 'city'],
      dtype='object')

In [8]:
# Encoding categorical variables
df = pd.get_dummies(df)

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error

In [24]:
X = df.drop('avg_salary', axis=1)
y = df['avg_salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print('Train size:', X_train.shape, '\nTest size:', X_test.shape  )

Train size: (441, 386) 
Test size: (49, 386)


In [53]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Train error: ', cross_val_score(lr, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, lr.predict(X_test)))

Train error:  [-3.35019019e+14 -2.01253298e+14 -7.47509336e+13] 
Test error:  248795613439953.1


In [55]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print('Train error: ', cross_val_score(knn, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, knn.predict(X_test)))

Train error:  [-22.45782313 -21.40340136 -21.73877551] 
Test error:  21.910204081632653


In [56]:
from sklearn.tree import DecisionTreeRegressor
dtree = DecisionTreeRegressor(random_state=1)
dtree.fit(X_train, y_train)
print('Train error: ', cross_val_score(knn, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, dtree.predict(X_test)))

Train error:  [-22.45782313 -21.40340136 -21.73877551] 
Test error:  17.520408163265305


In [57]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=1)
rf.fit(X_train, y_train)
print('Train error: ', cross_val_score(rf, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, rf.predict(X_test)))

Train error:  [-17.15648413 -17.29278669 -17.87295918] 
Test error:  15.54961224489796


In [58]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators':range(10,300,10),
              'criterion':('mse','mae'), 
              'max_features':('auto','sqrt','log2')
             }

In [59]:
gs = GridSearchCV(rf,parameters,scoring='neg_mean_absolute_error',cv=3)
gs.fit(X_train,y_train)

KeyboardInterrupt: 