Question of interest: Which working scenario (combination of company, job title, education level, and years of experience) gives the highest total yearly compensation?

In [64]:
# libraries
import pandas as pd

In [71]:
# Read the five FAANG company CSVs and stack them into one DataFrame.
facebook = pd.read_csv('data/facebook.csv')
netflix = pd.read_csv('data/netflix.csv')
amazon = pd.read_csv('data/amazon.csv')
apple = pd.read_csv('data/apple.csv')
google = pd.read_csv('data/google.csv')

arr = [facebook, netflix, amazon, apple, google]

# ignore_index=True: every CSV is read with its own default 0..n-1 index, so a
# plain concat would repeat row labels across companies and make label-based
# selection on `faang` ambiguous.
faang = pd.concat(arr, ignore_index = True).drop(columns = "location") # remove location

In [72]:
# create dummy variables
# `faang` is reassigned by this cell, so once 'company' and 'title' have been
# replaced by their dummy columns a plain re-run would raise a KeyError.
# Encoding only the categorical columns that are still present keeps the cell
# safe to run more than once.
categorical_cols = [col for col in ['company', 'title'] if col in faang.columns]
if categorical_cols:
    faang = pd.get_dummies(faang, columns = categorical_cols)

Index(['company', 'title', 'totalyearlycompensation', 'yearsofexperience',
       'yearsatcompany', 'basesalary', 'bonus', 'Masters_Degree',
       'Bachelors_Degree', 'Doctorate_Degree', 'Highschool', 'Some_College',
       'facebook', 'Apple', 'Amazon'],
      dtype='object')

In [66]:
# create summary statistics
# Displays count, mean, std, min, quartiles and max for the columns of `faang`
# as the cell's rich output.
faang.describe()

Unnamed: 0,totalyearlycompensation,yearsofexperience,yearsatcompany,basesalary,bonus,Masters_Degree,Bachelors_Degree,Doctorate_Degree,Highschool,Some_College
count,17920.0,17920.0,17920.0,17920.0,17920.0,17920.0,17920.0,17920.0,17920.0,17920.0
mean,270125.0,7.229079,2.262899,147161.745647,23102.574442,0.241964,0.162723,0.041908,0.004799,0.004241
std,158051.6,5.765433,2.600236,67179.036821,30535.354548,0.428285,0.369123,0.200386,0.069111,0.064987
min,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179000.0,3.0,0.0,121000.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,235000.0,6.0,2.0,150000.0,19000.0,0.0,0.0,0.0,0.0,0.0
75%,320000.0,10.0,3.0,170000.0,30000.0,0.0,0.0,0.0,0.0,0.0
max,4980000.0,42.0,27.0,893000.0,555000.0,1.0,1.0,1.0,1.0,1.0


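Create train and test sets. **TODO:** no cell shown here defines `X_train`, `X_test`, `y_train`, or `y_test`, which every modeling cell below uses; the split must be created in this section before those cells can run on a fresh kernel.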

Different modeling methods

In [None]:
# fit linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# create linear regression model
def lm_mod(X_train, y_train, X_test, y_test):
    """Fit a linear regression on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target used to evaluate the fitted model.

    Returns
    -------
    The mean squared error of the test-set predictions. The value is also
    printed, as before, so existing cell output is unchanged.
    """
    lm = LinearRegression()
    lm.fit(X_train, y_train) # create model
    lm_preds = lm.predict(X_test)
    mse_lm = mean_squared_error(y_test, lm_preds)
    print(mse_lm)
    # Return the score so callers can store it and compare models, instead of
    # only being able to read it off the printed output.
    return mse_lm
    
# NOTE(review): X_train, y_train, X_test and y_test are not created in any cell
# shown above this one - confirm the train/test split runs before this call.
lm_mod(X_train, y_train, X_test, y_test)

In [None]:
# fit lasso regression model
from sklearn.linear_model import Lasso
from sklearn import linear_model

# create lasso regression model
def lasso_mod(X_train, y_train, X_test, y_test, alpha = 0.5):
    """Fit a lasso regression on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target used to evaluate the fitted model.
    alpha
        Regularisation strength passed to ``Lasso``. Defaults to 0.5, the
        value that was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    The mean squared error of the test-set predictions. The value is also
    printed, as before, so existing cell output is unchanged.
    """
    lasso = linear_model.Lasso(alpha = alpha)
    lasso.fit(X_train, y_train) # create model
    lasso_preds = lasso.predict(X_test)
    mse_lasso = mean_squared_error(y_test, lasso_preds)
    print(mse_lasso)
    # Return the score so different alpha values / models can be compared in code.
    return mse_lasso
          
# NOTE(review): X_train, y_train, X_test and y_test are not created in any cell
# shown above this one - confirm the train/test split runs before this call.
lasso_mod(X_train, y_train, X_test, y_test)

In [None]:
# fit K-nearest neighbors
from sklearn.neighbors import KNeighborsRegressor

# create k-nearest neighbors model
def knn_mod(X_train, y_train, X_test, y_test, n_neighbors = 10):
    """Fit a k-nearest-neighbours regressor on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target used to evaluate the fitted model.
    n_neighbors
        Number of neighbours passed to ``KNeighborsRegressor``. Defaults to
        10, the value that was previously hard-coded, so existing calls behave
        the same.

    Returns
    -------
    The mean squared error of the test-set predictions. The value is also
    printed, as before, so existing cell output is unchanged.
    """
    knn = KNeighborsRegressor(n_neighbors = n_neighbors)
    knn.fit(X_train, y_train) # create model
    knn_preds = knn.predict(X_test)
    mse_knn = mean_squared_error(y_test, knn_preds)
    print(mse_knn)
    # Return the score so different k values / models can be compared in code.
    return mse_knn
        
# NOTE(review): X_train, y_train, X_test and y_test are not created in any cell
# shown above this one - confirm the train/test split runs before this call.
knn_mod(X_train, y_train, X_test, y_test)

In [None]:
# multinomial naive bayes
# `metrics`, `roc_auc_score` and `cross_val_score` are not imported by any cell
# shown earlier in the notebook, so they are imported here to keep this cell
# runnable on a fresh kernel.
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): precision/recall/F1 with their default settings and the
# [:, 1] probability column below assume a binary target - confirm that
# y_train / y_test hold a two-class label rather than raw compensation.

# create model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# create predictions
nb_preds = nb.predict(X_test)
# probability of the positive class (classes_[1]); used for ROC AUC below
nb_scores = nb.predict_proba(X_test)[:, 1]

# calculate model metrics
nb_precision = metrics.precision_score(y_test, nb_preds)
nb_recall = metrics.recall_score(y_test, nb_preds)
# f1_score matches 2PR/(P+R) and returns 0 instead of NaN when P + R == 0
nb_f1 = metrics.f1_score(y_test, nb_preds)

nb_accuracy = metrics.accuracy_score(y_test, nb_preds)
# ROC AUC needs continuous scores; hard 0/1 predictions collapse the curve to a
# single threshold and misstate the ranking quality of the model.
nb_auc = roc_auc_score(y_test, nb_scores)

# cross validation score
nb_cv = cross_val_score(nb, X_train, y_train, cv = 10).mean()

In [None]:
# logistic regression
# `metrics`, `roc_auc_score` and `cross_val_score` are not imported by any cell
# shown earlier in the notebook, so they are imported here to keep this cell
# runnable on a fresh kernel.
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# NOTE(review): precision/recall/F1 with their default settings and the
# [:, 1] probability column below assume a binary target - confirm that
# y_train / y_test hold a two-class label rather than raw compensation.

# create logistic regression model
# random_state fixes the data shuffling done by the liblinear solver so the
# fitted coefficients are reproducible between runs.
logreg = LogisticRegression(solver = 'liblinear', C = 1, random_state = 42)
logreg.fit(X_train, y_train)

# create predictions
logreg_preds = logreg.predict(X_test)
# probability of the positive class (classes_[1]); used for ROC AUC below
logreg_scores = logreg.predict_proba(X_test)[:, 1]

# calculate model metrics
logreg_precision = metrics.precision_score(y_test, logreg_preds)
logreg_recall = metrics.recall_score(y_test, logreg_preds)
# f1_score matches 2PR/(P+R) and returns 0 instead of NaN when P + R == 0
logreg_f1 = metrics.f1_score(y_test, logreg_preds)

logreg_accuracy = metrics.accuracy_score(y_test, logreg_preds)
# ROC AUC needs continuous scores; hard 0/1 predictions collapse the curve to a
# single threshold and misstate the ranking quality of the model.
logreg_auc = roc_auc_score(y_test, logreg_scores)

# cross validation score
logreg_cv = cross_val_score(logreg, X_train, y_train, cv = 10).mean()

In [None]:
# decision tree
# `metrics`, `roc_auc_score` and `cross_val_score` are not imported by any cell
# shown earlier in the notebook, so they are imported here to keep this cell
# runnable on a fresh kernel.
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): precision/recall/F1 with their default settings and the
# [:, 1] probability column below assume a binary target - confirm that
# y_train / y_test hold a two-class label rather than raw compensation.

# create decision tree classification model
# random_state makes tie-breaking between equally good splits reproducible.
tree = DecisionTreeClassifier(random_state = 42)
tree.fit(X_train, y_train)

# create predictions
tree_preds = tree.predict(X_test)
# probability of the positive class (classes_[1]); used for ROC AUC below
tree_scores = tree.predict_proba(X_test)[:, 1]

# calculate model metrics
tree_precision = metrics.precision_score(y_test, tree_preds)
tree_recall = metrics.recall_score(y_test, tree_preds)
# f1_score matches 2PR/(P+R) and returns 0 instead of NaN when P + R == 0
tree_f1 = metrics.f1_score(y_test, tree_preds)

tree_accuracy = metrics.accuracy_score(y_test, tree_preds)
# ROC AUC needs continuous scores; hard 0/1 predictions collapse the curve to a
# single threshold and misstate the ranking quality of the model.
tree_auc = roc_auc_score(y_test, tree_scores)

# cross validation score
tree_cv = cross_val_score(tree, X_train, y_train, cv = 10).mean()

In [None]:
# random forest
# `metrics`, `roc_auc_score` and `cross_val_score` are not imported by any cell
# shown earlier in the notebook, so they are imported here to keep this cell
# runnable on a fresh kernel.
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# NOTE(review): precision/recall/F1 with their default settings and the
# [:, 1] probability column below assume a binary target - confirm that
# y_train / y_test hold a two-class label rather than raw compensation.

# create model
# random_state fixes the bootstrap samples and feature subsets so results are
# reproducible between runs.
# NOTE(review): max_depth = 1 limits every tree to a single split - confirm
# this is intentional.
forest = RandomForestClassifier(n_estimators = 100, n_jobs = -1, max_depth = 1, random_state = 42)
forest.fit(X_train, y_train)

# create predictions
forest_preds = forest.predict(X_test)
# probability of the positive class (classes_[1]); used for ROC AUC below
forest_scores = forest.predict_proba(X_test)[:, 1]

# calculate model metrics
forest_precision = metrics.precision_score(y_test, forest_preds)
forest_recall = metrics.recall_score(y_test, forest_preds)
# f1_score matches 2PR/(P+R) and returns 0 instead of NaN when P + R == 0
forest_f1 = metrics.f1_score(y_test, forest_preds)

forest_accuracy = metrics.accuracy_score(y_test, forest_preds)
# ROC AUC needs continuous scores; hard 0/1 predictions collapse the curve to a
# single threshold and misstate the ranking quality of the model.
forest_auc = roc_auc_score(y_test, forest_scores)

# cross validation score
forest_cv = cross_val_score(forest, X_train, y_train, cv = 10).mean()

In [None]:
# gradient boosting
# `metrics`, `roc_auc_score` and `cross_val_score` are not imported by any cell
# shown earlier in the notebook, so they are imported here to keep this cell
# runnable on a fresh kernel.
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# NOTE(review): precision/recall/F1 with their default settings and the
# [:, 1] probability column below assume a binary target - confirm that
# y_train / y_test hold a two-class label rather than raw compensation.

# create model
# random_state fixes the randomness used while growing the boosted trees so
# results are reproducible between runs.
gradient = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, random_state = 42)
gradient.fit(X_train, y_train)

# create predictions
gradient_preds = gradient.predict(X_test)
# probability of the positive class (classes_[1]); used for ROC AUC below
gradient_scores = gradient.predict_proba(X_test)[:, 1]

# calculate model metrics
gradient_precision = metrics.precision_score(y_test, gradient_preds)
gradient_recall = metrics.recall_score(y_test, gradient_preds)
# f1_score matches 2PR/(P+R) and returns 0 instead of NaN when P + R == 0
gradient_f1 = metrics.f1_score(y_test, gradient_preds)

gradient_accuracy = metrics.accuracy_score(y_test, gradient_preds)
# ROC AUC needs continuous scores; hard 0/1 predictions collapse the curve to a
# single threshold and misstate the ranking quality of the model.
gradient_auc = roc_auc_score(y_test, gradient_scores)

# cross validation score
gradient_cv = cross_val_score(gradient, X_train, y_train, cv = 10).mean()