In [None]:
# imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import sklearn.metrics as smetrics

import matplotlib.pyplot as plt

import utilities.data_utils as util
import utilities.ML_utilils as ml
import utilities.ryan_utils as rutils

## Pre-processing of data

In [None]:
# Read the CSV used for the SVR experiments and pull out the two columns this
# notebook works with: 'description' (input text) and 'annual_salary' (target).
df = pd.read_csv('data/downsampled_df.csv')
X_col = df.loc[:, 'description'].to_list()
y_col = df.loc[:, 'annual_salary'].to_list()

In [None]:
# Cap the number of rows used, because training time grows quickly with sample
# count (2 min for 1000 samples, 12 min for 2000 samples).
SAMPLES = 2000

# Pre-process every description, then featurize only the first SAMPLES entries
# with both featurizers; .toarray() turns each featurizer result into an array.
# NOTE(review): the meaning of the 0.25 and 1 arguments is defined inside
# ml.featurize_tfIDF -- confirm there before tuning them.
X_processed = util.pre_process_descriptions(X_col)
X_bow_featurized = ml.featurize_bow(X_processed[:SAMPLES]).toarray()
X_tfidf_featurized = ml.featurize_tfIDF(X_processed[:SAMPLES], 0.25, 1).toarray()

# Targets are truncated to the same leading SAMPLES rows.
y_featurized = y_col[:SAMPLES]

In [None]:
# Split the data for input into our models.
TEST_SIZE = .2

# A single call splits all three arrays with the same shuffled indices, so the
# BOW rows, the TF-IDF rows and the targets are guaranteed to stay aligned.
# (Two separate calls only agreed because they shared random_state and length,
# and the second call silently overwrote y_train / y_test.)
(
    X_BOW_train, X_BOW_test,
    X_TFIDF_train, X_TFIDF_test,
    y_train, y_test,
) = train_test_split(
    X_bow_featurized,
    X_tfidf_featurized,
    y_featurized,
    test_size=TEST_SIZE,
    random_state=42,
)

In [None]:
# Candidate values of the SVR hyperparameter C; edit this list to experiment.
# The whole list is handed to rutils.train_and_predict_SVR in the training cell.
# NOTE(review): the original note also mentions tuning epsilon, but no epsilon
# value is defined in this cell.
C_VALUES = [
    1, 10, 100,
    250, 500, 750,
    1000, 2000, 3000, 4000,
]

## Train the models

In [None]:
# Call rutils.train_and_predict_SVR once per kernel ('linear', 'poly', 'rbf')
# for each feature set, passing the full C_VALUES list every time. Each call
# returns a pair that is unpacked into a *_model and a *_preds variable.
# All of these models take ~40+ minutes to train.

# --- BOW features ---
linear_BOW_model, linear_BOW_preds = rutils.train_and_predict_SVR(
    'linear', X_BOW_train, y_train, X_BOW_test, C_VALUES
)
poly_BOW_model, poly_BOW_preds = rutils.train_and_predict_SVR(
    'poly', X_BOW_train, y_train, X_BOW_test, C_VALUES
)
rbf_BOW_model, rbf_BOW_preds = rutils.train_and_predict_SVR(
    'rbf', X_BOW_train, y_train, X_BOW_test, C_VALUES
)

# --- TF-IDF features ---
linear_TFIDF_model, linear_TFIDF_preds = rutils.train_and_predict_SVR(
    'linear', X_TFIDF_train, y_train, X_TFIDF_test, C_VALUES
)
poly_TFIDF_model, poly_TFIDF_preds = rutils.train_and_predict_SVR(
    'poly', X_TFIDF_train, y_train, X_TFIDF_test, C_VALUES
)
rbf_TFIDF_model, rbf_TFIDF_preds = rutils.train_and_predict_SVR(
    'rbf', X_TFIDF_train, y_train, X_TFIDF_test, C_VALUES
)

## Graphing model performance

In [None]:
def graph_performance_by_error(title, all_preds, y_actual, C_values):
    """Plot mean absolute error against the SVR C parameter.

    The error values are obtained from ``util.get_evaluation_metric`` called
    with ``sklearn.metrics.mean_absolute_error`` and are drawn on the current
    pyplot figure as a blue line with circle markers.

    Args:
        title: Title shown above the plot.
        all_preds: Predictions forwarded to ``util.get_evaluation_metric``
            (presumably one set of predictions per entry of ``C_values`` --
            TODO confirm against ``rutils.train_and_predict_SVR``).
        y_actual: True target values the predictions are scored against.
        C_values: C parameter values plotted on the x-axis.

    Returns:
        None. The plot is produced as a side effect.
    """
    mae = util.get_evaluation_metric(smetrics.mean_absolute_error, all_preds, y_actual)
    plt.xlabel("SVR C-parameter Value")
    # MAE is the mean *absolute* error, matching the metric computed above.
    plt.ylabel("Mean Absolute Error in Dollars")
    plt.plot(C_values, mae, "-bo")
    plt.title(title)

In [None]:
# Accuracy plot (rutils.graph_SVR_performance_by_accuracy) for the linear-kernel
# predictions on the BOW features, across all C_VALUES.
TITLE_PREFIX = 'Performance of '
model_name = 'Linear BOW SVR model'
title = f'{TITLE_PREFIX}{model_name}'
rutils.graph_SVR_performance_by_accuracy(
    title, linear_BOW_preds, y_test, C_VALUES
)

In [None]:
# Mean-absolute-error plot for the linear BOW predictions.
# model_name is whatever the preceding accuracy cell set it to.
TITLE_PREFIX = 'Mean Absolute Error of '
graph_performance_by_error(
    f'{TITLE_PREFIX}{model_name}', linear_BOW_preds, y_test, C_VALUES
)

In [None]:
# Accuracy plot for the polynomial-kernel predictions on the BOW features.
TITLE_PREFIX = 'Performance of '
model_name = 'Poly BOW SVR model'
rutils.graph_SVR_performance_by_accuracy(
    f'{TITLE_PREFIX}{model_name}', poly_BOW_preds, y_test, C_VALUES
)

In [None]:
# Mean-absolute-error plot for the poly BOW predictions.
# model_name is whatever the preceding accuracy cell set it to.
TITLE_PREFIX = 'Mean Absolute Error of '
graph_performance_by_error(
    f'{TITLE_PREFIX}{model_name}', poly_BOW_preds, y_test, C_VALUES
)

In [None]:
# Accuracy plot for the RBF-kernel predictions on the BOW features.
TITLE_PREFIX = 'Performance of '
model_name = 'RBF BOW SVR model'
rutils.graph_SVR_performance_by_accuracy(
    f'{TITLE_PREFIX}{model_name}', rbf_BOW_preds, y_test, C_VALUES
)

In [None]:
# Mean-absolute-error plot for the RBF BOW predictions.
# model_name is whatever the preceding accuracy cell set it to.
TITLE_PREFIX = 'Mean Absolute Error of '
graph_performance_by_error(
    f'{TITLE_PREFIX}{model_name}', rbf_BOW_preds, y_test, C_VALUES
)

In [None]:
# Accuracy plot for the linear-kernel predictions on the TF-IDF features.
TITLE_PREFIX = 'Performance of '
model_name = 'Linear TF-IDF SVR model'
rutils.graph_SVR_performance_by_accuracy(
    f'{TITLE_PREFIX}{model_name}', linear_TFIDF_preds, y_test, C_VALUES
)

In [None]:
# Mean-absolute-error plot for the linear TF-IDF predictions.
# model_name is whatever the preceding accuracy cell set it to.
TITLE_PREFIX = 'Mean Absolute Error of '
graph_performance_by_error(
    f'{TITLE_PREFIX}{model_name}', linear_TFIDF_preds, y_test, C_VALUES
)

In [None]:
# Accuracy plot for the polynomial-kernel predictions on the TF-IDF features.
TITLE_PREFIX = 'Performance of '
model_name = 'Poly TF-IDF SVR model'
rutils.graph_SVR_performance_by_accuracy(
    f'{TITLE_PREFIX}{model_name}', poly_TFIDF_preds, y_test, C_VALUES
)

In [None]:
# Mean-absolute-error plot for the poly TF-IDF predictions.
# model_name is whatever the preceding accuracy cell set it to.
TITLE_PREFIX = 'Mean Absolute Error of '
graph_performance_by_error(
    f'{TITLE_PREFIX}{model_name}', poly_TFIDF_preds, y_test, C_VALUES
)

In [None]:
# Accuracy plot for the RBF-kernel predictions on the TF-IDF features.
# The prefix must be reset here: the preceding cell leaves TITLE_PREFIX set to
# 'Mean Absolute Error of ', which would mislabel this accuracy plot.
TITLE_PREFIX = 'Performance of '
model_name = 'RBF TF-IDF SVR model'
rutils.graph_SVR_performance_by_accuracy(TITLE_PREFIX+model_name, rbf_TFIDF_preds, y_test, C_VALUES)

In [None]:
# Mean-absolute-error plot for the RBF TF-IDF predictions.
# model_name is whatever the preceding accuracy cell set it to.
TITLE_PREFIX = 'Mean Absolute Error of '
graph_performance_by_error(
    f'{TITLE_PREFIX}{model_name}', rbf_TFIDF_preds, y_test, C_VALUES
)

In [None]:
# Plot bucketed accuracy for the linear BOW SVR at C = 500, the value that
# seems to yield the best results for the best model.
# The predictions are looked up by C value instead of a hard-coded position:
# index 3 of C_VALUES is 250, not 500, so a literal [3] would not select the
# C value named here, and it would go stale whenever C_VALUES is edited.
# NOTE(review): assumes linear_BOW_preds holds one prediction set per entry of
# C_VALUES, in the same order -- confirm against rutils.train_and_predict_SVR.
BEST_C = 500
best_model_predictions = linear_BOW_preds[C_VALUES.index(BEST_C)]
print(y_test)
util.plot_accuracy_in_buckets(y_test, best_model_predictions)