# Overview

This notebook trains 5 models using *article titles* as the feature: 1 regression model (Ridge Regression) and 4 classification models (Logistic Regression, Random Forest, AdaBoost, and SVM).

We first start with the preprocessing steps of loading the dataframes and transforming certain columns so they can be used directly.

Then, we proceed to training the models and interpreting their variable importances.

# Preprocessing

In [None]:
# initial imports
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import *
import plotly.express as px
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# import upsampled test data
# The upsampled test set is read from a previously saved CSV rather than being
# recomputed in this notebook, so the file must already exist at this path.
# NOTE(review): presumably this file is an export of upsample_minority(test_df)
# (see the commented-out export cell in the Logistic Regression section) — confirm.
test_df_upsampled = pd.read_csv('drive/MyDrive/test_df_upsampled.csv', index_col=False)

In [None]:
# Alternate way to upload data: read the CSV files from the working directory

# Each split is read and its 'Unnamed: 0' column is dropped in one chained step
# NOTE(review): 'Unnamed: 0' is presumably a row index saved by to_csv — confirm
train_df = pd.read_csv('train.csv', index_col=False).drop(columns=['Unnamed: 0'])
test_df = pd.read_csv('test.csv', index_col=False).drop(columns=['Unnamed: 0'])

In [None]:
# import train and test data from the 'drive/MyDrive' project folder,
# dropping the 'Unnamed: 0' column from each frame right after reading it
train_df = (
    pd.read_csv('drive/MyDrive/CIS520 Project/data set/train.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
)
test_df = (
    pd.read_csv('drive/MyDrive/CIS520 Project/data set/test.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Derive the target columns and parse the publish dates for both splits.
# The same three steps are applied to the train and the test frame, so they
# are written once and looped over instead of being duplicated per frame.
QUINTILE_BINS = [0, 0.2, 0.4, 0.6, 0.8, 1]
QUINTILE_LABELS = [1, 2, 3, 4, 5]

for split_df in (train_df, test_df):
  # get quintiles; pd.cut bins are right-closed by default, so without
  # include_lowest=True a percentile of exactly 0 would get a NaN quintile
  split_df['quintile'] = pd.cut(split_df['percentile'], QUINTILE_BINS,
                                labels = QUINTILE_LABELS, include_lowest = True)

  # Get top 25% vs bottom 75%
  split_df['top25pct'] = (split_df['percentile'] >= 0.75).astype(int)

  # convert publish date to time format; pd.to_datetime is vectorized and also
  # accepts values that are already datetimes, so re-running this cell does not
  # raise the way datetime.strptime does on an already-converted column
  split_df['published_date'] = pd.to_datetime(split_df['published_date'],
                                              format = '%Y-%m-%d %H:%M:%S')

In [None]:
# Returns the training and test data within the window
def get_window(train_df, test_df, date_start, months):
  """Restrict both frames to articles published inside one time window.

  The window is half-open: rows are kept when their 'published_date' is
  >= date_start and < date_start shifted forward by `months` calendar months.

  Args:
    train_df: training dataframe with a 'published_date' column.
    test_df: test dataframe with a 'published_date' column.
    date_start: first instant included in the window.
    months: window length in calendar months.

  Returns:
    (train_window, test_window): the matching rows of each input frame.
  """
  date_end = date_start + relativedelta(months = +months)

  def _rows_in_window(frame):
    # boolean mask over the publish dates, applied with .loc to keep all columns
    published = frame['published_date']
    inside = (published >= date_start) & (published < date_end)
    return frame.loc[inside, :]

  return _rows_in_window(train_df), _rows_in_window(test_df)

In [None]:
# upsample the top 25% of articles
def upsample_minority(df):
  """Balance the 'top25pct' classes of one dataframe by oversampling class 1.

  Rows with top25pct == 1 are resampled with replacement (fixed
  random_state = 42) until there are as many of them as there are rows with
  top25pct == 0. Rows with top25pct == 0 are kept unchanged.

  Args:
    df: dataframe containing a 'top25pct' column.

  Returns:
    A new dataframe: the top25pct == 0 rows followed by the resampled
    top25pct == 1 rows. The index is not reset by the concatenation.
  """
  majority_rows = df.loc[df['top25pct'] == 0, :]
  minority_rows = df.loc[df['top25pct'] == 1, :]

  # draw with replacement so the minority class reaches the majority size
  minority_resampled = resample(minority_rows,
                                replace = True,
                                n_samples = len(majority_rows),
                                random_state = 42)

  return pd.concat([majority_rows, minority_resampled])

**Count Vectorization of Words**

In [None]:
# Input: train, test dataframe
# Output: train_titles, test_titles vectorized matrix (#doc x #words)
def convert_text_to_vectors(train_data, test_data, max_words = 2000, use_tfidf = False):
  """Vectorize the 'title' column of the train and test frames.

  The vocabulary is learned from the training titles only and then applied to
  both frames, so both returned matrices share the same columns.

  Args:
    train_data: dataframe with a 'title' column; used to build the vocabulary.
    test_data: dataframe with a 'title' column; only transformed.
    max_words: passed as max_features to the vectorizer.
    use_tfidf: if truthy use TfidfVectorizer, otherwise CountVectorizer.

  Returns:
    (X_train, X_test, vectorizer): dense arrays for the two frames and the
    fitted vectorizer.
  """
  # pick the vectorizer class by truthiness rather than comparing with `== False`
  vectorizer_cls = TfidfVectorizer if use_tfidf else CountVectorizer
  vectorizer = vectorizer_cls(stop_words = 'english', max_features = max_words)

  # tokenize, build vocab and transform the training titles in a single pass
  X_train = vectorizer.fit_transform(train_data['title']).toarray()
  # the test titles are only transformed with the training vocabulary
  X_test = vectorizer.transform(test_data['title']).toarray()

  return X_train, X_test, vectorizer

In [None]:
# Upsample the minority class of the training data only. The test frame is not
# resampled in this cell (upsample_minority(test_df) is not called here);
# test_df_upsampled is the frame read from CSV at the top of the notebook.
train_df_upsampled = upsample_minority(train_df)

# Binary targets (top 25% or not) for the upsampled splits
y_train_upsampled = train_df_upsampled['top25pct']
y_test_upsampled = test_df_upsampled['top25pct']

# Vectorize words: raw counts, vocabulary learned from the upsampled training titles
X_train_upsampled, X_test_upsampled, vec = convert_text_to_vectors(
    train_df_upsampled, test_df_upsampled, max_words = 2000, use_tfidf = False)

# Article Title Modeling

## Ridge Regression

Regression model on the Percentiles

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error

In [None]:
# Create and fit model

def train_ridge(X_train, X_test, y_train, y_test, alpha = 1):
  """Fit a Ridge regressor and print its RMSE / MAE on the training and test data.

  Args:
    X_train, y_train: features and targets the model is fit on.
    X_test, y_test: features and targets used for the test error report.
    alpha: passed to Ridge as its alpha argument.

  Returns:
    (y_train_pred, y_test_pred, reg): predictions for X_train and X_test and
    the fitted Ridge model.
  """
  def _print_errors(template, y_true, y_pred):
    # RMSE is the square root of the mean squared error
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    print(template.format(rmse , mae))

  reg = Ridge(alpha = alpha)
  reg.fit(X_train, y_train)

  # In-sample predictions and errors
  y_train_pred = reg.predict(X_train)
  _print_errors('Training RMSE: {}, MAE: {}', y_train, y_train_pred)

  # Predict on the test data (the model is not refit) and report test errors
  y_test_pred = reg.predict(X_test)
  _print_errors('Test RMSE: {}, MAE: {}', y_test, y_test_pred)

  return y_train_pred, y_test_pred, reg

In [None]:
# Create and fit model

def train_ridge_upsampled(X_train, X_test, y_train, y_test, X_test_upsampled, alpha = 1):
  """Fit a Ridge regressor and predict on a separately supplied test matrix.

  Only the training RMSE / MAE are printed; no test error is computed.

  Args:
    X_train, y_train: features and targets the model is fit on.
    X_test, y_test: accepted but not used by this function.
    X_test_upsampled: feature matrix the returned test predictions are made on.
    alpha: passed to Ridge as its alpha argument.

  Returns:
    (y_train_pred, y_test_pred, reg): predictions for X_train, predictions for
    X_test_upsampled, and the fitted Ridge model.
  """
  reg = Ridge(alpha = alpha)
  reg.fit(X_train, y_train)

  # In-sample predictions and their errors
  y_train_pred = reg.predict(X_train)
  train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
  train_mae = mean_absolute_error(y_train, y_train_pred)
  print('Training RMSE: {}, MAE: {}'.format(train_rmse , train_mae))

  # Predict on the upsampled test matrix (the model is not refit)
  y_test_pred = reg.predict(X_test_upsampled)

  return y_train_pred, y_test_pred, reg

In [None]:
# Variable Importance

def get_variable_importance(reg, vectorizer, X_train_titles):
  """Tabulate each vocabulary word with its usage counts and model coefficient.

  Args:
    reg: fitted model exposing one coefficient per word in `coef_`.
    vectorizer: fitted vectorizer exposing its vocabulary through
      get_feature_names_out() or the older get_feature_names().
    X_train_titles: 2-D array (rows = titles, columns = words) aligned with
      the vectorizer vocabulary.

  Returns:
    DataFrame with columns 'Word', 'Frequency' (column sum), 'Articles'
    (number of rows with a value > 0) and 'Coef', sorted by 'Coef' in
    descending order with a fresh 0..n-1 index.
  """
  # get_feature_names() was replaced by get_feature_names_out() in newer
  # scikit-learn releases; prefer the new accessor and fall back to the old one
  if hasattr(vectorizer, 'get_feature_names_out'):
    vectorizer_words = list(vectorizer.get_feature_names_out())
  else:
    vectorizer_words = list(vectorizer.get_feature_names())

  reg_coefs = reg.coef_

  # Word Frequency
  word_freq = np.sum(X_train_titles, axis = 0)
  # Number of articles the word appeared in
  word_num_articles = np.sum((X_train_titles > 0).astype(int), axis = 0)

  # Store the words in a dataframe, largest coefficient first
  word_coefs_df = pd.DataFrame({
      'Word': vectorizer_words,
      'Frequency': word_freq,
      'Articles': word_num_articles,
      'Coef': reg_coefs,
  }).sort_values('Coef', ascending = False).reset_index(drop = True)

  return word_coefs_df

In [None]:
# Run Model on All

# Vectorize words (vocabulary learned from the non-upsampled training titles)
X_train, X_test, vec = convert_text_to_vectors(train_df,
                                               test_df, max_words = 2000,
                                               use_tfidf = False)
y_percentiles_train = train_df['percentile']
y_percentiles_test = test_df['percentile']

# X_test_upsampled was built by a different vectorizer (fit on train_df_upsampled),
# whose vocabulary / column order need not match the one fit on train_df above.
# The ridge model must see the upsampled test titles encoded with the SAME
# vectorizer it was trained with, so they are re-encoded here under a separate
# name (X_test_upsampled itself is still needed by the classifiers below).
X_test_upsampled_ridge = vec.transform(test_df_upsampled['title']).toarray()

# Train ridge regression and predict on the upsampled test titles
y_train_pred, y_test_pred, reg = train_ridge_upsampled(X_train, X_test, y_percentiles_train, y_percentiles_test, X_test_upsampled_ridge, alpha = 20)

# Get the variable importance coefs
word_coefs_df = get_variable_importance(reg, vec, X_train)

# Only choose words appearing more than 50 times
word_coefs_df_top = word_coefs_df.loc[word_coefs_df['Frequency'] > 50, :]

word_coefs_df_top

Training RMSE: 0.22029882222646271, MAE: 0.18178340763774342


Unnamed: 0,Word,Frequency,Articles,Coef
0,fling,101,98,0.180162
1,admissions,92,91,0.169412
7,sophomore,51,51,0.112266
17,college,387,385,0.102133
18,gutmann,119,119,0.101309
...,...,...,...,...
1994,tennis,143,141,-0.200349
1995,gymnastics,102,101,-0.207571
1996,wrestling,159,159,-0.213726
1998,volleyball,185,185,-0.260270


In [None]:
# Save the predictions
# y_test_pred is reassigned by later cells, so the ridge predictions for the
# upsampled test titles are kept under their own name here.
ridge_predictions = y_test_pred

In [None]:
# Refit the same ridge model (alpha = 20) through train_ridge, which also prints
# the RMSE / MAE on the non-upsampled test data (X_test / y_percentiles_test).
# This overwrites y_train_pred, y_test_pred and reg from the previous ridge cell.
y_train_pred, y_test_pred, reg = train_ridge(X_train, X_test, y_percentiles_train, y_percentiles_test, alpha = 20)

Training RMSE: 0.22029882222646271, MAE: 0.18178340763774342
Test RMSE: 0.23228928430865867, MAE: 0.19263014053878394


**Rolling Window Predictions**

In [None]:
# Get a list of start dates (first day of every month in the range).
# to_pydatetime() keeps the naive wall-clock value of each date; converting via
# a POSIX timestamp and datetime.fromtimestamp() would shift every start date
# by the machine's UTC offset whenever the kernel does not run in UTC.
dates = list(pd.date_range('2013-01-01','2020-05-02', freq='MS').to_pydatetime())

# One coefficient table per window is collected in a list and concatenated once
# at the end; concatenating inside the loop re-copies all earlier rows each time.
word_coefs_frames = []
for start_date in dates:
  print(start_date)
  # Segment by window (3 months starting at start_date)
  train_window, test_window = get_window(train_df, test_df, start_date, 3)
  # Vectorize words
  X_train_titles, X_test_titles, vec = convert_text_to_vectors(train_window,
                                                              test_window, max_words = 2000,
                                                              use_tfidf = False)

  y_percentiles_train = train_window['percentile']
  y_percentiles_test = test_window['percentile']
  # Train ridge regression
  y_train_pred, y_test_pred, reg = train_ridge(X_train_titles, X_test_titles, y_percentiles_train, y_percentiles_test, alpha = 20)

  # Get the variable importance coefs
  word_coefs_df = get_variable_importance(reg, vec, X_train_titles)

  # Tag every row with the start of the window it was estimated on
  word_coefs_df['Month'] = start_date

  word_coefs_frames.append(word_coefs_df)

word_coefs_df_all = pd.concat(word_coefs_frames, axis = 0)

In [None]:
# The 30 words with the highest total frequency summed over all windows
total_freq_by_word = word_coefs_df_all.groupby('Word')['Frequency'].sum()
top_words = total_freq_by_word.sort_values(ascending = False).iloc[0:30].index.tolist()

# Keep only the coefficient rows that belong to those words
is_top_word = word_coefs_df_all['Word'].isin(top_words)
coefs_all = word_coefs_df_all.loc[is_top_word, :]

# Wide view: one row per word, one column per window start
coefs_all_pivot = coefs_all.pivot(index = 'Word', columns = 'Month', values = 'Coef')

# One line per word showing its ridge coefficient over the window start dates
fig = px.line(coefs_all, x = "Month", y = "Coef", color = "Word")
fig.show()

## Logistic Regression

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error, recall_score, precision_score, f1_score, auc
from sklearn.linear_model import LogisticRegression

In [None]:
# return all metrics for determining quality of model
def get_classification_metrics(actual, pred):
  """Print the confusion matrix, then accuracy, precision, recall and F1.

  Args:
    actual: true class labels.
    pred: predicted class labels.

  Returns:
    None; the metrics are only printed.
  """
  print(confusion_matrix(actual, pred))

  accuracy = accuracy_score(actual, pred)
  precision = precision_score(actual, pred)
  recall = recall_score(actual, pred)
  f1 = f1_score(actual, pred)

  summary = 'Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}'.format(
      accuracy, precision, recall, f1)
  print(summary)

In [None]:
# Baseline logistic regression fit on the upsampled training data. No C is
# given, so the library default is used; clf is overwritten by the tuned model
# a few cells below.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train_upsampled, y_train_upsampled)

In [None]:
# test_df_upsampled = test_df_upsampled.to_csv('test_df_upsampled.csv')

In [None]:
# Hyperparameter tuning
# The grid search is run inline because hyp_tuning and the GridSearchCV import
# only appear further down (Random Forest section), so calling hyp_tuning here
# fails on a fresh kernel. It is run on the upsampled matrices because y_train /
# y_test are not assigned anywhere above this cell and X_train / X_test hold
# the non-upsampled ridge features at this point.
from sklearn.model_selection import GridSearchCV

param_grid_logistic = {'C': [0.001, 0.01, 0.1, 1, 10]}

logistic_search = GridSearchCV(LogisticRegression(max_iter = 1000), param_grid_logistic,
                               cv = 5, scoring = 'accuracy')
print(logistic_search.estimator)
# NOTE(review): the training rows were upsampled with replacement, so copies of
# the same article can land in both the fitting and the validation folds; the
# cross-validation score may therefore be optimistic compared with the test score.
logistic_search.fit(X_train_upsampled, y_train_upsampled)
print('Best Parameters: {}'.format(logistic_search.best_params_))
print('Best Cross Validation Score: {}'.format(logistic_search.best_score_))
# Score on the test set
print('Test Set Score: {}'.format(logistic_search.score(X_test_upsampled, y_test_upsampled)))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Best Parameters: {'C': 10}
Best Cross Validation Score: 0.7372828396212049
Test Set Score: 0.6647435897435897


In [None]:
# run logistic regression with C = 10 on the upsampled data and predict values
clf = LogisticRegression(C=10, random_state=0, max_iter=1000)
clf.fit(X_train_upsampled, y_train_upsampled)
y_train_pred = clf.predict(X_train_upsampled)
y_test_pred = clf.predict(X_test_upsampled)

# Classification metrics for the training data first, then for the test data
for split_name, y_true, y_hat in (('Training', y_train_upsampled, y_train_pred),
                                  ('Test', y_test_upsampled, y_test_pred)):
  print(split_name)
  get_classification_metrics(y_true, y_hat)

Test
[[2306  814]
 [1278 1842]]
Accuracy: 0.6647435897435897, Precision: 0.6935240963855421, Recall: 0.5903846153846154, F1 Score: 0.6378116343490304


In [None]:
# Save predictions
# (logistic regression class predictions for X_test_upsampled; stored under a
#  dedicated name because y_test_pred is reused by the following models)
log_predictions = y_test_pred

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Get window, Upsample minority, Convert text to vectors already done
# Fit on the upsampled matrices built in the preprocessing section: y_train is
# not assigned anywhere above, and X_train holds the non-upsampled features
# that were re-vectorized for the ridge model.
clf = RandomForestClassifier(n_estimators = 100, max_depth = 20, max_features = 45, random_state=20)
clf.fit(X_train_upsampled, y_train_upsampled)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features=45,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=20, verbose=0,
                       warm_start=False)

In [None]:
# Evaluate on the upsampled training / test data (y_train and y_test are not
# assigned anywhere above, so the original names fail on a fresh kernel)
y_train_pred = clf.predict(X_train_upsampled)
# Confusion matrix and metrics on the training data
print('Training')
get_classification_metrics(y_train_upsampled, y_train_pred)

# Predict on the test data (the model is not refit)
y_test_pred = clf.predict(X_test_upsampled)
print('Test')
get_classification_metrics(y_test_upsampled, y_test_pred)

Training
[[ 5624  6918]
 [  748 11794]]
Accuracy: 0.6943868601498964, Precision: 0.6302907225309962, Recall: 0.9403603890926487, F1 Score: 0.7547193959173227
Test
[[1332 1788]
 [ 374 2746]]
Accuracy: 0.653525641025641, Precision: 0.6056462284958094, Recall: 0.8801282051282051, F1 Score: 0.717533315913248


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# define hyperparameter grid
# Only max_depth is varied; max_features and n_estimators are single-value
# lists, so the search covers three candidate settings.
param_grid_rf = [
    {'max_depth': [40, 45, 50], 'max_features': [50], 'n_estimators': [25]}
]

In [None]:
# Function taken from CIS520 HW

def hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test):
    """Grid-search each model over its parameter grid and print the results.

    For every model, a GridSearchCV with 5 folds and accuracy scoring is fit on
    the training data; the model, the best parameters, the best
    cross-validation score and the search's score on the test data are printed.

    Args:
        model_list: estimators to tune.
        param_grid_list: parameter grids; entry i belongs to model_list[i].
        X_train, y_train: data the grid search is fit on.
        X_test, y_test: data used for the final printed score.

    Returns:
        None; results are only printed.
    """
    for position, estimator in enumerate(model_list):
        print(estimator)
        search = GridSearchCV(estimator, param_grid_list[position], cv = 5, scoring = 'accuracy')
        search.fit(X_train, y_train)
        print('Best Parameters: {}'.format(search.best_params_))
        print('Best Cross Validation Score: {}'.format(search.best_score_))
        # Score on the test set
        print('Test Set Score: {}'.format(search.score(X_test, y_test)))

In [None]:
# tune random forest model
# random_state is fixed so the selected parameters are reproducible between runs
model_list = [RandomForestClassifier(random_state = 20)]
param_grid_list = [param_grid_rf]

# Tune on the upsampled matrices: y_train / y_test are not assigned anywhere
# above, and X_train / X_test hold the non-upsampled ridge features.
hyp_tuning(model_list, param_grid_list, X_train_upsampled, y_train_upsampled, X_test_upsampled, y_test_upsampled)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Best Parameters: {'max_depth': 45, 'max_features': 50, 'n_estimators': 25}
Best Cross Validation Score: 0.7137604075966276
Test Set Score: 0.6600961538461538


In [None]:
# initialize, fit, predict, and get metrics from random forest classifier
# random_state is fixed (same seed as the earlier forest) because these
# predictions are stored and exported at the end of the notebook; without a
# seed they change on every run.
clf = RandomForestClassifier(max_depth=45, max_features=50, n_estimators=25, random_state=20)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test_upsampled)
get_classification_metrics(y_test_upsampled, y_test_pred)

[[1489 1631]
 [ 500 2620]]
Accuracy: 0.6584935897435897, Precision: 0.6163255704540108, Recall: 0.8397435897435898, F1 Score: 0.7108940442273777


In [None]:
# keep the random forest predictions for X_test_upsampled under their own name;
# they are added to the results dataframe in the Supervised Dataset section
rf_predictions = y_test_pred

## Ada Boost

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

learning_rate = 0.1
max_depth = 200
random_state = 20

# initialize, fit, predict, and get metrics from Adaboost
base_estimator = DecisionTreeClassifier(max_depth=max_depth)
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional parameter is the weak learner under either name.
clf = AdaBoostClassifier(base_estimator, n_estimators=5, learning_rate=learning_rate, random_state=random_state)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test_upsampled)
get_classification_metrics(y_test_upsampled, y_test_pred)

[[2503  617]
 [1799 1321]]
Accuracy: 0.6128205128205129, Precision: 0.6816305469556243, Recall: 0.4233974358974359, F1 Score: 0.5223408461842625


In [None]:
# keep the AdaBoost predictions for X_test_upsampled under their own name;
# they are added to the results dataframe in the Supervised Dataset section
ada_predictions = y_test_pred

## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# tune hyperparameters of SVM: fit one SVC per kernel and print its accuracy
list_kernel_type = ['linear', 'poly', 'rbf']
random_state = 20

objs_KSVM = [SVC(kernel=kernel_type, random_state=random_state) for kernel_type in list_kernel_type]

# Use the upsampled matrices: y_train / y_test are not assigned anywhere above,
# and X_train / X_test hold the non-upsampled ridge features.
# NOTE(review): the kernels are compared on the test set itself, so the score of
# the chosen kernel is not an unbiased estimate — consider a validation split.
for model in objs_KSVM:
  model.fit(X_train_upsampled, y_train_upsampled)
  print(model.score(X_test_upsampled, y_test_upsampled))

0.6689102564102564
0.5892628205128205
0.6567307692307692


In [None]:
# Final SVM: linear kernel, fit on the upsampled training data and used to
# predict the upsampled test data (metrics are printed in the next cell)
clf = SVC(kernel='linear', random_state=20).fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test_upsampled)

In [None]:
# Confusion matrix and accuracy / precision / recall / F1 of the linear SVM
# predictions on the upsampled test data
get_classification_metrics(y_test_upsampled, y_test_pred)

[[2320  800]
 [1266 1854]]
Accuracy: 0.6689102564102564, Precision: 0.6985681989449887, Recall: 0.5942307692307692, F1 Score: 0.6421891236577763


In [None]:
# keep the SVM predictions for X_test_upsampled under their own name;
# they are added to the results dataframe in the Supervised Dataset section
svm_predictions = y_test_pred

# Supervised Dataset

In [None]:
# initialize dataframe of supervised results with ridge predictions
# (one row per prediction; ridge_predictions holds the ridge output for the
#  upsampled test titles, i.e. predicted percentiles rather than class labels)
supervised_df = pd.DataFrame(ridge_predictions, columns = ['Ridge_Predictions'])

In [None]:
# add logistic regression predictions
# (class predictions made on X_test_upsampled by the C = 10 logistic model)
supervised_df['log_predictions'] = log_predictions

In [None]:
# add random forest predictions
# (class predictions made on X_test_upsampled by the final random forest)
supervised_df['rf_predictions'] = rf_predictions

In [None]:
# add adaboost predictions
# (class predictions made on X_test_upsampled by the AdaBoost model)
supervised_df['ada_predictions'] = ada_predictions

In [None]:
# add SVM predictions
# (class predictions made on X_test_upsampled by the linear-kernel SVM)
supervised_df['svm_predictions'] = svm_predictions

In [None]:
# preview the first rows of the combined predictions table
supervised_df.head()

Unnamed: 0,Ridge_Predictions,log_predictions,rf_predictions,ada_predictions,svm_predictions
0,0.529877,0,0,0,0
1,0.647678,0,1,1,0
2,0.621598,0,0,0,0
3,0.544618,1,1,1,1
4,0.420219,0,0,1,0


In [None]:
# export dataframe for use in ensemble model!
# to_csv is called without index=False, so the row index is written to the
# file as an extra leading column with an empty header.
supervised_df.to_csv('supervised_BOW_df.csv')