The following utilizes LDA, NMF topic probabilities as features to perform supervised learning on the 75/25 classification. It also includes the final ensemble where predictions from all models are imported to create the final ensemble

# Preprocessing

In [253]:
#consolidate all imports here
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import *
import plotly.express as px
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Import the relevant data

In [254]:
# import upsampled test data
test_df_upsampled = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/test_df_upsampled.csv', index_col=False)

In [255]:
# # import train data

train_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/train.csv', index_col=False)
train_df = train_df.drop(columns=['Unnamed: 0'])

# import test data
test_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/test.csv', index_col=False)
test_df = test_df.drop(columns=['Unnamed: 0'])

The percentiles have already been caclulated in a different notebook. We now use that to make our classifications. 

In [256]:
# get quintiles for train data
# these can be ignored as learning from the staff we realized these were not beneficial for them
train_df['quintile'] = pd.cut(train_df['percentile'], [0, 0.2, 0.4, 0.6, 0.8, 1], labels = [1,2,3,4,5])
# get quintiles for the test data
test_df['quintile'] = pd.cut(test_df['percentile'], [0, 0.2, 0.4, 0.6, 0.8, 1], labels = [1,2,3,4,5])

# Get top 25% vs bottom 75%
train_df['top25pct'] = (train_df['percentile'] >= 0.75).astype(int)
test_df['top25pct'] = (test_df['percentile'] >= 0.75).astype(int)


 Function taken from CIS520 HW. It helps print out grid search results 


In [None]:
# Function taken from CIS520 HW. It helps print out grid search results 

def hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test):
    
    for i in range(len(model_list)):
        print(model_list[i])
        grid_search = GridSearchCV(model_list[i], param_grid_list[i], cv = 5, scoring = 'accuracy')
        grid_search.fit(X_train, y_train)
        print('Best Parameters: {}'.format(grid_search.best_params_))
        best_cross_val_score = grid_search.best_score_
        print('Best Cross Validation Score: {}'.format(best_cross_val_score))
        # Score on the test set
        test_score = grid_search.score(X_test, y_test)
        print('Test Set Score: {}'.format(test_score))

So a problem with our classification is that a model that simply predicts 0 for each article would achieve 75% accuracy. So in order to avoid this problem we will upsample the articles part of the 25% so that both classes are equally represented

In [258]:
def upsample_minority(df):

  # Upsample minority class in both the training and test data
  df_majority = df.loc[df['top25pct'] == 0, :]
  df_minority = df.loc[df['top25pct'] == 1, :]
  df_minority_upsampled = resample(df_minority, replace = True, n_samples = len(df_majority), random_state = 42)

  # Combine together to get the upsampled training data
  df = pd.concat([df_majority, df_minority_upsampled])

  return df

In [140]:
test_df_upsampled.shape

(6240, 24)

# Supervised learning using LDA

## Logistic Regression

In [245]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error, recall_score, precision_score, f1_score, auc
from sklearn.linear_model import LogisticRegression

Calculates the different metrics helper function

In [259]:
def get_classification_metrics(actual, pred):
  print(confusion_matrix(actual, pred))
  print('Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}'.format(
      accuracy_score(actual, pred),
      precision_score(actual, pred),
      recall_score(actual, pred),
      f1_score(actual, pred)))

Upsample and then get our X_trains and y_trains and tests etc. To be used throughout this LDA models

In [260]:
# Upsample minority
train_df_upsampled = upsample_minority(train_df)
# test_df_upsampled = upsample_minority(test_df)

# Vectorize words
X_train_upsampled = train_df_upsampled[['prob_LDAtopic_1','prob_LDAtopic_2','prob_LDAtopic_3','prob_LDAtopic_4','prob_LDAtopic_5','prob_LDAtopic_6','prob_LDAtopic_7']]
X_test_upsampled = test_df_upsampled[['prob_LDAtopic_1','prob_LDAtopic_2','prob_LDAtopic_3','prob_LDAtopic_4','prob_LDAtopic_5','prob_LDAtopic_6','prob_LDAtopic_7']]
X_train = X_train_upsampled
X_test = X_test_upsampled

y_train_upsampled = train_df_upsampled['top25pct']
y_test_upsampled = test_df_upsampled['top25pct']
y_train = y_train_upsampled
y_test = y_test_upsampled

Grid search with the help of earlier methods on the value of C for logistic regression

In [146]:
# Hyperparameter tuning
param_grid_logistic = {'C': [0.001, 0.01, 0.1, 1, 10]}

model_list = [LogisticRegression(max_iter = 1000)]
param_grid_list = [param_grid_logistic]

hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Best Parameters: {'C': 0.01}
Best Cross Validation Score: 0.6551980125627095
Test Set Score: 0.6607371794871795


Use these values in Regression

In [261]:
clf = LogisticRegression(C=10, random_state=0, max_iter=1000)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)


Analyze the metrics

In [262]:
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[1799 1321]
 [ 793 2327]]
Accuracy: 0.6612179487179487, Precision: 0.6378837719298246, Recall: 0.7458333333333333, F1 Score: 0.6876477541371158


Then store the predictions to be used later in the ensemble

In [148]:
# Save predictions
log_predictions = y_test_pred

## Random Forest

In [149]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
from sklearn.model_selection import GridSearchCV

In [153]:
param_grid_rf = [
    {'max_depth': [3, 5, 7], 'max_features': [5, 7], 'n_estimators': [25]}
]

In [154]:
# Function taken from CIS520 HW

def hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test):
    
    for i in range(len(model_list)):
        print(model_list[i])
        grid_search = GridSearchCV(model_list[i], param_grid_list[i], cv = 5, scoring = 'accuracy')
        grid_search.fit(X_train, y_train)
        print('Best Parameters: {}'.format(grid_search.best_params_))
        best_cross_val_score = grid_search.best_score_
        print('Best Cross Validation Score: {}'.format(best_cross_val_score))
        # Score on the test set
        test_score = grid_search.score(X_test, y_test)
        print('Test Set Score: {}'.format(test_score))

In [155]:
model_list = [RandomForestClassifier()]
param_grid_list = [param_grid_rf]

hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Best Parameters: {'max_depth': 7, 'max_features': 5, 'n_estimators': 25}
Best Cross Validation Score: 0.704074400626387
Test Set Score: 0.691025641025641


In [263]:
clf = RandomForestClassifier(max_depth=7, max_features=5, n_estimators=25)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)

In [264]:
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[1863 1257]
 [ 675 2445]]
Accuracy: 0.6903846153846154, Precision: 0.660453808752026, Recall: 0.7836538461538461, F1 Score: 0.7167985927880388


In [157]:
rf_predictions = y_test_pred

## Ada Boost

In [265]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

learning_rate = 0.1
max_depth = 5
random_state = 20

base_estimator = DecisionTreeClassifier(max_depth=max_depth)
clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=5, learning_rate=learning_rate, random_state=random_state)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)
clf.score(X_test, y_test)

0.6858974358974359

In [266]:
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[1835 1285]
 [ 675 2445]]
Accuracy: 0.6858974358974359, Precision: 0.6554959785522788, Recall: 0.7836538461538461, F1 Score: 0.7138686131386862


In [159]:
ada_predictions = y_test_pred

## SVM

In [161]:
from sklearn.svm import SVC

In [162]:
list_kernel_type = ['linear', 'poly', 'rbf']
random_state = 20

objs_KSVM = [SVC(kernel=kernel_type, random_state=random_state) for kernel_type in list_kernel_type]

for model in objs_KSVM:
  model.fit(X_train, y_train)
  print(model.score(X_test, y_test))

0.6530448717948718
0.6682692307692307
0.6716346153846153


In [267]:
clf = SVC(kernel='rbf', random_state=random_state)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[1671 1449]
 [ 600 2520]]
Accuracy: 0.6716346153846153, Precision: 0.6349206349206349, Recall: 0.8076923076923077, F1 Score: 0.7109606432501058


In [164]:
svm_predictions = y_test_pred

#Supervised Dataset

In [165]:
supervised_lda_df = pd.DataFrame(log_predictions, columns = ['log_predictions_lda'])

In [166]:
supervised_lda_df['rf_predictions_lda'] = rf_predictions

In [167]:
supervised_lda_df['ada_predictions_lda'] = ada_predictions

In [168]:
supervised_lda_df['svm_predictions_lda'] = svm_predictions

# Supervised Learning using NMF

In [268]:
# import upsampled test data
test_df_upsampled = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/NMF_test.csv', index_col=False)
train_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/NMF_train.csv', index_col=False)

In [269]:

# Get top 25% vs bottom 75%
train_df['top25pct'] = (train_df['percentile'] >= 0.75).astype(int)


## Logistic Regression

In [171]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, mean_absolute_error, recall_score, precision_score, f1_score, auc
from sklearn.linear_model import LogisticRegression

In [172]:
def get_classification_metrics(actual, pred):
  print(confusion_matrix(actual, pred))
  print('Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}'.format(
      accuracy_score(actual, pred),
      precision_score(actual, pred),
      recall_score(actual, pred),
      f1_score(actual, pred)))

In [270]:
# Upsample minority
train_df_upsampled = upsample_minority(train_df)
# test_df_upsampled = upsample_minority(test_df)

# Vectorize words
X_train_upsampled = train_df_upsampled[['prob_NMFtopic_1','prob_NMFtopic_2','prob_NMFtopic_3','prob_NMFtopic_4','prob_NMFtopic_5','prob_NMFtopic_6','prob_NMFtopic_7','prob_NMFtopic_8','prob_NMFtopic_9','prob_NMFtopic_10','prob_NMFtopic_11','prob_NMFtopic_12','prob_NMFtopic_13','prob_NMFtopic_14','prob_NMFtopic_15']]
X_test_upsampled = test_df_upsampled[['prob_NMFtopic_1','prob_NMFtopic_2','prob_NMFtopic_3','prob_NMFtopic_4','prob_NMFtopic_5','prob_NMFtopic_6','prob_NMFtopic_7','prob_NMFtopic_8','prob_NMFtopic_9','prob_NMFtopic_10','prob_NMFtopic_11','prob_NMFtopic_12','prob_NMFtopic_13','prob_NMFtopic_14','prob_NMFtopic_15']]
X_train = X_train_upsampled
X_test = X_test_upsampled

y_train_upsampled = train_df_upsampled['top25pct']
y_test_upsampled = test_df_upsampled['top25pct']
y_train = y_train_upsampled
y_test = y_test_upsampled

In [176]:
# Hyperparameter tuning
param_grid_logistic = {'C': [0.001, 0.01, 0.1, 1, 10]}

model_list = [LogisticRegression(max_iter = 1000)]
param_grid_list = [param_grid_logistic]

hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Best Parameters: {'C': 0.001}
Best Cross Validation Score: 0.6418826071103065
Test Set Score: 0.6426282051282052


In [271]:
clf = LogisticRegression(C=0.001, random_state=0, max_iter=1000)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)


In [272]:
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[1317 1803]
 [ 427 2693]]
Accuracy: 0.6426282051282052, Precision: 0.5989768683274022, Recall: 0.8631410256410257, F1 Score: 0.7071953781512604


In [178]:
# Save predictions
log_predictions = y_test_pred

## Random Forest

In [179]:
from sklearn.ensemble import RandomForestClassifier

In [181]:
y_train_pred = clf.predict(X_train)
# Confusion Matrix
print('Training')
get_classification_metrics(y_train, y_train_pred)

# Fit on test data
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Training
[[12440   102]
 [   12 12530]]
Accuracy: 0.9954552702918195, Precision: 0.9919252691576947, Recall: 0.9990432147982777, F1 Score: 0.9954715182330977
Test
[[2637  483]
 [1743 1377]]
Accuracy: 0.6432692307692308, Precision: 0.7403225806451613, Recall: 0.44134615384615383, F1 Score: 0.5530120481927712


In [182]:
from sklearn.model_selection import GridSearchCV

In [183]:
param_grid_rf = [
    {'max_depth': [ 5,7, 10, 13, 15], 'max_features': [7, 10, 15], 'n_estimators': [ 25, 50]}
]

In [185]:
model_list = [RandomForestClassifier()]
param_grid_list = [param_grid_rf]

hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Best Parameters: {'max_depth': 15, 'max_features': 15, 'n_estimators': 50}
Best Cross Validation Score: 0.8343168474395986
Test Set Score: 0.6533653846153846


In [273]:
clf = RandomForestClassifier(max_depth=15, max_features=10, n_estimators=50)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[2420  700]
 [1416 1704]]
Accuracy: 0.6608974358974359, Precision: 0.7088186356073212, Recall: 0.5461538461538461, F1 Score: 0.6169442433019551


In [187]:
rf_predictions = y_test_pred

## Ada Boost

In [274]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

learning_rate = 0.1
max_depth = 4
random_state = 20

base_estimator = DecisionTreeClassifier(max_depth=max_depth)
clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=5, learning_rate=learning_rate, random_state=random_state)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)
clf.score(X_test, y_test)

0.6608974358974359

In [189]:
ada_predictions = y_test_pred

In [275]:
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[1646 1474]
 [ 642 2478]]
Accuracy: 0.6608974358974359, Precision: 0.6270242914979757, Recall: 0.7942307692307692, F1 Score: 0.7007918552036199


## SVM

In [191]:
from sklearn.svm import SVC

In [192]:
list_kernel_type = ['linear', 'poly', 'rbf']
random_state = 20

objs_KSVM = [SVC(kernel=kernel_type, random_state=random_state) for kernel_type in list_kernel_type]

for model in objs_KSVM:
  model.fit(X_train, y_train)
  print(model.score(X_test, y_test))

0.6435897435897436
0.6572115384615385
0.6732371794871795


In [276]:
clf = SVC(kernel='rbf', random_state=random_state)
clf.fit(X_train_upsampled, y_train_upsampled)
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[1667 1453]
 [ 586 2534]]
Accuracy: 0.6732371794871795, Precision: 0.635565588161525, Recall: 0.8121794871794872, F1 Score: 0.7130997607992121


In [194]:
svm_predictions = y_test_pred

#Supervised Dataset

In [195]:
supervised_nmf_df = pd.DataFrame(log_predictions, columns = ['log_predictions_nmf'])

In [196]:
supervised_nmf_df['rf_predictions_nmf'] = rf_predictions

In [197]:
supervised_nmf_df['ada_predictions_nmf'] = ada_predictions

In [198]:
supervised_nmf_df['svm_predictions_nmf'] = svm_predictions

# Gather All Predictions

In [206]:
supervised_BOW_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/ensemble/supervised_BOW_df.csv', index_col=False)
supervised_BOW_df

Unnamed: 0.1,Unnamed: 0,Ridge_Predictions,log_predictions,rf_predictions,ada_predictions,svm_predictions
0,0,0.529877,0,0,0,0
1,1,0.647678,0,1,1,0
2,2,0.621598,0,0,0,0
3,3,0.544618,1,1,1,1
4,4,0.420219,0,0,1,0
...,...,...,...,...,...,...
6235,6235,0.538141,0,1,0,0
6236,6236,0.574122,1,1,0,1
6237,6237,0.701051,0,1,1,0
6238,6238,0.489858,0,1,0,0


In [200]:
bert_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/bert_preds.csv', index_col=False)
bert_df

Unnamed: 0,id,bert_pred
0,190708,0.000002
1,190727,0.995266
2,190777,0.000002
3,190796,0.000003
4,190724,0.000002
...,...,...
6235,132366,0.000002
6236,187298,0.999888
6237,189504,0.000028
6238,139327,0.000002


In [202]:
deep_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/ensemble/deeplearning_preds.csv', index_col=False)
deep_df

Unnamed: 0,id,prediction
0,190708,0.436028
1,190727,0.612606
2,190777,0.426674
3,190796,0.714953
4,190724,0.539045
...,...,...
6235,132366,0.306614
6236,187298,0.750671
6237,189504,0.435435
6238,139327,0.438436


In [203]:
pos_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/ensemble/test_df_upsampled_pos_predictions.csv', index_col=False)
pos_df

Unnamed: 0,id,title,title_url,content,published_date,article,pageviews,avgTimeOnPage,days,processed_content,year_month,percentile,prob_LDAtopic_1,prob_LDAtopic_2,prob_LDAtopic_3,prob_LDAtopic_4,prob_LDAtopic_5,prob_LDAtopic_6,prob_LDAtopic_7,authors,tags,author_positions,quintile,top25pct,ridge_predictions_pos,logistic_predictions_pos,rf_predictions_pos,ada_predictions_pos,svm_linear_pos
0,190708,Students condemn anti-semitic posts by Philade...,desean-jackson-philadelphia-eagles-anti-semiti...,"Earlier this month, DeSean Jackson of the Phil...",2020-07-29 06:41:32,/article/2020/07/desean-jackson-philadelphia-e...,633.0,146.007246,7,earlier month desean jackson philadelphia eagl...,2020-07,0.401899,0.065902,0.000780,0.000780,0.047355,0.883625,0.000779,0.000780,Tori Sousa,"news, front, gender-diversity, beats","GA, DP GA, News Beat",3,0,0.555398,1,1,1,1
1,190727,Students opting to stay home this fall cite he...,penn-return-to-campus-hybrid-classes-from-home,Although Penn has invited all students back to...,2020-07-29 04:37:38,/article/2020/07/penn-return-to-campus-hybrid-...,861.0,178.692308,7,although penn invited student back campus fall...,2020-07,0.556962,0.000814,0.000815,0.146737,0.451331,0.000814,0.275696,0.123793,Isaac Lee,"academics, news, front, general-assignment","GA, DP GA, News GA",3,0,0.523348,1,0,1,1
2,190777,Penn Med study finds low risk of strokes in ho...,penn-medicine-coronavirus-stroke-study,"Contrary to findings from previous studies, ne...",2020-07-28 23:03:39,/article/2020/07/penn-medicine-coronavirus-str...,145.0,186.312500,7,contrary finding previous study new penn medic...,2020-07,0.044304,0.000902,0.000903,0.106221,0.000902,0.000908,0.889263,0.000900,Elizabeth Meisenzahl,"news, front, penn-medicine, real-time, news-co...","GA, DP GA, News Beat",1,0,0.555398,1,1,1,1
3,190796,An Unremembered Icon: The Story of 1896 Penn g...,george-orton-penn-philadelphia-canada-1900-oly...,Clasping the rectangular bronze medal with his...,2020-07-28 04:02:53,/article/2020/07/george-orton-penn-philadelphi...,1388.0,210.709302,7,clasping rectangular bronze medal left hand — ...,2020-07,0.658228,0.073529,0.329699,0.026752,0.000453,0.065123,0.035681,0.468764,Brandon Pride,"sports, front, featured, sports-feature, track","Sports, DP Sports",4,0,0.333023,0,0,0,0
4,190724,New Student Orientation and Convocation will b...,penn-convocation-new-student-orientation-online,"New Student Orientation and Convocation, stapl...",2020-07-23 02:29:03,/article/2020/07/penn-convocation-new-student-...,914.0,140.356757,7,new student orientation convocation staple fir...,2020-07,0.569620,0.000818,0.000818,0.000819,0.415121,0.000820,0.307416,0.274188,Celia Kreth,"studentlife, top, news, beats, latest-covid, c...","GA, DP GA, News GA",3,0,0.523348,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6235,132366,Bioengineers use 3-D printer to create human o...,bioengineers-use-3-d-printer,A group of bioengineers at Penn is one step cl...,2012-09-25 01:04:18,/article/2012/09/bioengineers-use-3-d-printer,639.0,128.881720,7,group bioengineers penn one step closer toward...,2012-09,0.850900,0.000615,0.000616,0.000617,0.008801,0.000615,0.957171,0.031566,Tanvir Aslam,"centerpiece, news, cmn_mailfeed, school_of_eng...","N, e, w, s, ,, , B, e, a, t, , R, e, p, o, r...",5,1,0.493316,0,0,1,1
6236,187298,"International Affairs Association suspended, l...",penn-iaa-model-un-osa-shut-down,The University has suspended the International...,2020-02-05 04:49:38,/article/2020/02/penn-iaa-model-un-osa-shut-down,10715.0,177.142857,7,university suspended international affair asso...,2020-02,0.986159,0.000956,0.000955,0.183961,0.811262,0.000956,0.000956,0.000955,"Conor Murray, Elizabeth Meisenzahl","studentlife, top, news, front, beats","GA, DP City Beat, Campus Beat, News GA, GA, DP...",5,1,0.683369,1,1,1,1
6237,189504,University's retrieval policy draws mixed reac...,penn-residential-services-moveout-shipping,"Some students, having been unable to retrieve ...",2020-05-26 04:57:25,/article/2020/05/penn-residential-services-mov...,3079.0,168.355932,7,student unable retrieve belonging dorm month f...,2020-05,0.907801,0.000690,0.000690,0.657349,0.311565,0.000689,0.028326,0.000690,Lindsey Perlman,"news, front, housing-dining, general-assignmen...","News GA, DP GA, News GA",5,1,0.515227,1,0,1,1
6238,139327,"Busting barriers, Penn's top cop celebrates 20...",busting-barriers-penns-top-cop-20-years-mauree...,"The two of them stood still, guns cocked and p...",2014-11-10 06:43:00,/article/2014/11/busting-barriers-penns-top-co...,1189.0,339.948000,7,two stood still gun cocked pointed one another...,2014-11,0.891566,0.000159,0.036371,0.583848,0.079680,0.056107,0.014475,0.229361,Jill Castellano,"newsletter, centerpiece, news, administration,...","City Beat, DP Senior Writer",5,1,0.639404,1,1,1,1


In [204]:
tags_df = pd.read_csv('/content/drive/MyDrive/CIS520 Project/data set/ensemble/test_df_upsampled_tags_predictions.csv', index_col=False)
tags_df

Unnamed: 0,id,title,title_url,content,published_date,article,pageviews,avgTimeOnPage,days,processed_content,year_month,percentile,prob_LDAtopic_1,prob_LDAtopic_2,prob_LDAtopic_3,prob_LDAtopic_4,prob_LDAtopic_5,prob_LDAtopic_6,prob_LDAtopic_7,authors,tags,author_positions,quintile,top25pct,ridge_predictions_tags,logistic_predictions_tags,rf_predictions_tags,ada_predictions_tags,svm_linear_tags
0,190708,Students condemn anti-semitic posts by Philade...,desean-jackson-philadelphia-eagles-anti-semiti...,"Earlier this month, DeSean Jackson of the Phil...",2020-07-29 06:41:32,/article/2020/07/desean-jackson-philadelphia-e...,633.0,146.007246,7,earlier month desean jackson philadelphia eagl...,2020-07,0.401899,0.065902,0.000780,0.000780,0.047355,0.883625,0.000779,0.000780,Tori Sousa,"news, front, gender-diversity, beats","GA, DP GA, News Beat",3,0,0.554656,1,1,1,1
1,190727,Students opting to stay home this fall cite he...,penn-return-to-campus-hybrid-classes-from-home,Although Penn has invited all students back to...,2020-07-29 04:37:38,/article/2020/07/penn-return-to-campus-hybrid-...,861.0,178.692308,7,although penn invited student back campus fall...,2020-07,0.556962,0.000814,0.000815,0.146737,0.451331,0.000814,0.275696,0.123793,Isaac Lee,"academics, news, front, general-assignment","GA, DP GA, News GA",3,0,0.524557,1,0,1,0
2,190777,Penn Med study finds low risk of strokes in ho...,penn-medicine-coronavirus-stroke-study,"Contrary to findings from previous studies, ne...",2020-07-28 23:03:39,/article/2020/07/penn-medicine-coronavirus-str...,145.0,186.312500,7,contrary finding previous study new penn medic...,2020-07,0.044304,0.000902,0.000903,0.106221,0.000902,0.000908,0.889263,0.000900,Elizabeth Meisenzahl,"news, front, penn-medicine, real-time, news-co...","GA, DP GA, News Beat",1,0,0.435747,1,0,1,0
3,190796,An Unremembered Icon: The Story of 1896 Penn g...,george-orton-penn-philadelphia-canada-1900-oly...,Clasping the rectangular bronze medal with his...,2020-07-28 04:02:53,/article/2020/07/george-orton-penn-philadelphi...,1388.0,210.709302,7,clasping rectangular bronze medal left hand — ...,2020-07,0.658228,0.073529,0.329699,0.026752,0.000453,0.065123,0.035681,0.468764,Brandon Pride,"sports, front, featured, sports-feature, track","Sports, DP Sports",4,0,0.546499,0,0,0,0
4,190724,New Student Orientation and Convocation will b...,penn-convocation-new-student-orientation-online,"New Student Orientation and Convocation, stapl...",2020-07-23 02:29:03,/article/2020/07/penn-convocation-new-student-...,914.0,140.356757,7,new student orientation convocation staple fir...,2020-07,0.569620,0.000818,0.000818,0.000819,0.415121,0.000820,0.307416,0.274188,Celia Kreth,"studentlife, top, news, beats, latest-covid, c...","GA, DP GA, News GA",3,0,0.590976,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6235,132366,Bioengineers use 3-D printer to create human o...,bioengineers-use-3-d-printer,A group of bioengineers at Penn is one step cl...,2012-09-25 01:04:18,/article/2012/09/bioengineers-use-3-d-printer,639.0,128.881720,7,group bioengineers penn one step closer toward...,2012-09,0.850900,0.000615,0.000616,0.000617,0.008801,0.000615,0.957171,0.031566,Tanvir Aslam,"centerpiece, news, cmn_mailfeed, school_of_eng...","N, e, w, s, ,, , B, e, a, t, , R, e, p, o, r...",5,1,0.658718,1,1,1,0
6236,187298,"International Affairs Association suspended, l...",penn-iaa-model-un-osa-shut-down,The University has suspended the International...,2020-02-05 04:49:38,/article/2020/02/penn-iaa-model-un-osa-shut-down,10715.0,177.142857,7,university suspended international affair asso...,2020-02,0.986159,0.000956,0.000955,0.183961,0.811262,0.000956,0.000956,0.000955,"Conor Murray, Elizabeth Meisenzahl","studentlife, top, news, front, beats","GA, DP City Beat, Campus Beat, News GA, GA, DP...",5,1,0.632183,1,1,1,1
6237,189504,University's retrieval policy draws mixed reac...,penn-residential-services-moveout-shipping,"Some students, having been unable to retrieve ...",2020-05-26 04:57:25,/article/2020/05/penn-residential-services-mov...,3079.0,168.355932,7,student unable retrieve belonging dorm month f...,2020-05,0.907801,0.000690,0.000690,0.657349,0.311565,0.000689,0.028326,0.000690,Lindsey Perlman,"news, front, housing-dining, general-assignmen...","News GA, DP GA, News GA",5,1,0.625466,1,1,1,0
6238,139327,"Busting barriers, Penn's top cop celebrates 20...",busting-barriers-penns-top-cop-20-years-mauree...,"The two of them stood still, guns cocked and p...",2014-11-10 06:43:00,/article/2014/11/busting-barriers-penns-top-co...,1189.0,339.948000,7,two stood still gun cocked pointed one another...,2014-11,0.891566,0.000159,0.036371,0.583848,0.079680,0.056107,0.014475,0.229361,Jill Castellano,"newsletter, centerpiece, news, administration,...","City Beat, DP Senior Writer",5,1,0.851941,1,1,1,1


In [216]:
#Combine all the predictions:

ensemble = pd.DataFrame(bert_df['bert_pred'], columns = ['bert'])
ensemble[['ridge_bow','log_bow','rf_bow','ada_bow','svm_bow']]=supervised_BOW_df[['Ridge_Predictions'	,'log_predictions'	,'rf_predictions'	,'ada_predictions'	,'svm_predictions']]
ensemble[['log_lda'	,'rf_lda',	'ada_lda',	'svm_lda']] = supervised_lda_df[['log_predictions_lda'	,'rf_predictions_lda'	,'ada_predictions_lda'	,'svm_predictions_lda']]
ensemble[['log_nmf'	,'rf_nmf',	'ada_nmf',	'svm_nmf']] = supervised_nmf_df[['log_predictions_nmf'	,'rf_predictions_nmf'	,'ada_predictions_nmf'	,'svm_predictions_nmf']]
ensemble['bert']=bert_df['bert_pred']
ensemble['deep']=deep_df['prediction']
ensemble[['ridge_tags','log_tags','rf_tags','ada_tags','svm_tags']]=tags_df[['ridge_predictions_tags',	'logistic_predictions_tags',	'rf_predictions_tags',	'ada_predictions_tags',	'svm_linear_tags']]
ensemble[['ridge_pos','log_pos','rf_pos','ada_tags','svm_tags']]=tags_df[['ridge_predictions_tags',	'logistic_predictions_tags',	'rf_predictions_tags',	'ada_predictions_tags',	'svm_linear_tags']]


ensemble

Unnamed: 0,bert,ridge_bow,log_bow,rf_bow,ada_bow,svm_bow,log_lda,rf_lda,ada_lda,svm_lda,log_nmf,rf_nmf,ada_nmf,svm_nmf,deep,ridge_tags,log_tags,rf_tags,ada_tags,svm_tags,ridge_pos,log_pos,rf_pos
0,0.000002,0.529877,0,0,0,0,0,1,1,0,1,0,1,1,0.436028,0.554656,1,1,1,1,0.554656,1,1
1,0.995266,0.647678,0,1,1,0,1,1,1,1,1,1,1,1,0.612606,0.524557,1,0,1,0,0.524557,1,0
2,0.000002,0.621598,0,0,0,0,0,0,0,0,1,0,0,0,0.426674,0.435747,1,0,1,0,0.435747,1,0
3,0.000003,0.544618,1,1,1,1,0,0,0,0,0,0,0,0,0.714953,0.546499,0,0,0,0,0.546499,0,0
4,0.000002,0.420219,0,0,1,0,1,1,1,1,1,1,1,1,0.539045,0.590976,1,0,1,1,0.590976,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6235,0.000002,0.538141,0,1,0,0,0,0,0,0,1,0,0,0,0.306614,0.658718,1,1,1,0,0.658718,1,1
6236,0.999888,0.574122,1,1,0,1,1,1,1,1,1,0,1,1,0.750671,0.632183,1,1,1,1,0.632183,1,1
6237,0.000028,0.701051,0,1,1,0,1,1,1,1,1,1,1,1,0.435435,0.625466,1,1,1,0,0.625466,1,1
6238,0.000002,0.489858,0,1,0,0,1,1,1,1,1,0,1,1,0.438436,0.851941,1,1,1,1,0.851941,1,1


In [218]:
ensemble.to_csv("ensemble.csv")
!cp ensemble.csv "/content/drive/MyDrive/CIS520 Project/data set/ensemble/"

# Ensemble 

In [284]:
#First split the test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(ensemble, test_df_upsampled['top25pct'], test_size=0.33, random_state=42)

In [227]:
#Random Forest
param_grid_rf = [
    {'max_depth': [ 5,7, 10, 13, 15, 20], 'max_features': [7, 10, 15, 20, 23], 'n_estimators': [ 10, 25, 50]}
]

In [228]:
model_list = [RandomForestClassifier()]
param_grid_list = [param_grid_rf]

hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Best Parameters: {'max_depth': 20, 'max_features': 7, 'n_estimators': 50}
Best Cross Validation Score: 0.8784688995215312
Test Set Score: 0.8907766990291263


In [230]:
clf = RandomForestClassifier(max_depth=20, max_features=7, n_estimators=50)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features=7,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [238]:

# Fit on test data
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[831 226]
 [ 66 937]]
Accuracy: 0.858252427184466, Precision: 0.8056749785038693, Recall: 0.93419740777667, F1 Score: 0.8651892890120038


In [232]:
#SVM
list_kernel_type = ['linear', 'poly', 'rbf']
random_state = 20

objs_KSVM = [SVC(kernel=kernel_type, random_state=random_state) for kernel_type in list_kernel_type]

for model in objs_KSVM:
  model.fit(X_train, y_train)
  print(model.score(X_test, y_test))

0.7436893203883496
0.7572815533980582
0.7504854368932039


In [279]:
clf = SVC(kernel='poly', random_state=random_state)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[770 287]
 [213 790]]
Accuracy: 0.7572815533980582, Precision: 0.7335190343546889, Recall: 0.7876370887337986, F1 Score: 0.7596153846153846


In [291]:
#Logistic

# Hyperparameter tuning
param_grid_logistic = {'C': [0.001,  0.1, 1, 10]}

model_list = [LogisticRegression(max_iter = 1000)]
param_grid_list = [param_grid_logistic]

hyp_tuning(model_list, param_grid_list, X_train, y_train, X_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Best Parameters: {'C': 1}
Best Cross Validation Score: 0.7626794258373206
Test Set Score: 0.7441747572815534


In [292]:

# Fit on test data
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[857 200]
 [ 56 947]]
Accuracy: 0.8757281553398059, Precision: 0.8256320836965998, Recall: 0.9441674975074775, F1 Score: 0.8809302325581395


In [294]:
#Adaboost
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

learning_rate = 0.1
max_depth = 7
random_state = 20

base_estimator = DecisionTreeClassifier(max_depth=max_depth)
clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=5, learning_rate=learning_rate, random_state=random_state)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
clf.score(X_test, y_test)

0.7966019417475728

In [295]:

# Fit on test data
y_test_pred = clf.predict(X_test)
print('Test')
get_classification_metrics(y_test, y_test_pred)

Test
[[790 267]
 [152 851]]
Accuracy: 0.7966019417475728, Precision: 0.761180679785331, Recall: 0.8484546360917248, F1 Score: 0.8024516737388024
