**Training Model on BERT**

Requirements:
1. Install the `transformers` and `simpletransformers` packages (both are imported below)


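Remarks:
1. The dataset is imbalanced (the top-25% class is the minority), so the minority class is upsampled before training
2. Training may take a long time - the model is trained on titles first, then on the `content` column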

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
import transformers
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, BertModel, BertTokenizer, BertForSequenceClassification, BertTokenizerFast
from scipy.special import softmax
from datetime import datetime
from sklearn.utils import resample

In [None]:
# Load the train / test splits from the Drive folder
# (this is the path to use when working from the Wharton account)
train = pd.read_csv('drive/MyDrive/CIS520 Project/data set/train.csv')
test = pd.read_csv('drive/MyDrive/CIS520 Project/data set/test.csv')

# Add the binary target (1 = percentile of at least 0.75, i.e. top 25%; 0 = bottom 75%)
# and turn the 'published_date' strings into datetime values
train = train.assign(
  top25pct = (train['percentile'] >= 0.75).astype(int),
  published_date = train['published_date'].map(lambda stamp: datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')),
)
test = test.assign(
  top25pct = (test['percentile'] >= 0.75).astype(int),
  published_date = test['published_date'].map(lambda stamp: datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')),
)

In [None]:
def upsample_minority(df, label_col = 'top25pct', random_state = 42):
  """Balance a binary-labelled frame by upsampling its positive class.

  Rows whose ``label_col`` equals 1 are drawn with replacement until there
  are as many of them as rows whose ``label_col`` equals 0. The function does
  not check which class is actually larger: label 1 is always the class that
  gets resampled. Rows with any other label value are not carried over.

  Args:
    df: A single DataFrame containing ``label_col`` (call once per split).
    label_col: Name of the 0/1 label column. Defaults to 'top25pct'.
    random_state: Seed passed to ``resample`` so the draw is repeatable.
      Defaults to 42.

  Returns:
    A new DataFrame holding the label-0 rows followed by the resampled
    label-1 rows. Index values are whatever ``pd.concat`` / ``resample``
    produce; callers that need a clean index should reset it.

  Raises:
    ValueError: If there are label-0 rows to match but no label-1 rows to
      draw from.
  """
  df_majority = df.loc[df[label_col] == 0, :]
  df_minority = df.loc[df[label_col] == 1, :]

  # Fail with a readable message rather than an opaque error from the sampler
  if len(df_minority) == 0 and len(df_majority) > 0:
    raise ValueError("Cannot upsample: no rows with " + str(label_col) + " == 1 to draw from")

  # Draw (with replacement) as many positive rows as there are negative rows
  df_minority_upsampled = resample(df_minority, replace = True, n_samples = len(df_majority), random_state = random_state)

  # Combine together to get the balanced data
  return pd.concat([df_majority, df_minority_upsampled])

In [None]:
# Balance both splits by upsampling the minority class, then renumber the rows 0..n-1
# NOTE(review): the train/validation split further down is drawn from this already-upsampled
# frame, so copies of the same minority row can end up on both sides — confirm this is acceptable
train = upsample_minority(train).reset_index(drop = True)
test = upsample_minority(test).reset_index(drop = True)

In [None]:
# Get Peter's test data from the shared Drive folder
# (the file name suggests it is already upsampled — TODO confirm)
test_final = pd.read_csv('drive/MyDrive/CIS520 Project/data set/test_df_upsampled.csv')

In [None]:
# Keep only the title and the label, renamed to the 'text' / 'labels' columns used for the model input
test_final_data = test_final[['title', 'top25pct']].rename(columns = {'title': 'text', 'top25pct': 'labels'})

**Using BERT Classification Model from SimpleTransformers**

In [None]:
from simpletransformers.classification import ClassificationModel
import logging
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the (upsampled) training titles for validation, keeping the class ratio
X_train, X_valid, y_train, y_valid = train_test_split(
  train['title'],
  train['top25pct'],
  test_size = 0.2,
  random_state = 42,
  stratify = train['top25pct'],
)

# Two-column frames ('text', 'labels') for each split
train_data = pd.DataFrame({'text': X_train, 'labels': y_train})
valid_data = pd.DataFrame({'text': X_valid, 'labels': y_valid})
test_data = test[['title', 'top25pct']].rename(columns = {'title': 'text', 'top25pct': 'labels'})

In [None]:
# Row counts of the train / validation / test frames, one per line
print(len(train_data), len(valid_data), len(test_data), sep = '\n')

20067
5017
6240


In [None]:
# Show INFO-level logs in general, but only warnings and above from the "transformers" logger
logging.basicConfig(level = logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Options handed to simpletransformers: where to save the model / cache,
# no fp16, and permission to overwrite earlier outputs and rebuild features
args = dict(
  output_dir = 'drive/MyDrive/CIS520 Project/final bert model/',
  cache_dir = 'cache/',
  fp16 = False,
  overwrite_output_dir = True,
  reprocess_input_data = True,
)

# BERT sequence classifier initialised from the 'bert-base-cased' checkpoint
model = ClassificationModel('bert', 'bert-base-cased', args = args)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# Train the title model for 20 epochs (overrides the epoch count only; other options come from the model's args)
# NOTE(review): per the simpletransformers docs, eval_df is only used when 'evaluate_during_training'
# is enabled, which is not set here — confirm whether validation during training was intended
model.train_model(train_data, eval_df = valid_data, args = {'num_train_epochs': 20})

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=20067.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=20.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 20', max=2509.0, style=ProgressStyle(d…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 10 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 11 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 12 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 13 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 14 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 15 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 16 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 17 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 18 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 19 of 20', max=2509.0, style=ProgressStyle(…





INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to drive/MyDrive/CIS520 Project/final bert model/.


(50180, 0.09237574692887576)

In [None]:
# Rebind `model` to a classifier loaded from the saved output directory
# (no `args` are passed here, so the custom options used at training time are not re-applied)
model = ClassificationModel('bert', 'drive/MyDrive/CIS520 Project/final bert model/')

In [None]:
# Score the title model on the test frame, also reporting accuracy under the key 'acc'
result, model_outputs, wrong_predictions = model.eval_model(test_data, acc = accuracy_score)

# Raw outputs -> per-class probabilities -> predicted class (index of the largest probability)
test_probs = softmax(model_outputs, axis = 1)
test_preds = test_probs.argmax(axis = 1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6240.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=780.0), HTML(value='')))




In [None]:
# Save the title model's test-set class probabilities next to the saved model
np.save('drive/MyDrive/CIS520 Project/final bert model/test_probs_titles.npy', test_probs)

In [None]:
# Confusion matrix for the title model: true labels are passed first, predictions second
confusion_matrix(test_data['labels'], test_preds)

array([[2657,  463],
       [1520, 1600]])

In [None]:
# Same evaluation as above, this time on Peter's test frame
result_final, model_outputs_final, wrong_predictions_final = model.eval_model(test_final_data, acc = accuracy_score)

# Raw outputs -> per-class probabilities -> predicted class
test_probs_final = softmax(model_outputs_final, axis = 1)
test_preds_final = test_probs_final.argmax(axis = 1)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6240.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=780.0), HTML(value='')))




In [None]:
# Confusion matrix on Peter's test frame: true labels are passed first, predictions second
confusion_matrix(test_final_data['labels'], test_preds_final)

array([[2657,  463],
       [1520, 1600]])

In [None]:
# Pair each test id with the predicted probability of class 1 (second column of the probabilities)
test_final_out = test_final[['id']].assign(bert_pred = test_probs_final[:, 1])

In [None]:
# Write the id / probability table to Drive without the index column.
# to_csv returns None when given a path, so the result is deliberately not assigned back:
# rebinding test_final_out would destroy the frame and make this cell fail on re-run.
test_final_out.to_csv('drive/MyDrive/CIS520 Project/bert_preds.csv', index = False)

**BERT Classification Model on Content**

In [None]:
# Same 80/20 stratified split as for the titles, but on the 'content' column.
# The label column is 'top25pct' (the name created when the data is loaded); the previous
# 'top_25pct' spelling does not match it.
# Note: this rebinds X_train / train_data / valid_data / test_data from the title section.
X_train, X_valid, y_train, y_valid = train_test_split(train['content'], train['top25pct'], test_size = 0.2, random_state = 42, stratify = train['top25pct'])


train_data = pd.concat([X_train, y_train], axis = 1)
valid_data = pd.concat([X_valid, y_valid], axis = 1)
test_data = pd.concat([test['content'], test['top25pct']], axis = 1)

# Rename columns
train_data.columns = ['text', 'labels']
valid_data.columns = ['text', 'labels']
test_data.columns = ['text', 'labels']

In [None]:
# Show INFO-level logs in general, but only warnings and above from the "transformers" logger
logging.basicConfig(level = logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Options for the content model: separate output folder, same remaining settings as the title model
args = dict(
  output_dir = 'drive/MyDrive/CIS520 Project/bert model output content',
  cache_dir = 'cache/',
  fp16 = False,
  overwrite_output_dir = True,
  reprocess_input_data = True,
)

# BERT sequence classifier for the content text, initialised from 'bert-base-cased'
model_content = ClassificationModel('bert', 'bert-base-cased', args = args)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# Train the content model for 20 epochs (overrides the epoch count only; other options come from the model's args)
# NOTE(review): per the simpletransformers docs, eval_df is only used when 'evaluate_during_training'
# is enabled, which is not set here — confirm whether validation during training was intended
model_content.train_model(train_data, eval_df = valid_data, args = {'num_train_epochs': 20})

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=20067.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=20.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 20', max=2509.0, style=ProgressStyle(d…






HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 20', max=2509.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 10 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 11 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 12 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 13 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 14 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 15 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 16 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 17 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 18 of 20', max=2509.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 19 of 20', max=2509.0, style=ProgressStyle(…





INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to drive/MyDrive/CIS520 Project/bert model output content.


(50180, 0.09504673607004901)

In [None]:
# Score the content model on the test frame, also reporting accuracy under the key 'acc'
# (this rebinds result / model_outputs / test_probs / test_preds from the title section)
result, model_outputs, wrong_predictions = model_content.eval_model(test_data, acc = accuracy_score)

# Raw outputs -> per-class probabilities -> predicted class
test_probs = softmax(model_outputs, axis = 1)
test_preds = test_probs.argmax(axis = 1)


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=6240.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=780.0, style=ProgressStyle(descr…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.3837474062677855, 'tp': 1417, 'tn': 2780, 'fp': 340, 'fn': 1703, 'acc': 0.6725961538461539, 'eval_loss': 3.901625492418823}





In [None]:
# Confusion matrix for the content model: true labels are passed first, predictions second
confusion_matrix(test_data['labels'], test_preds)

array([[2780,  340],
       [1703, 1417]])

In [None]:
# Save the content model's test-set class probabilities inside its output folder
np.save('drive/MyDrive/CIS520 Project/bert model output content/test_probs_content.npy', test_probs)