# Final Project Models
Contains all of the code from preprocessing the dataset to the final csv output from combining all of the models into one prediction function.

In [97]:
# Some of the installs that had to be done to ensure versions matched
# ! pip install 'accelerate>={ACCELERATE_MIN_VERSION}'
# ! pip install -U accelerate
# ! pip install -U transformers
#nltk.download('stopwords')
#nltk.download('punkt')

import pandas as pd
import tensorflow
import keras
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datasets import load_dataset
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import time
import pickle

# Loading the HuggingFace dataset chosen (Amazon review polarity).
# The returned object is indexed by split name further down ('train' and 'test').
ds = load_dataset("fancyzhx/amazon_polarity")

# Preprocessing

Cleaning the data by combining the title and content into one text column. NA values and missing entries are not present in the dataset; this was already checked during exploratory analysis.

Establish functions to select only the positive or negative labeled data to ensure an even split in the data when selecting a smaller portion. 

Collecting a subset of the data, making sure it is evenly split between the two labels in both the test and train sets. The original dataset has 2 million observations which is a bit too large of a dataset for the comparison that I want to run. So we will use a random 5% of the data with 90,000 training entries (45,000 positive and 45,000 negative) and 10,000 testing entries (5,000 positive and 5,000 negative).

In [100]:
# Function that combines the title and content column into one text column
def concantinate(data_frame):
    """Add a combined ``text`` field built from ``title`` and ``content``.

    The title and the content are joined with a single space and stored under
    the ``"text"`` key of ``data_frame`` itself (the input is modified in
    place); the same object is then returned.
    """
    title, body = data_frame["title"], data_frame["content"]
    data_frame["text"] = title + " " + body
    return data_frame

# Applying the above function to the HuggingFace dataset to add the combined "text" field.
# Despite its name, `df` is the mapped HuggingFace dataset object (indexed by split below,
# e.g. df['train']), not a pandas DataFrame.
df = ds.map(concantinate)

In [102]:
# Function to select only the obs. where the label is 0
def filter_negative_reviews(ex):
    """Return True when the example's ``label`` equals 0 (negative review)."""
    label = ex['label']
    return label == 0

# Function to select only the obs. where the label is 1
def filter_positive_reviews(ex):
    """Return True when the example's ``label`` equals 1 (positive review)."""
    label = ex['label']
    return label == 1

In [104]:
# Split the training and test data by label to prepare for subset selection
train_pos = df['train'].filter(filter_positive_reviews)
train_neg = df['train'].filter(filter_negative_reviews)
test_pos = df['test'].filter(filter_positive_reviews)
test_neg = df['test'].filter(filter_negative_reviews)

# Select a balanced subset of the data since the full dataset takes too long to train on.
# Each label split is shuffled with a fixed seed so the selection is reproducible.
train_pos_subset = train_pos.shuffle(seed=42).select(range(45000)) # 45,000 train label 1
train_neg_subset = train_neg.shuffle(seed=42).select(range(45000)) # 45,000 train label 0
test_pos_subset = test_pos.shuffle(seed=42).select(range(5000)) # 5,000 test label 1
test_neg_subset = test_neg.shuffle(seed=42).select(range(5000)) # 5,000 test label 0

# Combine the training data back together and split on text and label.
# Column access is materialised with list(...) first: older `datasets` releases return a
# plain list here, but newer releases may return a lazy column object, for which `+`
# concatenation is not guaranteed to work. list(...) behaves the same on both.
# Order is all positives followed by all negatives (no shuffling at this point).
train_text = list(train_pos_subset['text']) + list(train_neg_subset['text'])
train_labels = list(train_pos_subset['label']) + list(train_neg_subset['label'])

# Combine the testing data back together and split on text and label (same ordering)
test_text = list(test_pos_subset['text']) + list(test_neg_subset['text'])
test_labels = list(test_pos_subset['label']) + list(test_neg_subset['label'])

# SVM Model

In [11]:
# Chosen vectorizer to convert text to be readable for the model
# Stop words removed as word frequency is used to determine importance in TFIDF
# max_features=5000 caps the vocabulary size; max_df=0.95 drops terms appearing in more
# than 95% of documents; min_df=5 drops terms appearing in fewer than 5 documents.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', max_df=0.95, min_df=5)

# Applying the vectorizer to train and test sets.
# The vocabulary and IDF weights are learned from the training text only (fit_transform);
# the test text is only transformed with them, so no test information leaks into the features.
svm_train = vectorizer.fit_transform(train_text)
svm_test = vectorizer.transform(test_text)

In [13]:
# Create the SVM model using the method from class
# Linear kernel has the best fit w/o overfitting
# NOTE(review): SVC with a linear kernel is slow on this many samples (the fit below is
# reported at ~18 minutes); sklearn's LinearSVC is the usual faster alternative, but it is
# a different solver and would need its results re-validated.
svm_model = SVC(kernel='linear', C=1)

In [15]:
# Fit the SVM on the TF-IDF training matrix and the matching training labels
svm_model.fit(svm_train, train_labels) # 18 minutes to run

In [17]:
# Get the SVM's predicted labels for the TF-IDF-transformed test data
test_preds = svm_model.predict(svm_test)

In [19]:
# Get model metrics (per-class precision / recall / f1 and overall accuracy)
# by comparing the true test labels with the SVM predictions
print(classification_report(test_labels, test_preds))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      5000
           1       0.88      0.88      0.88      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [21]:
# Persist the fitted SVM and its TF-IDF vectorizer as separate pickle files;
# both are needed later to score new text.
for pickle_path, artifact in (
    ('svm_model.pkl', svm_model),          # the trained classifier
    ('svm_vectorizer.pkl', vectorizer),    # the fitted vectorizer it was trained with
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)

# BERT Model

In [106]:
from datasets import Dataset, concatenate_datasets
from transformers import BertTokenizerFast, DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score

In [26]:
# Combine the positive and negative subsets into single train and test datasets
# (each keeps all columns, so text and label stay together).
# Rows are ordered positives first, then negatives; nothing is shuffled here.
train = concatenate_datasets([train_pos_subset, train_neg_subset])
test = concatenate_datasets([test_pos_subset, test_neg_subset])

In [14]:
# Use fast tokenizer instead of normal to speed up the process
# NOTE(review): this loads the BERT tokenizer ("bert-base-uncased") although the model
# trained below is DistilBERT ("distilbert-base-uncased"), and the imported
# DistilBertTokenizerFast is not used in the visible notebook. Presumably the two
# checkpoints share a vocabulary, so this works - confirm before changing either side.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Function to apply to data frame to tokenize
def tokenize_function(data_frame):
    """Tokenize the ``text`` field of a batch of examples.

    Called through ``Dataset.map(..., batched=True)`` below, so ``data_frame['text']``
    holds several reviews at once. Every review is truncated and padded to exactly
    512 tokens (``padding="max_length"``), regardless of its real length.
    """
    return tokenizer(data_frame['text'], 
                     padding="max_length", 
                     truncation=True, max_length=512)

# Tokenize the train and the test sets (the tokenizer outputs are added as new columns)
train_tokenized = train.map(tokenize_function, batched = True)
test_tokenized = test.map(tokenize_function, batched = True)

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [16]:
# Create model using distil bert, smaller and faster to train.
# num_labels = 2 adds a two-class classification head; per the load message below, that
# head is newly initialised and only becomes useful after the fine-tuning that follows.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Metric hook for the Trainer: reports accuracy so it can drive best-model selection
def compute_metrics(training):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    ``predictions`` holds one row of class scores per example; the index of the
    largest score in each row is taken as the predicted class. The result is
    returned under the ``"eval_accuracy"`` key.
    """
    class_scores, true_labels = training
    predicted_classes = class_scores.argmax(axis=1)
    return {"eval_accuracy": accuracy_score(true_labels, predicted_classes)}

In [22]:
# Training arguments to pass to the trainer function
training_args = TrainingArguments(
    output_dir = "./results",              # where checkpoints are written
    num_train_epochs = 3,                  # three passes over the training set
    per_device_train_batch_size = 16,      # examples per forward/backward pass
    per_device_eval_batch_size = 16,       # examples per evaluation batch
    warmup_steps = 500,                    # learning-rate warmup steps
    weight_decay = 0.01,                   
    logging_dir = "./logs",                
    logging_steps = 10,                    # log the training loss every 10 steps
    eval_strategy = "epoch",               # evaluate once at the end of every epoch
    save_strategy = "epoch",               # checkpoint once per epoch, matching eval_strategy
    load_best_model_at_end = True,         # reload the best checkpoint when training ends
    metric_for_best_model = "eval_accuracy",    # key returned by compute_metrics
    gradient_accumulation_steps = 4        # accumulate 4 batches before each optimizer step
)
# Few epochs keep the total training time down.
# gradient_accumulation_steps does not reduce the amount of computation; it makes each
# optimizer step use 16 * 4 = 64 examples per device without needing the memory of a
# 64-example batch. This matches the 4218 optimizer steps reported by trainer.train()
# for 3 epochs over 90,000 examples.

In [24]:
# Early stopping: stop training once the monitored metric (metric_for_best_model) has
# failed to improve by more than 0.01 for 3 evaluations in a row.
# NOTE(review): patience is counted in evaluation calls, and evaluation runs once per
# epoch with only 3 epochs configured, so this callback presumably can never trigger
# before training finishes on its own - confirm, and lower the patience if early
# stopping is actually wanted.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience = 3,         
    early_stopping_threshold = 0.01,
)

In [26]:
# Trainer function takes in the training argument, model, data, tokenizer, and early stopping call
# With this the train line is simple in the next chunk
# NOTE(review): eval_dataset is the test set, and load_best_model_at_end picks the
# checkpoint with the best accuracy on it, so the accuracy reported on this same test set
# is likely somewhat optimistic - a separate validation split would avoid that.
trainer = Trainer(
    model = model,                        
    args = training_args,                 
    train_dataset = train_tokenized,        
    eval_dataset = test_tokenized,          
    processing_class = tokenizer,                
    compute_metrics = compute_metrics,
    callbacks = [early_stopping]
)

In [28]:
# Fine-tuning the DistilBERT model with the configuration above
# (the per-epoch table and TrainOutput below are the recorded results of this call)
trainer.train() # 5 hours to run

Epoch,Training Loss,Validation Loss,Accuracy
0,0.1246,0.134441,0.9524
1,0.0798,0.129536,0.9553
2,0.0377,0.191087,0.9562


TrainOutput(global_step=4218, training_loss=0.11398577996061107, metrics={'train_runtime': 18060.3152, 'train_samples_per_second': 14.95, 'train_steps_per_second': 0.234, 'total_flos': 3.575983920198451e+16, 'train_loss': 0.11398577996061107, 'epoch': 2.9994666666666667})

In [30]:
# Evaluate the trained model on the Trainer's eval_dataset (the tokenized test set)
# and show the resulting metrics dictionary
results = trainer.evaluate() # 4 minutes to run
print(results)

{'eval_accuracy': 0.9562, 'eval_loss': 0.19108682870864868, 'eval_runtime': 227.925, 'eval_samples_per_second': 43.874, 'eval_steps_per_second': 2.742, 'epoch': 2.9994666666666667}


In [32]:
# Save both the model and the tokenizer to use later.
# Both go to the same directory; the prediction function further down loads the model
# and the tokenizer back from './trained_model'.
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

# Logistic Regression

In [108]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [120]:
# Create vectorizer like the SVM one, except keep the stopwords for Logistic Regression.
# It also differs from the SVM vectorizer in using a larger vocabulary cap (10,000 vs 5,000)
# and no max_df / min_df filtering.
tfidf = TfidfVectorizer(max_features=10000)

In [122]:
# Make a pipeline so the model and vectorizer are saved together:
# raw text goes in, TF-IDF is applied first, then logistic regression (liblinear solver)
model_lg = make_pipeline(tfidf, LogisticRegression(solver='liblinear'))

In [124]:
# Fit the whole pipeline on the raw training text: this fits the TF-IDF vectorizer
# and then the logistic regression on the resulting features
model_lg.fit(train_text, train_labels) # less than 1 second to run

In [126]:
# Get the accuracy of the pipeline on the raw test text (vectorizing happens inside the pipeline)
model_lg.score(test_text, test_labels)

0.8988

In [38]:
# Save the model using pickle.
# The pickled object is the whole pipeline, so the fitted vectorizer is stored with it
# and no separate vectorizer file is needed.
with open('lg_model.pkl', 'wb') as f:
    pickle.dump(model_lg, f)

## Logistic Regression on Full Dataset

The above was done with the 5% subset of the dataset, and it runs very fast. Therefore we will now try it with the entire dataset to see if we can get a higher accuracy.

In [52]:
# Full training split: pull out the combined review text and the labels
train_full = df['train']
train_full_text, train_full_labels = train_full['text'], train_full['label']

# Full test split, unpacked the same way
test_full = df['test']
test_full_text, test_full_labels = test_full['text'], test_full['label']

In [54]:
# Fresh vectorizer for the full-dataset run, configured like the one used on the subset
tfidf_full = TfidfVectorizer(max_features=10000)

In [56]:
# Same TF-IDF + logistic regression pipeline as model_lg, kept as a separate object
# so the subset-trained model is not overwritten
model_lg_full = make_pipeline(tfidf_full, LogisticRegression(solver='liblinear'))

In [58]:
# Fit the pipeline on the entire training split
model_lg_full.fit(train_full_text, train_full_labels) # 5 minutes to run

In [60]:
# Accuracy of the full-data pipeline on the entire test split
model_lg_full.score(test_full_text, test_full_labels)

0.9080975

This accuracy is only about 0.01 higher than with only 5% of the data, so we will use the smaller model. We will use the other 95% of the data for the next step.

# Making New Predictions

Only 5% of the data has been used to create all three models, with accuracy ranging from 0.88 to 0.95. We will create a function to predict new data using all three models combined to try to optimize accuracy. Taking this step will help balance new predictions because each model has its own pros and cons.

In [66]:
import torch
import pickle
import tqdm

def _combine_votes(svm_vote, bert_vote, lg_vote):
    """Majority-vote three binary (0/1) predictions into one result tuple.

    Returns ``(label, confidence, votes)`` where ``label`` is the majority class,
    ``confidence`` is the percentage of models that voted for it (100.0 when all
    three agree, about 66.67 when one disagrees) and ``votes`` is the list
    ``[svm, bert, lg]`` of the individual predictions as plain Python ints.
    """
    # Plain ints keep the stored votes readable as "[1, 0, 1]" regardless of whether a
    # model returned numpy scalars or Python ints.
    votes = [int(svm_vote), int(bert_vote), int(lg_vote)]
    avg = sum(votes) / 3

    if avg > 0.5:
        return 1, 100 * avg, votes
    return 0, 100 - (100 * avg), votes


def sentiment_prediction(list, batch_size=32):
    """Predict the sentiment of each review by majority vote of the three saved models.

    The SVM, DistilBERT and logistic-regression models are loaded from the files
    written earlier in this notebook ('svm_model.pkl', 'svm_vectorizer.pkl',
    './trained_model' and 'lg_model.pkl'), each one predicts every review, and the
    three predictions are combined per review.

    Args:
        list: the review texts (strings) to classify. The parameter name is kept for
            backward compatibility even though it shadows the builtin.
        batch_size: number of reviews sent through DistilBERT per forward pass.
            Batching keeps memory use bounded for large inputs.

    Returns:
        A list with one tuple per review, in input order:
        ``(final_label, confidence, [svm_label, bert_label, lg_label])`` where
        ``final_label`` is 0 or 1, ``confidence`` is 100.0 when all models agree
        and about 66.67 when one differs, and the last element is a list of ints.
        An empty input returns an empty list.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise the input once so every model sees the same plain Python list.
    # (A comprehension is used because the builtin `list` is shadowed by the parameter.)
    texts = [review for review in list]

    # Nothing to predict: skip loading the models altogether.
    if len(texts) == 0:
        return []

    # Imported locally so the function does not depend on whether the notebook-level name
    # `tqdm` currently refers to the module or to the class (both imports appear above).
    from tqdm import tqdm as progress_bar

    ### SVM

    # Load the pickled model.
    # NOTE: pickle.load executes code stored in the file; only load files written by
    # this notebook, never pickles from an untrusted source.
    with open('svm_model.pkl', 'rb') as f:
        svm_model_loaded = pickle.load(f)

    # Load the pickled vectorizer
    with open('svm_vectorizer.pkl', 'rb') as f:
        svm_vectorizer_loaded = pickle.load(f)

    # Transform the new data with the SVM vectorizer, then predict
    texts_svm_vectorized = svm_vectorizer_loaded.transform(texts)
    svm_prediction = svm_model_loaded.predict(texts_svm_vectorized)

    print("SVM Prediction done")

    ### BERT

    # Load the saved model and switch it to inference mode
    bert_model_loaded = DistilBertForSequenceClassification.from_pretrained('./trained_model')
    bert_model_loaded.eval()

    # Load the saved tokenizer
    bert_tokenizer_loaded = BertTokenizerFast.from_pretrained('./trained_model')

    bert_prediction = []
    with torch.no_grad():
        # Run the model batch by batch instead of on the whole input at once, so the
        # padded tensors stay small no matter how many reviews are passed in.
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            # Same 512-token truncation limit as used for training
            batch_tokenized = bert_tokenizer_loaded(batch_texts,
                                                    padding=True, truncation=True,
                                                    max_length=512, return_tensors="pt")

            # The BERT tokenizer emits token_type_ids, which the DistilBERT model
            # does not accept, so they are dropped before the forward pass.
            batch_tokenized = {key: value for key,
                               value in batch_tokenized.items() if key != 'token_type_ids'}

            batch_logits = bert_model_loaded(**batch_tokenized).logits

            # Highest-scoring class per review
            bert_prediction.extend(torch.argmax(batch_logits, dim=-1).tolist())

    print("BERT Prediction done")

    ### Logistic Regression

    # Load the pickled model (same trust caveat as for the SVM pickles)
    with open('lg_model.pkl', 'rb') as f:
        lg_model_loaded = pickle.load(f)

    # Don't need to transform with the vectorizer since it is included in the model pipeline
    lg_prediction = lg_model_loaded.predict(texts)

    print("LG Prediction done")

    ### Combining Predictions

    predictions = []

    for i in progress_bar(range(len(texts)), desc="Processing Data", unit="item"):
        predictions.append(
            _combine_votes(svm_prediction[i], bert_prediction[i], lg_prediction[i])
        )

    return predictions

In [68]:
# Selecting 4000 values from the remaining data not used for train or test.
#
# The train/test subsets were taken as the FIRST rows of `split.shuffle(seed=42)`.
# Shuffling with the same seed reproduces the same order, so every row after that prefix
# is guaranteed to be unused. This replaces the earlier filter on `idx not in
# subset._indices`, which relied on a private attribute holding indices in a different
# index space than the filter's `idx`, and therefore did not reliably exclude the rows
# already used for training and testing.

def _sample_unused(label_split, used_subset, n_sample=1000, used_seed=42, sample_seed=44):
    """Draw ``n_sample`` random rows of ``label_split`` that are not in ``used_subset``.

    ``used_subset`` must have been created as
    ``label_split.shuffle(seed=used_seed).select(range(len(used_subset)))``.
    """
    reshuffled = label_split.shuffle(seed=used_seed)
    # Everything past the prefix that was selected for the train/test subset
    unused = reshuffled.select(range(len(used_subset), len(reshuffled)))
    return unused.shuffle(seed=sample_seed).select(range(n_sample))


full_pred_train_pos = _sample_unused(train_pos, train_pos_subset)
full_pred_train_neg = _sample_unused(train_neg, train_neg_subset)
full_pred_test_pos = _sample_unused(test_pos, test_pos_subset)
full_pred_test_neg = _sample_unused(test_neg, test_neg_subset)

# Combining all of the 4000 observations together for the data and labels.
# Columns are materialised with list(...) so that `+` concatenation works whether column
# access returns a plain list or a lazy column object, and so the prediction function
# receives a plain list of strings.
full_pred_train = list(full_pred_train_pos['text']) + list(full_pred_train_neg['text'])
full_pred_test = list(full_pred_test_pos['text']) + list(full_pred_test_neg['text'])
full_pred_data = full_pred_train + full_pred_test

# Labels are concatenated in exactly the same order as the texts above
full_pred_labels = (
    list(full_pred_train_pos['label'])
    + list(full_pred_train_neg['label'])
    + list(full_pred_test_pos['label'])
    + list(full_pred_test_neg['label'])
)

In [70]:
# Predicting on the 4000 observations using the previously made function.
# The result is a list with one (final label, confidence, individual model predictions)
# tuple per review, in the same order as full_pred_data.
full_preds = sentiment_prediction(full_pred_data)

SVM Prediction done
BERT Prediction done
LG Prediction done


Processing Data: 100%|██████████| 4000/4000 [00:00<00:00, 916837.86item/s]


In [80]:
# Taking the function output to a data frame for analysis.
# The three columns correspond, in order, to the three elements of each prediction tuple.
full_preds_df = pd.DataFrame(full_preds, columns = ["Prediction Label", "Prediction Confidence", "Model Predictions"])

In [82]:
# Adding the true labels to the data frame.
# full_pred_labels was built in the same order as full_pred_data, so rows line up by position.
full_preds_df['Real Label'] = full_pred_labels

In [84]:
# Adding a boolean column indicating whether the combined (majority-vote) prediction
# matches the true label
full_preds_df['Accurate (T/F)'] = full_preds_df['Prediction Label'] == full_preds_df['Real Label']

In [86]:
# Saving the data frame (without the index column) to do analysis in the next file
# Full Model Data Analysis.ipynb
full_preds_df.to_csv('conclusion_data.csv', index=False)