In [None]:
# Max Todd
# Applied Machine Learning

from time import perf_counter
from time import time

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [None]:
# Read the train/test splits (generated by running preprocessing.py).
# Both files are read with the same options, so they are kept in one place.
read_opts = {'encoding': 'unicode_escape'}
traindf = pd.read_csv('./data/preprocessed/train.csv', **read_opts)
testDF = pd.read_csv('./data/preprocessed/test.csv', **read_opts)

In [None]:
# Discard, in place, every row that has a missing value in any column.
# axis=0 / how='any' are the dropna defaults, written out for the reader.
traindf.dropna(axis=0, how='any', inplace=True)
testDF.dropna(axis=0, how='any', inplace=True)

In [None]:
# Result slots, one per experiment: position i in `accuracies` and
# `trainTime` belongs to the experiment named in models[i].
# (NTD / TTD presumably abbreviate normalized / tokenized text data.)
models = ['NB with NTD', 'NB with TTD', 'SVM with NTD', 'SVM with TTD']

# Zero placeholders, overwritten as each experiment finishes
accuracies = [0] * len(models)
trainTime = [0] * len(models)

Naive Bayes with normalized text data
Accuracy = 0.6293984108967083

In [None]:
# Fit a TF-IDF + Multinomial Naive Bayes pipeline on the normalized text
# without stop words, and record how long the fit takes.
# perf_counter() is the clock intended for measuring elapsed time; time()
# follows the wall clock and can jump if the system clock is adjusted mid-run.
fit_start = perf_counter()

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(traindf['normalizeNoStop'], traindf['sentiment'])

# Elapsed fit time in seconds (the prediction cell prints it in ms)
train = perf_counter() - fit_start

# Slot 0 of the results is 'NB with NTD'
trainTime[0] = train

In [1]:
# Predict labels for the test split using the normalized text without stop words
predicted = model.predict(testDF['normalizeNoStop'])

# accuracy_score is declared as (y_true, y_pred): ground-truth labels go first
accuray = accuracy_score(testDF['sentiment'], predicted)
print(f'NB with normalized accuracy={accuray}')
# `train` is the fit duration in seconds, set by the training cell
print(f'time: {train * 1000} ms')

# Slot 0 of the results is 'NB with NTD'
accuracies[0] = accuray

NB with normalized accuracy=0.6293984108967083
time: 285.4340076446533 ms


Naive Bayes with tokenized text data
Accuracy = 0.6285471055618616

In [None]:
# Fit a TF-IDF + Multinomial Naive Bayes pipeline on the tokenized text,
# and record how long the fit takes.
# perf_counter() is the clock intended for measuring elapsed time; time()
# follows the wall clock and can jump if the system clock is adjusted mid-run.
fit_start = perf_counter()

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(traindf['tokenized'], traindf['sentiment'])

# Elapsed fit time in seconds (the prediction cell prints it in ms)
train = perf_counter() - fit_start

# Slot 1 of the results is 'NB with TTD'
trainTime[1] = train

In [2]:
# Predict labels for the test split using the tokenized text
predicted = model.predict(testDF['tokenized'])

# accuracy_score is declared as (y_true, y_pred): ground-truth labels go first
accuray = accuracy_score(testDF['sentiment'], predicted)
print(f'NB with tokenized accuracy={accuray}')
# `train` is the fit duration in seconds, set by the training cell
print(f'time: {train * 1000} ms')

# Slot 1 of the results is 'NB with TTD'
accuracies[1] = accuray

NB with tokenized accuracy=0.6285471055618616
time: 291.6879653930664 ms


SVM with normalized text data
Accuracy = 0.7017593643586834

In [None]:
# Fit a TF-IDF + SVM (SVC with its default settings) pipeline on the
# normalized text without stop words, and record how long the fit takes.
# perf_counter() is the clock intended for measuring elapsed time; time()
# follows the wall clock and can jump if the system clock is adjusted mid-run.
fit_start = perf_counter()

model = make_pipeline(TfidfVectorizer(), SVC())
model.fit(traindf['normalizeNoStop'], traindf['sentiment'])

# Elapsed fit time in seconds (the prediction cell prints it in ms)
train = perf_counter() - fit_start

# Slot 2 of the results is 'SVM with NTD'
trainTime[2] = train

In [3]:
# Predict labels for the test split using the normalized text without stop words
predicted = model.predict(testDF['normalizeNoStop'])

# accuracy_score is declared as (y_true, y_pred): ground-truth labels go first
accuray = accuracy_score(testDF['sentiment'], predicted)
print(f'SVM with normalized text data accuracy = {accuray}')
# `train` is the fit duration in seconds, set by the training cell
print(f'time: {train * 1000} ms')

# Slot 2 of the results is 'SVM with NTD'
accuracies[2] = accuray

SVM with normalized text data accuracy = 0.7017593643586834
time: 130257.7600479126 ms


SVM with tokenized text data
Accuracy = 0.7023269012485811

In [None]:
# Fit a TF-IDF + SVM (SVC with its default settings) pipeline on the
# tokenized text, and record how long the fit takes.
# perf_counter() is the clock intended for measuring elapsed time; time()
# follows the wall clock and can jump if the system clock is adjusted mid-run.
fit_start = perf_counter()

model = make_pipeline(TfidfVectorizer(), SVC())
model.fit(traindf['tokenized'], traindf['sentiment'])

# Elapsed fit time in seconds (the prediction cell prints it in ms)
train = perf_counter() - fit_start

# Slot 3 of the results is 'SVM with TTD'
trainTime[3] = train

In [4]:
# Predict labels for the test split using the tokenized text
predicted = model.predict(testDF['tokenized'])

# accuracy_score is declared as (y_true, y_pred): ground-truth labels go first
accuray = accuracy_score(testDF['sentiment'], predicted)
print(f'SVM with tokenized text data = {accuray}')
# `train` is the fit duration in seconds, set by the training cell
print(f'time: {train * 1000} ms')

# Slot 3 of the results is 'SVM with TTD'
accuracies[3] = accuray

SVM with tokenized text data = 0.7023269012485811
time: 135365.88191986084 ms
