<a href="https://colab.research.google.com/github/mchanwa/COS424/blob/main/COS424_HW1Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing Code for Twitter Dataset

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
!pip install emoji

import pandas as pd
import numpy as np
import regex as re
import string
import nltk
import emoji

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english')) - {'all'}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [19]:
# Gets the part of speech tag of word for lemmatization
# This function is based on code from:
#   https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Preprocesses the tweets text
# This function is based on code from:
#   https://www.pluralsight.com/guides/building-a-twitter-sentiment-analysis-in-python
def preprocess_text(tweet):
    # Changes emojis to words
    tweet = emoji.demojize(tweet,  delimiters=(' ', ' '))
    # Removes 'RT' from tweet
    tweet = re.sub(r'RT[\s]+', '', tweet)
    # Removes capitalization
    tweet = tweet.lower()
    # Removes urls & user mentions from tweet
    tweet = re.sub(r"http\S+|www\S+|https\S+|\@\w+", ' ', tweet, flags=re.MULTILINE)
    # Removes punctuation
    tweet = re.sub(r'\p{P}+', '', tweet)
    # Removes stopwords
    tokens = [w for w in word_tokenize(tweet) if not w in stop_words]
    # Perfoms lemmatization on tokens
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens]
    return " ".join(lemma_words)

# Preprocesses the text of the Tweets in the df and returns the df
# By default, this removes the Tweets with the "neither" label
def preprocess_df(df, remove_neither=True):
  idx = "text"
  length = len(df[idx])
  for ii in range(length):
    tweet = str(df[idx][ii])
    df.loc[ii, idx] = preprocess_text(tweet)
  if (remove_neither):
    return df[df['BLM'] != "neither"]
  else:
    return df

In [20]:
# Retrieves and preprocesses the training dataset
path = "/content/drive/MyDrive/Colab Notebooks/train.csv" # Path to train.csv
train_df = pd.read_csv(path)
train_df.fillna("", inplace=True) # fills any NaN values with empty strings
train_df = preprocess_df(train_df)
train_df.head(5)

Unnamed: 0,created_at,hashtags,text,BLM
0,2013-08-05,BlackLivesMatter BrownLivesMatter Every28Hours...,let talk state violence youth color blacklives...,positive
2,2013-08-30,blacklivesmatter,mt show kid positive image black people build ...,positive
3,2013-08-30,BlackLivesMatter BBW13,q1 big parent influence blacklivesmatter bbw13,positive
4,2013-08-30,BlackLivesMatter,a10 breastfeeding life love crucial beautiful ...,positive
5,2013-08-30,BlackLivesMatter BBW13,new people jumping cohosting blacklivesmatter ...,positive


In [21]:
# Uses a CountVectorizer to construct bag-of-words matrix
vectorizer = CountVectorizer() # Add a comment about the max_features & ngram_range parameters
# train_vocab is an 2d array of the vocab from the training dataset 
train_vocab = vectorizer.fit_transform(train_df['text']).toarray()
# train_vocab_df is a dataframe where the element ij is the number of times word j occurred in Tweet i
train_vocab_df = pd.DataFrame(train_vocab, columns=vectorizer.get_feature_names())
train_labels = train_df['BLM']
print(train_labels)

0       positive
2       positive
3       positive
4       positive
5       positive
          ...   
7227    positive
7228    positive
7229    positive
7230    positive
7231    positive
Name: BLM, Length: 6747, dtype: object


In [22]:
train_vocab_df.head()

Unnamed: 0,02125,03,07,10,100,1000,10000,100000,10003,1000th,100letterstomaketheworldbetter,100plus,100reasons,100yr,101,1010,101015,1024,1029,1030am,1032,10456,1047fm,105,109milelong,10mostfascinatingpeople,10pm,10th,10x,10yr,11,110,1100,110000,1110c,113014,1130am,117,11915,11am,...,yourewelcome,yourlifematters,yourseves,youth,youthdevelopment,youthisthetruth,youthviolence,youtube,youve,yoyo,yr,ystrday,yu,yung,yup,yuvette,zachary,zacharyhammond,zacharyhammonds,zapiro,zaria,zemirbegic,zero,zi,zilphia,zimbabwe,zimmerman,zind,zinnbookfest,zion,zionist,zipper,ziptied,zombie,zone,zone17,zubat,zurbanotorres,zwarte,zwartepiet
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
# Retrieves and preprocesses the test dataset
path = "/content/drive/MyDrive/Colab Notebooks/test.csv" # Path to Test_dataset.csv
test_df = pd.read_csv(path)
test_df.fillna("", inplace=True) # fills any NaN values with empty strings
test_df = preprocess_df(test_df)

In [24]:
# Uses the vocab from the training dataset to vectorize the test dataset
test_vocab = vectorizer.transform(test_df['text']).toarray()
# test_vocab_df is a dataframe where the element ij is the number of times word j
# occurred in Tweet i
test_vocab_df = pd.DataFrame(test_vocab, columns=vectorizer.get_feature_names())
test_labels = test_df['BLM']

In [25]:
print(f"Number of neither Tweets in training: {len(train_df[train_df['BLM'] == 'neither'])}")
print(f"Number of positive Tweets in training: {len(train_df[train_df['BLM'] == 'positive'])}")
print(f"Number of negative Tweets in training: {len(train_df[train_df['BLM'] == 'negative'])}")

Number of neither Tweets in training: 0
Number of positive Tweets in training: 5528
Number of negative Tweets in training: 1219


In [26]:
print(f"Number of neither Tweets in test: {len(test_df[test_df['BLM'] == 'neither'])}")
print(f"Number of positive Tweets in test: {len(test_df[test_df['BLM'] == 'positive'])}")
print(f"Number of negative Tweets in test: {len(test_df[test_df['BLM'] == 'negative'])}")

Number of neither Tweets in test: 0
Number of positive Tweets in test: 1383
Number of negative Tweets in test: 305


# Classification

In [27]:
train_labels = train_labels.replace('positive', 1)
train_labels = train_labels.replace('negative', 0)

test_labels = test_labels.replace('positive', 1)
test_labels = test_labels.replace('negative', 0)

train_labels_arr = np.array(train_labels).reshape(len(train_labels),)
test_labels_arr = np.array(test_labels).reshape(len(test_labels),)

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate

In [29]:
import statistics
import time

def cross_validation(classifier, numfolds):
  cv_results_accuracy = \
    cross_validate(classifier, train_vocab_df, train_labels_arr, cv=numfolds, scoring='accuracy')
  cv_results_recall = \
    cross_validate(classifier, train_vocab_df, train_labels_arr, cv=numfolds, scoring='recall')
  cv_results_f1 = \
    cross_validate(classifier, train_vocab_df, train_labels_arr, cv=numfolds, scoring='f1')
  cv_results_precision = \
    cross_validate(classifier, train_vocab_df, train_labels_arr, cv=numfolds, scoring='precision')

  print("accuracy cv: ", statistics.mean(cv_results_accuracy['test_score']))
  print("precision cv: ", statistics.mean(cv_results_precision['test_score']))
  print("recall cv: ", statistics.mean(cv_results_recall['test_score']))
  print("f1 cv: ", statistics.mean(cv_results_f1['test_score']))

In [30]:
# GaussianNB
gnb = GaussianNB()

start = time.time()
prediction = gnb.fit(train_vocab_df, train_labels_arr).predict(test_vocab_df)
end = time.time()

print("fit and predict time (GaussianNB): " + str(end-start))

print("Accuracy score Gaussian NB: ", gnb.score(test_vocab_df, test_labels_arr))
print("Precision Score: ", precision_score(test_labels_arr, prediction))
print("F1 Score: ", f1_score(test_labels_arr, prediction))
print("Recall Score: ", recall_score(test_labels_arr, prediction))

print("5-fold")
cross_validation(gnb, 5)
print("10-fold")
cross_validation(gnb, 10)

fit and predict time (GaussianNB): 1.2691256999969482
Accuracy score Gaussian NB:  0.6741706161137441
Precision Score:  0.8748874887488749
F1 Score:  0.7794707297514033
Recall Score:  0.702819956616052
5-fold
accuracy cv:  0.6678446036844851
precision cv:  0.8662981229635603
recall cv:  0.703136982154108
f1 cv:  0.7756907434879029
10-fold
accuracy cv:  0.6727354654357621
precision cv:  0.8719077947958584
recall cv:  0.7040418533951831
f1 cv:  0.7785535542837063


In [31]:
# MultinomialNB
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()

start = time.time()
prediction = mnb.fit(train_vocab_df, train_labels_arr).predict(test_vocab_df)
end = time.time()

print("fit and predict time (MultinomialNB): " + str(end-start))

print("Accuracy score Multinomial NB: ", mnb.score(test_vocab_df, test_labels_arr))
print("Precision Score: ", precision_score(test_labels_arr, prediction))
print("F1 Score: ", f1_score(test_labels_arr, prediction))
print("Recall Score: ", recall_score(test_labels_arr, prediction))

print("5-fold")
cross_validation(mnb, 5)
print("10-fold")
cross_validation(mnb, 10)

fit and predict time (MultinomialNB): 0.5606226921081543
Accuracy score Multinomial NB:  0.840047393364929
Precision Score:  0.8560460652591171
F1 Score:  0.9083503054989818
Recall Score:  0.9674620390455532
5-fold
accuracy cv:  0.795460450814046
precision cv:  0.8551301017782915
recall cv:  0.9039336240825444
f1 cv:  0.8784435151661955
10-fold
accuracy cv:  0.797088251456204
precision cv:  0.8594814112893662
recall cv:  0.8997697014964489
f1 cv:  0.8788065120712019


In [34]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

start = time.time()
lrclf = LogisticRegression().fit(train_vocab_df, train_labels_arr)
prediction = lrclf.predict(test_vocab_df)
end = time.time()

print("fit and predict time (Logistic Regression): " + str(end-start))

print("Accuracy score Logistic Regression: ", lrclf.score(test_vocab_df, test_labels_arr))
print("Precision Score: ", precision_score(test_labels_arr, prediction))
print("F1 Score: ", f1_score(test_labels_arr, prediction))
print("Recall Score: ", recall_score(test_labels_arr, prediction))

print("5-fold")
cross_validation(lrclf, 5)
print("10-fold")
cross_validation(lrclf, 10)

fit and predict time (Logistic Regression): 13.235511541366577
Accuracy score Logistic Regression:  0.840047393364929
Precision Score:  0.8668424522083059
F1 Score:  0.906896551724138
Recall Score:  0.9508315256688359
5-fold
accuracy cv:  0.8259900612250501
precision cv:  0.8582722686789126
recall cv:  0.9435482313665485
f1 cv:  0.8986518521252143
10-fold
accuracy cv:  0.8286565556654577
precision cv:  0.8595245877065946
recall cv:  0.945539481615431
f1 cv:  0.9003336350895486


In [35]:
# MLP with logistic activiation
from sklearn.neural_network import MLPClassifier

mlp_logistic = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)

start = time.time()
mlp_logistic.fit(train_vocab_df, train_labels_arr)
MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1, solver='lbfgs', activation='logistic')
prediction = mlp_logistic.predict(test_vocab_df)
end = time.time()

print("fit and predict time (MLP with logistic activiation): " + str(end-start))

print("Accuracy score MLP: ", mlp_logistic.score(test_vocab_df, test_labels_arr))
print("Precision Score: ", precision_score(test_labels_arr, prediction))
print("F1 Score: ", f1_score(test_labels_arr, prediction))
print("Recall Score: ", recall_score(test_labels_arr, prediction))

print("5-fold")
cross_validation(mlp_logistic, 5)
print("10-fold")
cross_validation(mlp_logistic, 10)

fit and predict time (MLP with logistic activiation): 6.605457067489624
Accuracy score MLP:  0.8193127962085308
Precision Score:  0.8193127962085308
F1 Score:  0.900683816346467
Recall Score:  1.0
5-fold
accuracy cv:  0.8193271284627845
precision cv:  0.8193271284627845
recall cv:  1.0
f1 cv:  0.900692451166004
10-fold
accuracy cv:  0.8193271788108583
precision cv:  0.8193271788108583
recall cv:  1.0
f1 cv:  0.900692454648104


In [36]:
# MLP with relu activation
from sklearn.neural_network import MLPClassifier

mlp_relu = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)

start = time.time()
mlp_relu.fit(train_vocab_df, train_labels_arr)
MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), random_state=1, solver='lbfgs', activation='relu')
prediction = mlp_relu.predict(test_vocab_df)
end = time.time()

print("fit and predict time (MLP with relu activation): " + str(end-start))

print("Accuracy score MLP: ", mlp_relu.score(test_vocab_df, test_labels_arr))
print("Precision Score: ", precision_score(test_labels_arr, prediction))
print("F1 Score: ", f1_score(test_labels_arr, prediction))
print("Recall Score: ", recall_score(test_labels_arr, prediction))

print("5-fold")
cross_validation(mlp_relu, 5)
print("10-fold")
cross_validation(mlp_relu, 10)

fit and predict time (MLP with relu activation): 6.506883382797241
Accuracy score MLP:  0.8193127962085308
Precision Score:  0.8193127962085308
F1 Score:  0.900683816346467
Recall Score:  1.0
5-fold
accuracy cv:  0.8193271284627845
precision cv:  0.8193271284627845
recall cv:  1.0
f1 cv:  0.900692451166004
10-fold
accuracy cv:  0.8193271788108583
precision cv:  0.8193271788108583
recall cv:  1.0
f1 cv:  0.900692454648104


In [None]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

dtclf = DecisionTreeClassifier(random_state=0)

start = time.time()
prediction = dtclf.fit(train_vocab_df, train_labels_arr).predict(test_vocab_df)
end = time.time()

print("fit and predict time (DecisionTree): " + str(end-start))

print("Accuracy score DecisionTreeClassifier: ", mlp_relu.score(test_vocab_df, test_labels_arr))
print("Precision Score: ", precision_score(test_labels_arr, prediction))
print("F1 Score: ", f1_score(test_labels_arr, prediction))
print("Recall Score: ", recall_score(test_labels_arr, prediction))

print("5-fold")
cross_validation(dtclf, 5)
print("10-fold")
cross_validation(dtclf, 10)

fit and predict time (DecisionTree): 147.91907000541687
Accuracy score DecisionTreeClassifier:  0.8193127962085308
Precision Score:  0.8806290207290922
F1 Score:  0.8856937455068297
Recall Score:  0.8908170643528561
5-fold
