In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Importing the test database
twitter_df = pd.read_csv("../res/final_training_dataset.csv")
twitter_df

Unnamed: 0.1,Unnamed: 0,tweet_id,full_text,sentiment
0,0,1.587817e+18,@twk_5 @davidhogg111 Good question. The guns a...,anti-gun
1,1,1.587817e+18,@NikaOneDay @thegreatunkn @obiwill_kenobi @Tul...,anti-gun
2,2,1.587817e+18,Just…read this. \nhttps://t.co/TfKqT2nNZI\n\n@...,anti-gun
3,3,1.587817e+18,i just won’t be celebrating gun violence killi...,anti-gun
4,4,1.587817e+18,mixed feelings about someone who is a r4pist g...,anti-gun
...,...,...,...,...
995,995,1.590000e+18,"@cbssaturday I am Dr. Floyd Jones, https://t.c...",neutral
996,996,1.590000e+18,Future artist Tray Tray video shoot shot up in...,neutral
997,997,1.590000e+18,Manhunt suspect in quadruple Aurora shooting t...,neutral
998,998,1.590000e+18,"@LogicIsLeaving @phike9391 @TMZ no, i don‚Äôt....",neutral


In [5]:
# Function to clean the database
def preprocess_tweet(tweet):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    
    tweet = tweet.lower()

    # Remove RT
    sentence = re.sub('RT @\w+: '," ", tweet)

    # Remove special characters
    tweet = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet)

    # Single character removal
    tweet = re.sub(r"\s+[a-zA-Z]\s+", ' ', tweet)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove URL's
    tweet = re.sub('((www.[^s]+)|(https?://[^s]+))',' ',tweet)
    
    #Replace 2a|2nd amendment to second amendment
    tweet = re.sub("2a|2nd\samendment|2nd|secondamendment", 'second amendment', tweet)
    
    # Remove numbers
    tweet = re.sub('[0-9]+', '', tweet)
    
    return tweet

In [6]:
# Save cleaned tweets in new cleaned column
cleaned_tweets = []

for tweet in twitter_df['full_text']:
  cleaned_tweet = preprocess_tweet(tweet)
  cleaned_tweets.append(cleaned_tweet)

twitter_df['cleaned'] = pd.DataFrame(cleaned_tweets)
twitter_df.head(10)

Unnamed: 0.1,Unnamed: 0,tweet_id,full_text,sentiment,cleaned
0,0,1.587817e+18,@twk_5 @davidhogg111 Good question. The guns a...,anti-gun,good question the guns and rifles you used w...
1,1,1.587817e+18,@NikaOneDay @thegreatunkn @obiwill_kenobi @Tul...,anti-gun,kenobi personally d rather have time machine ...
2,2,1.587817e+18,Just…read this. \nhttps://t.co/TfKqT2nNZI\n\n@...,anti-gun,just read this on the murder of isabella thall...
3,3,1.587817e+18,i just won’t be celebrating gun violence killi...,anti-gun,i just won be celebrating gun violence killing...
4,4,1.587817e+18,mixed feelings about someone who is a r4pist g...,anti-gun,mixed feelings about someone who is rpist gett...
5,5,1.587817e+18,@MuricanTechy @kellydoddchula @MisogynyManaged...,anti-gun,stakkz in fact we are but until we finish how...
6,6,1.587817e+18,@TomCottonAR Are you suggesting more guns like...,anti-gun,are you suggesting more guns like your collea...
7,7,1.587817e+18,@GhostofTST Disagreed! You can have sensible g...,anti-gun,disagreed you can have sensible gun laws or y...
8,8,1.587817e+18,@AbbottCampaign @GregAbbott_TX Right. Instead...,anti-gun,tx right instead tx will let you freeze keep ...
9,9,1.587817e+18,"Rest in Power, Takeoff. \n\nSo sad to see anot...",anti-gun,rest in power takeoff so sad to see another vi...


In [7]:
# Drop column text
twitter_df = twitter_df.drop(['Unnamed: 0','tweet_id', 'full_text'], axis=1)
twitter_df

Unnamed: 0,sentiment,cleaned
0,anti-gun,good question the guns and rifles you used w...
1,anti-gun,kenobi personally d rather have time machine ...
2,anti-gun,just read this on the murder of isabella thall...
3,anti-gun,i just won be celebrating gun violence killing...
4,anti-gun,mixed feelings about someone who is rpist gett...
...,...,...
995,neutral,am dr floyd jones saw chad lawson on the satu...
996,neutral,future artist tray tray video shoot shot up in...
997,neutral,manhunt suspect in quadruple aurora shooting t...
998,neutral,no don i wish could stop all gun violence tbh...


In [8]:
# Remove stopwords
import nltk
nltk.download('stopwords')
stopwordlist = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/keertichaudhary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Cleaning and removing the above stop words list from the tweet text
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda text: cleaning_stopwords(text))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,good question guns rifles used assault weapons...
1,anti-gun,kenobi personally rather time machine could go...
2,anti-gun,read murder isabella thallas denver mundanity ...
3,anti-gun,celebrating gun violence killing black folks e...
4,anti-gun,mixed feelings someone rpist getting murdered ...


In [10]:
# Getting tokenization of tweet text
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")
twitter_df['cleaned'] = twitter_df['cleaned'].apply(tokenizer.tokenize)
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[celebrating, gun, violence, killing, black, f..."
4,anti-gun,"[mixed, feelings, someone, rpist, getting, mur..."


In [11]:
# Applying Stemming
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: stemming_on_text(x))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[celebrating, gun, violence, killing, black, f..."
4,anti-gun,"[mixed, feelings, someone, rpist, getting, mur..."


In [12]:
# Applying Lemmatizer
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: lemmatizer_on_text(x))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[celebrating, gun, violence, killing, black, f..."
4,anti-gun,"[mixed, feelings, someone, rpist, getting, mur..."


In [13]:
# Removing words with less frequency
# filter function to select only the words with more than 10 counts and less than 800.
import itertools
flat_list = list(itertools.chain.from_iterable(twitter_df['cleaned']))

fd = nltk.FreqDist(flat_list)
word_to_keep = list(filter(lambda x: 800>x[1]>10, fd.items()))

word_list_to_keep= [item[0] for item in word_to_keep]

def remove_lessfreq(tokanized_tweets):
    text_out = [word for word in tokanized_tweets if word in word_list_to_keep]
    return text_out

In [14]:
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: remove_lessfreq(x))
twitter_df

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, guns, rifles, used, assault, weapons, p..."
1,anti-gun,"[time, could, go, prevent, guns, first, place,..."
2,anti-gun,"[read, murder, gun, violence, us]"
3,anti-gun,"[gun, violence, killing, black]"
4,anti-gun,"[someone, getting, murdered, people, gun, viol..."
...,...,...
995,neutral,"[gun, violence, community, come]"
996,neutral,"[shot, chicago, shooting]"
997,neutral,"[shooting, say]"
998,neutral,"[could, stop, gun, violence, think, know, act,..."


In [15]:
# Separating input feature and label
X=twitter_df.cleaned
y=twitter_df.sentiment

In [16]:
# Splitting our dataset into Train and Test Subset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [17]:
# Transforming Dataset using TF-IDF Vectorizer
# “min_df = 5” parameter means ignore terms that appear in fewer than five documents.
vectorizer = TfidfVectorizer(min_df = 5, ngram_range=(1,4))  #max_df = 90. ignores terms that appear in more than 90% of the documents
vectorizer.fit_transform(X_train.apply(lambda x: ' '.join(x)))

<750x393 sparse matrix of type '<class 'numpy.float64'>'
	with 6974 stored elements in Compressed Sparse Row format>

In [18]:
X_train = vectorizer.transform(X_train.apply(lambda x: ' '.join(x)))
X_test  = vectorizer.transform(X_test.apply(lambda x: ' '.join(x)))

# ML Models to consider for twitter sentiment analysis
## 1. Balanced Random Forest Classifier

In [19]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=25, random_state=1)
brf_model.fit(X_train, y_train)
y_pred = brf_model.predict(X_test)

In [20]:
pd.DataFrame({"Predictions": y_pred, "Actual": y_test})

Unnamed: 0,Predictions,Actual
507,anti-gun,anti-gun
818,anti-gun,anti-gun
452,anti-gun,neutral
368,pro-gun,pro-gun
242,anti-gun,neutral
...,...,...
385,pro-gun,pro-gun
890,pro-gun,pro-gun
439,pro-gun,pro-gun
135,pro-gun,pro-gun


In [21]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score, classification_report
balanced_accuracy_score(y_test, y_pred)

0.6244444444444445

In [22]:
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.64      0.54      0.58       100
     neutral       0.51      0.70      0.59        60
     pro-gun       0.69      0.63      0.66        90

    accuracy                           0.61       250
   macro avg       0.61      0.62      0.61       250
weighted avg       0.62      0.61      0.61       250



## 2. Easy Ensemble Classifier

In [23]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec_model = EasyEnsembleClassifier(n_estimators =50, random_state=1)
eec_model.fit(X_train, y_train)
y_pred2 = eec_model.predict(X_test)

In [25]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred2)

0.5862962962962963

In [26]:
print("Classification Report")
print(classification_report(y_test, y_pred2))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.70      0.52      0.60       100
     neutral       0.48      0.55      0.51        60
     pro-gun       0.58      0.69      0.63        90

    accuracy                           0.59       250
   macro avg       0.59      0.59      0.58       250
weighted avg       0.60      0.59      0.59       250



## 3. Bernoulli Naive Bayes

In [27]:
BNBmodel = BernoulliNB()
BNBmodel.fit(X_train, y_train)
y_pred3 = BNBmodel.predict(X_test)

In [29]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred3)

0.5896296296296296

In [30]:
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.58      0.63      0.61       100
     neutral       0.58      0.52      0.55        60
     pro-gun       0.63      0.62      0.63        90

    accuracy                           0.60       250
   macro avg       0.60      0.59      0.59       250
weighted avg       0.60      0.60      0.60       250



## 4. Multinomial NB

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
cnb_classifier = MultinomialNB()

# Fit the classifier to the training data
cnb_classifier.fit(X_train, y_train)
y_pred4 = cnb_classifier.predict(X_test)

In [34]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred4)

0.5914814814814815

In [35]:
print("Classification Report")
print(classification_report(y_test, y_pred4))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.60      0.63      0.61       100
     neutral       0.80      0.40      0.53        60
     pro-gun       0.58      0.74      0.65        90

    accuracy                           0.62       250
   macro avg       0.66      0.59      0.60       250
weighted avg       0.64      0.62      0.61       250

