In [1]:
# Import dependencies
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

In [3]:
# Importing the test database
twitter_df = pd.read_csv("../res/final_training_dataset.csv")
twitter_df

Unnamed: 0.1,Unnamed: 0,tweet_id,full_text,sentiment
0,0,1.587817e+18,@twk_5 @davidhogg111 Good question. The guns a...,anti-gun
1,1,1.587817e+18,@NikaOneDay @thegreatunkn @obiwill_kenobi @Tul...,anti-gun
2,2,1.587817e+18,Just…read this. \nhttps://t.co/TfKqT2nNZI\n\n@...,anti-gun
3,3,1.587817e+18,i just won’t be celebrating gun violence killi...,anti-gun
4,4,1.587817e+18,mixed feelings about someone who is a r4pist g...,anti-gun
...,...,...,...,...
995,995,1.590000e+18,"@cbssaturday I am Dr. Floyd Jones, https://t.c...",neutral
996,996,1.590000e+18,Future artist Tray Tray video shoot shot up in...,neutral
997,997,1.590000e+18,Manhunt suspect in quadruple Aurora shooting t...,neutral
998,998,1.590000e+18,"@LogicIsLeaving @phike9391 @TMZ no, i don‚Äôt....",neutral


In [4]:
# Function to clean the database
def preprocess_tweet(tweet):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    
    tweet = tweet.lower()

    # Remove special characters
    tweet = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet)
    
    # Remove repeating character
    tweet = re.sub(r'(.)1+', r'1', tweet)

    # Single character removal
    tweet = re.sub(r"\s+[a-zA-Z]\s+", ' ', tweet)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    tweet = re.sub(r'\s+', ' ', tweet)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove URL's
    tweet = re.sub('((www.[^s]+)|(https?://[^s]+))',' ',tweet)
    
    #Replace 2a|2nd amendment to second amendment
    tweet = re.sub("2a|2nd\samendment|2nd", 'second amendment', tweet)
    
    # Remove numbers
    tweet = re.sub('[0-9]+', '', tweet)
    
    return tweet

In [5]:
# Save cleaned tweets in new cleaned column
cleaned_tweets = []

for tweet in twitter_df['full_text']:
  cleaned_tweet = preprocess_tweet(tweet)
  cleaned_tweets.append(cleaned_tweet)

twitter_df['cleaned'] = pd.DataFrame(cleaned_tweets)
twitter_df.head(10)

Unnamed: 0.1,Unnamed: 0,tweet_id,full_text,sentiment,cleaned
0,0,1.587817e+18,@twk_5 @davidhogg111 Good question. The guns a...,anti-gun,good question the guns and rifles you used w...
1,1,1.587817e+18,@NikaOneDay @thegreatunkn @obiwill_kenobi @Tul...,anti-gun,kenobi personally d rather have time machine ...
2,2,1.587817e+18,Just…read this. \nhttps://t.co/TfKqT2nNZI\n\n@...,anti-gun,just read this on the murder of isabella thall...
3,3,1.587817e+18,i just won’t be celebrating gun violence killi...,anti-gun,i just won be celebrating gun violence killing...
4,4,1.587817e+18,mixed feelings about someone who is a r4pist g...,anti-gun,mixed feelings about someone who is rpist gett...
5,5,1.587817e+18,@MuricanTechy @kellydoddchula @MisogynyManaged...,anti-gun,stakkz in fact we are but until we finish how...
6,6,1.587817e+18,@TomCottonAR Are you suggesting more guns like...,anti-gun,are you suggesting more guns like your collea...
7,7,1.587817e+18,@GhostofTST Disagreed! You can have sensible g...,anti-gun,disagreed you can have sensible gun laws or y...
8,8,1.587817e+18,@AbbottCampaign @GregAbbott_TX Right. Instead...,anti-gun,tx right instead tx will let you freeze keep ...
9,9,1.587817e+18,"Rest in Power, Takeoff. \n\nSo sad to see anot...",anti-gun,rest in power takeoff so sad to see another vi...


In [6]:
# Drop column text
twitter_df = twitter_df.drop(['Unnamed: 0','tweet_id', 'full_text'], axis=1)
twitter_df

Unnamed: 0,sentiment,cleaned
0,anti-gun,good question the guns and rifles you used w...
1,anti-gun,kenobi personally d rather have time machine ...
2,anti-gun,just read this on the murder of isabella thall...
3,anti-gun,i just won be celebrating gun violence killing...
4,anti-gun,mixed feelings about someone who is rpist gett...
...,...,...
995,neutral,am dr floyd jones saw chad lawson on the satu...
996,neutral,future artist tray tray video shoot shot up in...
997,neutral,manhunt suspect in quadruple aurora shooting t...
998,neutral,no don i wish could stop all gun violence tbh...


In [7]:
neutral_df = twitter_df[(twitter_df['sentiment'] == 'neutral')].index
neutral_df

Int64Index([200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
            ...
            990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
           dtype='int64', length=200)

In [8]:
twitter_df.drop(neutral_df, inplace=True)
twitter_df

Unnamed: 0,sentiment,cleaned
0,anti-gun,good question the guns and rifles you used w...
1,anti-gun,kenobi personally d rather have time machine ...
2,anti-gun,just read this on the murder of isabella thall...
3,anti-gun,i just won be celebrating gun violence killing...
4,anti-gun,mixed feelings about someone who is rpist gett...
...,...,...
945,pro-gun,followed all rules of gun safety was in liter...
946,pro-gun,the right to bear arms is actually in the con...
947,pro-gun,debunking joe biden anti gun conversation with...
948,pro-gun,madison argued that state militias would be a...


In [9]:
# Remove stopwords
import nltk
nltk.download('stopwords')
stopwordlist = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/keertichaudhary/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Cleaning and removing the above stop words list from the tweet text
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda text: cleaning_stopwords(text))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,good question guns rifles used assault weapons...
1,anti-gun,kenobi personally rather time machine could go...
2,anti-gun,read murder isabella thallas denver mundanity ...
3,anti-gun,celebrating gun violence killing black folks e...
4,anti-gun,mixed feelings someone rpist getting murdered ...


In [11]:
# Getting tokenization of tweet text
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")
twitter_df['cleaned'] = twitter_df['cleaned'].apply(tokenizer.tokenize)
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[celebrating, gun, violence, killing, black, f..."
4,anti-gun,"[mixed, feelings, someone, rpist, getting, mur..."


In [12]:
# Applying Stemming
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: stemming_on_text(x))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[celebrating, gun, violence, killing, black, f..."
4,anti-gun,"[mixed, feelings, someone, rpist, getting, mur..."


In [13]:
# Applying Lemmatizer
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: lemmatizer_on_text(x))
twitter_df.head()

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, question, guns, rifles, used, assault, ..."
1,anti-gun,"[kenobi, personally, rather, time, machine, co..."
2,anti-gun,"[read, murder, isabella, thallas, denver, mund..."
3,anti-gun,"[celebrating, gun, violence, killing, black, f..."
4,anti-gun,"[mixed, feelings, someone, rpist, getting, mur..."


In [14]:
# Removing words with less frequency
# filter function to select only the words with more than 10 counts and less than 800.
import itertools
flat_list = list(itertools.chain.from_iterable(twitter_df['cleaned']))

fd = nltk.FreqDist(flat_list)
word_to_keep = list(filter(lambda x: 800>x[1]>10, fd.items()))

word_list_to_keep= [item[0] for item in word_to_keep]

def remove_lessfreq(tokanized_tweets):
    text_out = [word for word in tokanized_tweets if word in word_list_to_keep]
    return text_out

In [15]:
twitter_df['cleaned'] = twitter_df['cleaned'].apply(lambda x: remove_lessfreq(x))
twitter_df

Unnamed: 0,sentiment,cleaned
0,anti-gun,"[good, guns, rifles, used, assault, weapons, p..."
1,anti-gun,"[time, could, go, prevent, guns, first, place,..."
2,anti-gun,"[murder, gun, violence, us]"
3,anti-gun,"[gun, violence, killing, black]"
4,anti-gun,"[someone, murdered, people, gun, violence, man..."
...,...,...
945,pro-gun,"[gun, safety]"
946,pro-gun,"[right, bear, arms, actually, constitution, se..."
947,pro-gun,"[joe, anti, gun, young, gun, control, joe, gun..."
948,pro-gun,"[state, would, second, amendment, sure, nothing]"


In [16]:
# Separating input feature and label
X=twitter_df.cleaned
y=twitter_df.sentiment

In [17]:
# Splitting our dataset into Train and Test Subset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [18]:
# Transforming Dataset using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(min_df = 5, ngram_range=(1,4))
vectorizer.fit_transform(X_train.apply(lambda x: ' '.join(x)))


# # Cleaning data in single line through passing clean_text in the CountVectorizer
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer(min_df = 5, ngram_range=(1,4)) 
# countVector = vectorizer.fit_transform(X_train.apply(lambda x: ' '.join(x)))
# print(countVector.shape)

<600x332 sparse matrix of type '<class 'numpy.float64'>'
	with 5771 stored elements in Compressed Sparse Row format>

In [19]:
X_train = vectorizer.transform(X_train.apply(lambda x: ' '.join(x)))
X_test  = vectorizer.transform(X_test.apply(lambda x: ' '.join(x)))

## 1. Balanced Random Forest Classifier

In [20]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators =130, random_state=1)
brf_model.fit(X_train, y_train)
y_pred = brf_model.predict(X_test)

In [21]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.7443473017787157

In [22]:
from sklearn.metrics import balanced_accuracy_score, classification_report
print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.69      0.81      0.74        93
     pro-gun       0.80      0.68      0.74       107

    accuracy                           0.74       200
   macro avg       0.75      0.74      0.74       200
weighted avg       0.75      0.74      0.74       200



## 2. Easy Ensemble Classifier

In [23]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec_model = EasyEnsembleClassifier(n_estimators =50, random_state=1)
eec_model.fit(X_train, y_train)
y_pred2 = eec_model.predict(X_test)

In [24]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred2)

0.6859109637222389

In [25]:
from sklearn.metrics import balanced_accuracy_score, classification_report
print("Classification Report")
print(classification_report(y_test, y_pred2))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.65      0.70      0.67        93
     pro-gun       0.72      0.67      0.70       107

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.68       200
weighted avg       0.69      0.69      0.69       200



## 3. Bernoulli Naive Bayes

In [26]:
BNBmodel = BernoulliNB()
BNBmodel.fit(X_train, y_train)
y_pred3 = BNBmodel.predict(X_test)

In [27]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred3)

0.7342980604964324

In [28]:
from sklearn.metrics import balanced_accuracy_score, classification_report
print("Classification Report")
print(classification_report(y_test, y_pred3))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.68      0.80      0.73        93
     pro-gun       0.79      0.67      0.73       107

    accuracy                           0.73       200
   macro avg       0.74      0.73      0.73       200
weighted avg       0.74      0.73      0.73       200



## 5. Multinomial Naive Bayes

In [29]:
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
cnb_classifier = MultinomialNB()

# Fit the classifier to the training data
cnb_classifier.fit(X_train, y_train)
y_pred5 = cnb_classifier.predict(X_test)

In [30]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred5)

0.6987739925635614

In [31]:
from sklearn.metrics import balanced_accuracy_score, classification_report
print("Classification Report")
print(classification_report(y_test, y_pred5))

Classification Report
              precision    recall  f1-score   support

    anti-gun       0.65      0.75      0.70        93
     pro-gun       0.75      0.64      0.69       107

    accuracy                           0.69       200
   macro avg       0.70      0.70      0.69       200
weighted avg       0.70      0.69      0.69       200

