# Roseanne Tweets Anlaysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

## Read the csv file

In [2]:
df = pd.read_csv(filepath_or_buffer = "finalDataV1.csv")

In [3]:
df

Unnamed: 0,Code,Tweet
0,Neither,RoseanneBarr
1,Neither,@iAmLoraG @therealroseanne that's my ted talk....
2,Anti-Roseanne,@axios @therealroseanne @realDonaldTrump one r...
3,Anti-Roseanne,@therealroseanne Bye!
4,Anti-Roseanne,@JustinUtley @Stefanoooch @cnnbrk Roseanne del...
5,Pro-Roseanne,.@therealroseanne they were looking for a reas...
6,Anti-Roseanne,@POTUS_Don45 @Melissa89431446 @therealroseanne...
7,Anti-Roseanne,Yayyyyy! Buh-bye @therealroseanne
8,Neither,Robert Iger on the cancellation of 'Roseanne' ...
9,Neither,@ryan90_chris @FoxNews @KevinCate @dailybriefi...


## Import stop words

In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mrinal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Remove punctuation, special characters and stop words from the tweets

In [5]:
corpus = []

for i in range(0, 5876):
    tweets = re.sub('[^a-zA-Z]', ' ', df['Tweet'][i])
    tweets = tweets.lower()
    tweets = tweets.split()
    ps = PorterStemmer()
    tweets = [ps.stem(words) for words in tweets if not words in stopwords.words('english')]
    tweets = ' '.join(tweets)
    corpus.append(tweets)

## Convert the Xs to a matrix where each column is a word from the tweet and the row values represent the word count that appears in the particular tweet 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = df.iloc[ : , 0].values

In [7]:
X.shape

(5876, 9290)

In [8]:
y

array(['Neither', 'Neither', 'Anti-Roseanne', ..., 'Neither', 'Neither',
       'Pro-Roseanne'], dtype=object)

In [9]:
y.shape

(5876,)

## Train Test Split

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, 
                                                    random_state = 0)



## Apply Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB(priors=None)

In [12]:
y_pred = nb.predict(X_test)
y_pred_prob = nb.predict_proba(X_test) [: , 1]


In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[292,  92,  38],
       [ 89, 451,  21],
       [ 53,  38, 102]], dtype=int64)

In [14]:
from sklearn.metrics import classification_report

In [15]:
classification_report(y_test, y_pred)

'               precision    recall  f1-score   support\n\nAnti-Roseanne       0.56      0.49      0.52       422\n      Neither       0.67      0.22      0.34       561\n Pro-Roseanne       0.22      0.70      0.34       193\n\n  avg / total       0.56      0.40      0.40      1176\n'

In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.71853741496598644

## Apply Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators= 500, max_features = .25, criterion='entropy' )
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.25, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
y_pred = rf.predict(X_test)
y_pred_prob = rf.predict_proba(X_test) [: , 1]

In [19]:
from sklearn.metrics import confusion_matrix,accuracy_score,auc
confusion_matrix(y_test, y_pred)

array([[287, 116,  19],
       [ 81, 460,  20],
       [ 52,  53,  88]], dtype=int64)

In [20]:
accuracy_score(y_test, y_pred)

0.71003401360544216

In [21]:
classification_report(y_test, y_pred)

'               precision    recall  f1-score   support\n\nAnti-Roseanne       0.68      0.68      0.68       422\n      Neither       0.73      0.82      0.77       561\n Pro-Roseanne       0.69      0.46      0.55       193\n\n  avg / total       0.71      0.71      0.70      1176\n'

## Apply Light GBM


In [22]:
from lightgbm.sklearn import LGBMClassifier
lgb = LGBMClassifier(learning_rate = .01, max_depth = 3, max_features = "log2" ,loss = 'deviance', n_estimators = 500)
lgb.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.01, loss='deviance', max_depth=3,
        max_features='log2', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=500, n_jobs=-1, num_leaves=31,
        objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [23]:
y_pred = lgb.predict(X_test)
y_pred_prob = lgb.predict_proba(X_test) [: , 1]

In [24]:
confusion_matrix(y_test, y_pred)

array([[205, 207,  10],
       [ 32, 520,   9],
       [ 45, 104,  44]], dtype=int64)

In [25]:
accuracy_score(y_test, y_pred)

0.65391156462585032

## Apply logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [27]:
log = LogisticRegression(C =1.5, penalty='l1')
log.fit(X_train, y_train)

LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
y_pred = log.predict(X_test)
y_pred_prob = log.predict_proba(X_test) [: , 1]

In [29]:
confusion_matrix(y_test, y_pred)

array([[286, 116,  20],
       [ 56, 482,  23],
       [ 45,  54,  94]], dtype=int64)

In [30]:
accuracy_score(y_test, y_pred)

0.73299319727891155

## SVC

In [13]:
svc = SVC(C = 1, kernel = 'poly', coef0=10.0,gamma=.1,probability = True,random_state= 1)
svc.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=10.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
y_pred = svc.predict(X_test)
y_pred_prob = svc.predict_proba(X_test) [: , 1]

In [17]:
confusion_matrix(y_test, y_pred)

array([[292,  92,  38],
       [ 89, 451,  21],
       [ 53,  38, 102]], dtype=int64)

In [20]:
accuracy_score(y_test, y_pred)

0.71853741496598644