In [19]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import pandas as pd
import string
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
# download punctuation and stopwords from nltk
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\Janssen VR
[nltk_data]     2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Janssen VR
[nltk_data]     2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# load tweets_df and view
tweets_df = pd.read_csv("Resources/Tweets.csv")
tweets_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [22]:
# get dataframe ready for processing

# make sure the tweets in column "text" are strings
tweets_df['text'] = tweets_df['text'].astype('str')

# delete the unneccessary columns
tweets_df = tweets_df.drop(columns=["textID", "selected_text"])

In [23]:
tweets_df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [24]:
def process_tweets(tweet):
    tweet = tweet.lower()
    tweet = "".join(char for char in tweet if char not in string.punctuation)
    tokenize_tweet = word_tokenize(tweet)
    stopword = stopwords.words("english")
    tweet_wo_stop = [word for word in tokenize_tweet if word not in stopword]
    final_tweet = " ".join(tweet_wo_stop)
    return final_tweet

In [25]:
# process tweets using above function
tweets_df['text'] = tweets_df['text'].apply(lambda x: process_tweets(x))

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,neutral
1,sooo sad miss san diego,negative
2,boss bullying,negative
3,interview leave alone,negative
4,sons couldnt put releases already bought,negative


In [26]:
# transform the sentiment column into numbers
dict_sentiment = {'positive': 1, 'neutral': 0, 'negative': -1}
tweets_df['sentiment'] = tweets_df['sentiment'].apply(lambda x: dict_sentiment.get(x))

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,0
1,sooo sad miss san diego,-1
2,boss bullying,-1
3,interview leave alone,-1
4,sons couldnt put releases already bought,-1


In [27]:
sentiment_count = tweets_df.sentiment.value_counts()
sentiment_count





 0    11118
 1     8582
-1     7781
Name: sentiment, dtype: int64

In [28]:
# assign X and y to the input and target columns
X = tweets_df['text']
y = tweets_df['sentiment']

In [29]:
# split the data into testing data and training data
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.6, train_size = 0.4, random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [30]:
# transform the data into tfidf vectors
# fit the tfidf vectorizer on the training data to avoid bias

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Random Forest Classifier (RFC)
We are going to use the random forest classifier to predict the sentiment of a tweet - wethther the tweet's sentiment is positive, negative, or neutral.

In [31]:
from sklearn.ensemble import RandomForestClassifier

# bootstrap = true ensures that the entire dataset is used to build trees


# clf = RandomForestClassifier(n_estimators=200, random_state=0)
# clf = RandomForestClassifier()
clf = RandomForestClassifier(n_estimators=500, random_state=0, bootstrap=True)

clf.fit(X_train_tfidf, y_train)

In [32]:
print('Training Score: ', clf.score(X_train_tfidf, y_train))
print('Testing Score: ', clf.score(X_test_tfidf, y_test))

Training Score:  0.9976225133430373
Testing Score:  0.699752583321205


## RFC Confusion Matrix, Confusion Report, and Scores
We used trial and error to optomize the regression model changing by modifying the split , and then by modifying the RFC parameters 
1) 20/80 train / test  w/ RandomForestClassifier() -- default
2) 60/40 train / test  w/ RandomForestClassifier() -- default
3) DEFAULT train / test w/ RandomForestClassifier() -- default
1) 20/80 train / test w/ RandomForestClassifier(n_estimators=200, random_state=0)
2) 60/40 train / test w/ RandomForestClassifier(n_estimators=200, random_state=0)
3) DEFAULT train / test w/ RandomForestClassifier(n_estimators=200, random_state=0)
1) 20/80 train / test w/ RandomForestClassifier(n_estimators=500, random_state=0)
2) 60/40 train / test w/ RandomForestClassifier(n_estimators=500, random_state=0)
3) DEFAULT train / testw/ RandomForestClassifier(n_estimators=500, random_state=0)

The maximum level of accuracy was achieved using the default tts settings and limiting the number of trees (n_estimates) to 200



In [33]:
print(confusion_matrix(y_test.values, clf.predict(X_test_tfidf)))


[[1176  636  165]
 [ 321 2086  403]
 [  78  460 1546]]


In [34]:
print(classification_report(y_test.values, clf.predict(X_test_tfidf)))

              precision    recall  f1-score   support

          -1       0.75      0.59      0.66      1977
           0       0.66      0.74      0.70      2810
           1       0.73      0.74      0.74      2084

    accuracy                           0.70      6871
   macro avg       0.71      0.69      0.70      6871
weighted avg       0.70      0.70      0.70      6871



In [35]:
# Prediction v. Actual View Frame
predictions = clf.predict(X_test_tfidf)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
2617,-1,-1
8817,0,0
10545,0,0
5566,-1,1
3310,0,-1
...,...,...
14344,0,0
797,-1,-1
10744,1,0
5658,1,1


## Extreme Random Trees

Extreme Random Trees did not provde a notable increase in the accuracy of the testing

In [36]:
# Import an Extremely Random Trees classifier
from sklearn.ensemble import ExtraTreesClassifier

# clf = ExtraTreesClassifier()
clf = ExtraTreesClassifier(random_state=0)

clf.fit(X_train_tfidf, y_train)

In [37]:
print('Training Score: ', clf.score(X_train_tfidf, y_train))
print('Testing Score: ', clf.score(X_test_tfidf, y_test))

Training Score:  0.9976225133430373
Testing Score:  0.7000436617668462


In [38]:
print(classification_report(y_test.values, clf.predict(X_test_tfidf)))

              precision    recall  f1-score   support

          -1       0.72      0.63      0.67      1977
           0       0.67      0.72      0.69      2810
           1       0.73      0.75      0.74      2084

    accuracy                           0.70      6871
   macro avg       0.71      0.70      0.70      6871
weighted avg       0.70      0.70      0.70      6871

