In [1]:
# import required libraries
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import string

In [2]:
# download punctuation and stopwords from nltk
nltk.download('punkt')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\apfle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\apfle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# load tweets_df and view
tweets_df = pd.read_csv("Resources/Tweets.csv")
tweets_df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [4]:
# get dataframe ready for processing

# make sure the tweets in column "text" are strings
tweets_df['text'] = tweets_df['text'].astype('str')

# delete the unneccessary columns
tweets_df = tweets_df.drop(columns=["textID", "selected_text"])

In [5]:
def process_tweets(tweet):
    tweet = tweet.lower()
    tweet = "".join(char for char in tweet if char not in string.punctuation)
    tokenize_tweet = word_tokenize(tweet)
    stopword = stopwords.words("english")
    tweet_wo_stop = [word for word in tokenize_tweet if word not in stopword]
    final_tweet = " ".join(tweet_wo_stop)
    return final_tweet

In [6]:
# process tweets using above function
tweets_df['text'] = tweets_df['text'].apply(lambda x: process_tweets(x))

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,neutral
1,sooo sad miss san diego,negative
2,boss bullying,negative
3,interview leave alone,negative
4,sons couldnt put releases already bought,negative


In [7]:
# transform the sentiment column into numbers
dict_sentiment = {'positive': 1, 'neutral': 0, 'negative': -1}
tweets_df['sentiment'] = tweets_df['sentiment'].apply(lambda x: dict_sentiment.get(x))

# view updated dataframe
tweets_df.head()

Unnamed: 0,text,sentiment
0,id responded going,0
1,sooo sad miss san diego,-1
2,boss bullying,-1
3,interview leave alone,-1
4,sons couldnt put releases already bought,-1


In [8]:
tweets_df = tweets_df.dropna()

In [9]:
# tweet

In [10]:
# assign X and y to the input and target columns
X = tweets_df['text']
y = tweets_df['sentiment']

In [11]:
# split the data into testing data and training data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [16]:
# transform the data into tfidf vectors
# # fit the tfidf vectorizer on the training data to avoid bias

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [13]:
X_train

15053                    pooof nirvana work httpmylocmetdl
9636                             doesnt understand twitter
1426                            theres point bein one else
19617    going gym early trying get back shape husb got...
12775    way bought today aswell twice cos pre ordered ...
                               ...                        
24496    im looking forward going home tomorrow really ...
26564                           jobos tired getting work 7
20847                      kinda forgot much love darkness
8826                          love hot policemen come work
218                                             ew traffic
Name: text, Length: 20610, dtype: object

## Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
y_pred = mnb.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
# tokens_list = []
# for tweet in tweets_df["text"]:
#   tokens_list.append(tweet)
# tokens = set()
# for element in tokens_list:
#   for word in element:
#     tokens.add(word)

In [21]:
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# gnb.fit(X_train_tfidf.todense(), y_train)
# print(metrics.accuracy_score(gnb.predict(X_test_tfidf), y_test))

In [18]:
from sklearn import metrics
accuracy_score = metrics.accuracy_score(y_pred, y_test)

In [19]:
accuracy_score

0.6115558142919517

In [None]:
# # look at the scores for the testing and training data
# print(f"Training Data Score: {model.score(X_train_tfidf.toarray(), y_train)}")
# print(f"Testing Data Score: {model.score(X_test_tfidf.toarray(), y_test)}")

In [None]:
# from sklearn.metrics import classification_report, confusion_matrix

# # find metrics for testing data
# print(confusion_matrix(y_test.values, model.predict(X_test_tfidf.toarray())))
# print(classification_report(y_test.values, model.predict(X_test_tfidf.toarray())))