# Load data

In [227]:
#target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
from nltk.tokenize import word_tokenize
import pandas as pd

# Maximum number of tweets kept per polarity, so both classes are equally represented.
MAX_ROWS_PER_CLASS = 50000

# Columns are addressed by position below: column 0 is the polarity label and
# column 5 is the tweet text.
# NOTE(review): read_csv treats the first line as a header by default — confirm
# the CSV has a header row, otherwise the first tweet is consumed as column names.
df = pd.read_csv('training_data.csv', delimiter=',')

tweet_tokens = []
target = []

positive_row_count=0
negative_row_count=0
# Iterate only over the two columns that are used instead of materialising
# every row of the frame through df.values.
for label, text in zip(df.iloc[:, 0], df.iloc[:, 5]):
    if label == 0 and negative_row_count < MAX_ROWS_PER_CLASS:
        negative_row_count += 1
    elif label == 4 and positive_row_count < MAX_ROWS_PER_CLASS:
        positive_row_count += 1
    else:
        # Any other label, or a class whose quota is already full.
        continue

    tweet_tokens.append(word_tokenize(text))
    target.append(label)

    # Both quotas are full: nothing further can be appended, so stop scanning.
    if (negative_row_count >= MAX_ROWS_PER_CLASS
            and positive_row_count >= MAX_ROWS_PER_CLASS):
        break

# Tokens and labels are appended in lockstep; fail loudly if they ever diverge.
assert len(tweet_tokens) == len(target), "tweet_tokens and target are out of sync"

print(len(tweet_tokens))
print(len(target))

100000
200000


# Removing Noise from the Data

In [211]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
import re, string

# Compiled once at definition time. Raw strings keep the `\(` / `\)` escapes
# from being parsed as (invalid) Python string escapes.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_RE = re.compile(r"(@[A-Za-z0-9_]+)")


def remove_noise(tweet_tokens, stop_words = ()):
    """Clean the tokens of a single tweet.

    For every token: strip URL and @mention text, lemmatize it using a
    part-of-speech hint, then keep it (lower-cased) only if it is non-empty,
    is not punctuation, is not a stop word and is not one of "..." / "http".

    Args:
        tweet_tokens: list of string tokens belonging to one tweet.
        stop_words: collection of lower-case words to discard.

    Returns:
        A new list with the surviving tokens, lower-cased, in original order.

    Note:
        The URL/mention patterns are applied to each token separately, so a
        link or mention that the tokenizer has already split into several
        tokens is not removed as a whole.
    """
    # Lists/tuples are scanned linearly on every `in`; a set gives the same
    # answer with constant-time lookups. Other container types are left as-is.
    if isinstance(stop_words, (list, tuple)):
        stop_words = frozenset(stop_words)

    # One lemmatizer for the whole tweet instead of a new one per token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = _URL_RE.sub('', token)
        token = _MENTION_RE.sub('', token)

        # Map the tagger's tag prefix to the lemmatizer's POS code:
        # nouns -> 'n', verbs -> 'v', everything else is treated as 'a'.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `token not in string.punctuation` is a substring test, so it also
        # rejects multi-character runs that appear verbatim in that string.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words and token not in ["...","http"]:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

# Clean the Positive and Negative Data

In [212]:
from nltk.corpus import stopwords

# English stop words from NLTK; remove_noise drops any token found here.
stop_words = stopwords.words('english')

# Clean every tokenized tweet, preserving the order of `tweet_tokens`.
cleaned_tokens_list = [remove_noise(tokens, stop_words) for tokens in tweet_tokens]

# Before/after view of the first tweet as a quick sanity check.
print(tweet_tokens[0])
print(cleaned_tokens_list[0])

['@', 'switchfoot', 'http', ':', '//twitpic.com/2y1zl', '-', 'Awww', ',', 'that', "'s", 'a', 'bummer', '.', 'You', 'shoulda', 'got', 'David', 'Carr', 'of', 'Third', 'Day', 'to', 'do', 'it', '.', ';', 'D']
['switchfoot', '//twitpic.com/2y1zl', 'awww', "'s", 'bummer', 'shoulda', 'get', 'david', 'carr', 'third', 'day']


# Determining Word Density

In [213]:
def get_all_words(cleaned_tokens_list):
    """Lazily flatten a list of token lists into one stream of tokens.

    Args:
        cleaned_tokens_list: iterable whose items are iterables of tokens.

    Yields:
        Each token, in document order and then in within-document order.
    """
    for document_tokens in cleaned_tokens_list:
        yield from document_tokens

# Materialise the token stream: a bare generator can be consumed only once, so
# any later cell that is re-run on its own would silently see no words at all.
# Despite the name, this holds the words of every cleaned tweet in
# cleaned_tokens_list, not only the positive ones.
all_pos_words = list(get_all_words(cleaned_tokens_list))

# Testing Word Density

In [214]:
from nltk import FreqDist

# Count occurrences of each cleaned token and show the ten most frequent.
freq_dist_pos = FreqDist(all_pos_words)
top_ten_words = freq_dist_pos.most_common(10)
print(top_ten_words)

[("'s", 10848), ("n't", 10847), ('get', 10464), ('go', 9948), ("'m", 7822), ('good', 7218), ('day', 6551), ('work', 6102), ('..', 5871), ('like', 5030)]


In [215]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import numpy as np
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

def identity_tokenizer(text):
    """Return the input unchanged.

    Passed as the vectorizer's tokenizer because the documents handed to it
    are already lists of tokens and need no further splitting.
    """
    already_tokenized = text
    return already_tokenized

# The documents are already token lists that were lower-cased during cleaning,
# so the vectorizer must neither re-tokenize nor re-lowercase them.
# token_pattern=None states explicitly that the default regex is unused when a
# custom tokenizer is supplied, which avoids scikit-learn's warning about it.
tfidf = TfidfVectorizer(
    tokenizer=identity_tokenizer,
    lowercase=False,
    token_pattern=None,
)
# NOTE(review): this fits vocabulary and IDF weights on every tweet, including
# rows that are later held out for testing.
all_features = tfidf.fit_transform(cleaned_tokens_list)

In [216]:
# Show the summary of the TF-IDF matrix produced by fit_transform above.
all_features

<100000x91848 sparse matrix of type '<class 'numpy.float64'>'
	with 783341 stored elements in Compressed Sparse Row format>

In [217]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): all_features comes from a fit_transform over the full corpus,
# so the vocabulary and IDF weights already reflect the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    target,
    test_size=0.2,
    random_state=42,
)

In [218]:
# Check the size of the training partition (test_size=0.2 leaves 80% of the rows).
X_train.shape

(80000, 91848)

In [220]:
# Naive Bayes baseline trained on the TF-IDF features; fit() returns the
# estimator itself, so construction and training can be chained.
multinomial_nb_classifier = MultinomialNB().fit(X_train, y_train)
multinomial_nb_classifier

MultinomialNB()

In [221]:
# Score the Naive Bayes model on the held-out split.
y_pred = multinomial_nb_classifier.predict(X_test)
nb_report = classification_report(y_test, y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.73      0.80      0.76     10035
           4       0.78      0.70      0.74      9965

    accuracy                           0.75     20000
   macro avg       0.75      0.75      0.75     20000
weighted avg       0.75      0.75      0.75     20000



In [239]:
custom_tweet = "I am so happy"

# Push the tweet through the same tokenize -> clean -> vectorize steps that
# were applied to the training data.
tokens = word_tokenize(custom_tweet)
# Wrapped in a list: the vectorizer takes a collection of documents.
custom_tweet_tokens = [remove_noise(tokens, stop_words)]
features = tfidf.transform(custom_tweet_tokens)

multinomial_nb_classifier.predict(features)

array([4])

In [228]:
# Linear SVM trained on the same TF-IDF features.
# Seeded so that retraining yields the same model; 42 matches the seed already
# used for train_test_split.
linear_svc_classifier = LinearSVC(random_state=42)
linear_svc_classifier.fit(X_train, y_train)


LinearSVC()

In [229]:
# Score the linear SVM on the held-out split (reuses the name y_pred).
y_pred = linear_svc_classifier.predict(X_test)
svc_report = classification_report(y_test, y_pred)
print(svc_report)

              precision    recall  f1-score   support

           0       0.76      0.74      0.75     10035
           4       0.75      0.77      0.76      9965

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.75     20000
weighted avg       0.76      0.76      0.75     20000



In [244]:
# NOTE(review): this text uses typographic apostrophes (’) — confirm the
# tokenizer splits them the same way as the ASCII apostrophes in the training tweets.
custom_tweet = "Things are about to change for you. May the overthinking, and the doubt exit your mind right now. May clarity replace confusion. May peace and calmness fill your life. You’ve been strong long enough, it’s time to start receiving your blessings. You deserve it."

# Same tokenize -> clean -> vectorize path as the training data.
tokens = word_tokenize(custom_tweet)
# Wrapped in a list: the vectorizer takes a collection of documents.
custom_tweet_tokens = [remove_noise(tokens, stop_words)]
features = tfidf.transform(custom_tweet_tokens)

linear_svc_classifier.predict(features)

array([4])