In [0]:
import pandas as pd
import numpy as np
data = pd.read_csv('dataset.csv')

In [428]:
import nltk
nltk.download('stopwords')

#Making all letters lowercase
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#removing links
data['text'] = data['text'].str.replace('http:(\S*)','')

#Removing Punctuation, Symbols
data['text'] = data['text'].str.replace('[^\w\s]',' ')

#Removing Stop Words using NLTK
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [429]:
nltk.download('wordnet')

#Stemming
from textblob import Word
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).stem() for word in x.split()]))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
#Code to find the top 5000 rarest words appearing in the data
freq = pd.Series(' '.join(data['text']).split()).value_counts()[-5000:]

#Removing all those rarely appearing words from the data
freq = list(freq.index)
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [431]:
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.mood.values)

lbl_mood_mapping = dict(zip(lbl_enc.classes_, lbl_enc.transform(lbl_enc.classes_)))
print(lbl_mood_mapping)

{'anger': 0, 'happiness': 1, 'sadness': 2}


In [0]:
# Splitting into training and testing data in 90:10 ratio
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data.text.values, y, test_size=0.1, shuffle=True)

In [0]:
# Extracting Count Vectors Parameters
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word',ngram_range=(1,3))
count_vect.fit(data['text'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

In [434]:
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)

print('naive bayes count vector accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vector accuracy 0.7960308710033076


In [435]:
from sklearn.linear_model import SGDClassifier

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)

print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

lsvm using count vectors accuracy 0.8125689084895259


In [436]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1, solver='lbfgs', multi_class='multinomial', max_iter=1000)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)

print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

log reg count vectors accuracy 0.802646085997795
