In [0]:
import pandas as pd
import numpy as np
data = pd.read_csv('dataset.csv')

data = data.drop(data[data.mood == 'love'].index)
data = data.drop(data[data.mood == 'neutral'].index)

In [48]:
import nltk
nltk.download('stopwords')

#Making all letters lowercase
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#Removing Punctuation, Symbols
data['text'] = data['text'].str.replace('[^\w\s]',' ')

#Removing Stop Words using NLTK
from nltk.corpus import stopwords
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
nltk.download('wordnet')

#Lemmatisation
from textblob import Word
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

#Correcting Letter Repetitions
import re
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

#%%
data['text'] = data['text'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(data.mood.values)

lbl_mood_mapping = dict(zip(lbl_enc.classes_, lbl_enc.transform(lbl_enc.classes_)))
print(lbl_mood_mapping)

{'happiness': 0, 'sadness': 1, 'worry': 2}


In [0]:
# Splitting into training and testing data in 90:10 ratio
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data.text.values, y, test_size=0.1)

In [0]:
# Extracting Count Vectors Parameters
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word',ngram_range=(1,3))
count_vect.fit(data['text'])
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

In [53]:
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)

print('naive bayes count vector accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes count vector accuracy 0.5514345696291113


In [54]:
from sklearn.linear_model import SGDClassifier

lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)

print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

lsvm using count vectors accuracy 0.5787263820853744


In [55]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1, solver='lbfgs', multi_class='multinomial', max_iter=1000)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)

print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

log reg count vectors accuracy 0.5780265920223933
