In [17]:
import pandas as pd
import numpy as np
import seaborn as sns

from nltk.tokenize import word_tokenize, PunktSentenceTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [18]:
public_df = pd.read_csv('data/public-study/EmoTrak-emotrak-2018-04-19T20_50_09.379Z.csv')

In [19]:
pd.set_option('display.max_columns', 999)

### Train/test split and drop non-string entries

In [20]:
train, test = train_test_split(public_df, shuffle=True)

In [21]:
new_train = train.drop(train[train['trigger'].apply(type) == float].index)
new_test = test.drop(test[test['trigger'].apply(type) == float].index)

In [22]:
new_train['trigger'].apply(type).unique()

array([<class 'str'>], dtype=object)

In [23]:
new_train.shape

(5563, 59)

In [24]:
new_test.shape

(1854, 59)

### Set stopwords, stemmer and tokenizer; create tokenizer function

In [25]:
stop = set(stopwords.words('english'))
snowball = SnowballStemmer('english')
treebank_word_tokenize = TreebankWordTokenizer().tokenize

In [49]:
def tokenize(col):
    tokens = [treebank_word_tokenize(content.lower())for content in col]
    tokens2 = [[word for word in words if word not in stop] for words in tokens]
    snowballs = [[snowball.stem(word) for word in words]
                 for words in tokens2]
    bag_of_words = []
    for lst in snowballs:
        for word in lst:
            bag_of_words.append(word)
    return bag_of_words

    

In [51]:
# tokenize(new_train['trigger'])

### CountVectorizer and TfidfVectorizer

In [74]:
countvect = CountVectorizer(tokenizer=tokenize)
count_vectorized = countvect.fit_transform(new_train['trigger'])

In [75]:
count_vectorized

<5563x67 sparse matrix of type '<class 'numpy.int64'>'
	with 49925 stored elements in Compressed Sparse Row format>

In [57]:
# tokenize(new_test['trigger'])

In [76]:
count_vectorized_test = countvect.transform(new_test['trigger'])

In [77]:
tfidfvect = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
tfidf_vectorized = tfidfvect.fit_transform(new_train['trigger'])

In [78]:
X_train = tfidf_vectorized

In [79]:
tfidf_vectorized_test = tfidfvect.transform(new_test['trigger'])

In [86]:
X_test = tfidf_vectorized_test

### Naive Bayes Model

In [59]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [80]:
y_train = new_train['generalEmotion']
y_test = new_test['generalEmotion']

In [81]:
X_train.shape

(5563, 67)

In [82]:
y_train.shape

(5563,)

In [83]:
X_test.shape

(1856,)

In [84]:
y_test.shape

(1854,)

In [85]:
model = MultinomialNB()
model.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [88]:
predictions = model.predict(X_test)

In [89]:
pred_df = pd.DataFrame(predictions)

In [95]:
pred_df[0].value_counts()

enjoyment    1846
anger           6
fear            2
Name: 0, dtype: int64