In [0]:
import gdown
import os
from pandas_profiling import ProfileReport

# https://drive.google.com/file/d/1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3/view?usp=sharing

url = f'https://drive.google.com/uc?id=1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3'
filename = 'dataset.csv'
if not os.path.exists(filename):
    gdown.download(url, filename, quiet=True)

In [0]:
import numpy as np
import pandas as pd

In [0]:
columns = ['emotion', 'text']

df = pd.read_csv(filename, names=columns)

In [0]:
Xraw = df['text'].values
yraw = df['emotion'].values

### Preprocessing

In [24]:
import nltk

nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
!pip install autocorrect

Collecting autocorrect
[?25l  Downloading https://files.pythonhosted.org/packages/a9/b0/a1d628fa192e8ebf124b4cebc2a42b4e3aa65b8052fdf4888e04fadf3e8d/autocorrect-1.1.0.tar.gz (1.8MB)
[K     |████████████████████████████████| 1.8MB 2.8MB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-1.1.0-cp36-none-any.whl size=1810772 sha256=f6086c04a496416cafc16fe571bcfd90ad783ca20ff175603b24aed6601f5b0b
  Stored in directory: /root/.cache/pip/wheels/78/7f/b1/527522820ae623df6a2dbe14f778d23adaea4bebe43f7ebcfe
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-1.1.0


In [0]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from autocorrect import Speller

spell = Speller(lang='en')

import re

def process_text(text):
    text = text if type(text) == str else ''

    # clean the words, remove symbols special chars
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)

    # convert to lowercase
    text = text.lower()

    # first tokenize the text
    word_tokenized = word_tokenize(text)

    # let's remove the stop words
    en_stopwords = stopwords.words('english')
    words_swords_removed = [spell(word) for word in word_tokenized if word not in en_stopwords]

    return words_swords_removed

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
cvector = CountVectorizer(tokenizer=process_text)

In [0]:
X_tr = cvector.fit_transform(Xraw)

In [39]:
X_tr

<7446x8686 sparse matrix of type '<class 'numpy.int64'>'
	with 70234 stored elements in Compressed Sparse Row format>

In [44]:
help(X_tr)

Help on csr_matrix in module scipy.sparse.csr object:

class csr_matrix(scipy.sparse.compressed._cs_matrix)
 |  Compressed Sparse Row matrix
 |  
 |  This can be instantiated in several ways:
 |      csr_matrix(D)
 |          with a dense matrix or rank-2 ndarray D
 |  
 |      csr_matrix(S)
 |          with another sparse matrix S (equivalent to S.tocsr())
 |  
 |      csr_matrix((M, N), [dtype])
 |          to construct an empty matrix with shape (M, N)
 |          dtype is optional, defaulting to dtype='d'.
 |  
 |      csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
 |          where ``data``, ``row_ind`` and ``col_ind`` satisfy the
 |          relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
 |  
 |      csr_matrix((data, indices, indptr), [shape=(M, N)])
 |          is the standard CSR representation where the column indices for
 |          row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
 |          corresponding values are stored in ``data[indptr[i]:

In [0]:
from sklearn.preprocessing import LabelEncoder

In [0]:
label_encoder = LabelEncoder()

In [49]:
label_encoder.fit(yraw)

LabelEncoder()

In [50]:
label_encoder.classes_

array(['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame'],
      dtype=object)

In [52]:
y_tr = label_encoder.transform(yraw)
y_tr

array([4, 2, 0, ..., 1, 6, 3])

In [53]:
y_tr.shape

(7446,)

### Naive bayes classifier

In [0]:
from sklearn.naive_bayes import MultinomialNB

In [0]:
multi_nb = MultinomialNB()

In [56]:
multi_nb.fit(X_tr, y_tr)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
y_pred = multi_nb.predict(X_tr)

In [61]:
from sklearn.metrics import classification_report

print(classification_report(y_tr, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.77      0.76      1069
           1       0.87      0.78      0.82      1059
           2       0.83      0.83      0.83      1063
           3       0.74      0.77      0.75      1040
           4       0.81      0.85      0.83      1082
           5       0.79      0.80      0.79      1074
           6       0.77      0.75      0.76      1059

    accuracy                           0.79      7446
   macro avg       0.79      0.79      0.79      7446
weighted avg       0.79      0.79      0.79      7446



With a simple naive bayes classifier, we have got an accuracy of $79\%$. We are going to take this simple model as a baseline model.