In [0]:
import gdown
import os
from pandas_profiling import ProfileReport

# https://drive.google.com/file/d/1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3/view?usp=sharing

url = f'https://drive.google.com/uc?id=1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3'
filename = 'dataset.csv'
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [0]:
columns = ['emotion', 'text']

df = pd.read_csv(filename, names=columns)

In [10]:
df.head()

Unnamed: 0,emotion,text
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [13]:
df.describe()

Unnamed: 0,emotion,text
count,7446,7446
unique,7,7379
top,joy,When my grandfather died.
freq,1082,8


+ We have $7$ emotion category
+ We have Total of $7446$ data
+ Clas `joy` has the highest number of data

### Number of data for each emotion category

In [11]:
df['emotion'].value_counts()

joy        1082
sadness    1074
anger      1069
fear       1063
disgust    1059
shame      1059
guilt      1040
Name: emotion, dtype: int64

+ We have $7$ emotion types
+ The data seems to be quite balanced

## Text information

In [20]:
df['text'][0]

'On days when I feel close to my partner and other friends.   \nWhen I feel at peace with myself and also experience a close  \ncontact with people whom I regard greatly.'

In [35]:
combined_text  = df['text'].str.cat()
combined_text[:150]

'On days when I feel close to my partner and other friends.   \nWhen I feel at peace with myself and also experience a close  \ncontact with people whom '

In [36]:
len(combined_text)

860339

In [37]:
# remove all the new line chars
combined_text = combined_text.replace('\n', '')

# replace fullstop with blank
combined_text = combined_text.replace('.', '')

combined_text[:150]

'On days when I feel close to my partner and other friends   When I feel at peace with myself and also experience a close  contact with people whom I r'

In [38]:
len(combined_text)

839418

In [0]:
# convert all to lowercase
combined_text = combined_text.lower()

In [40]:
words_splitted = combined_text.split()
words_splitted[:10]

['on', 'days', 'when', 'i', 'feel', 'close', 'to', 'my', 'partner', 'and']

In [41]:
# total number of words
len(words_splitted)

156188

In [42]:
unique_words = list(set(words_splitted))
unique_words[:20]

['luggage',
 'analyze',
 'sunshine',
 'packet',
 'examn,',
 'right)losing',
 'mesaw',
 'brothersi',
 'divorced',
 'dialect)',
 'over-estimating',
 'hypocrisy',
 'friendsuddenly',
 'name)when',
 'situation[',
 'casei',
 'insect,',
 'tubei',
 'confessed',
 'overcooked']

In [43]:
len(unique_words)

15583

In [0]:
word_freq = sorted([(w, words_splitted.count(w)) for w in unique_words], key=lambda a: a[1], reverse=True)

In [55]:
# Top 10 high frequency words
print(", ".join([f"{w[0]}: {w[1]}" for w in word_freq[:20]]))

i: 8762, the: 5749, a: 5614, to: 4609, and: 4501, was: 4452, my: 4235, of: 2969, in: 2620, had: 2520, that: 2056, me: 1759, for: 1572, with: 1520, not: 1457, when: 1403, at: 1292, it: 1194, on: 956, very: 954


+ As seen above, the stop words like _i_, _the_ are highly frequent words.

+ They does not contribute much in the sentiment of the sentence, so we can remove

+ We may not want to remove these for some models and remove these for others according to their performance

In [56]:
# Top 10 least frequent words
print(", ".join([f"{w[0]}: {w[1]}" for w in word_freq[-20:]]))

continued,: 1, corpsein: 1, senile: 1, assistati: 1, begining,: 1, procrastinatedi: 1, delicious: 1, (17): 1, truthwhen: 1, punishedat: 1, i;e: 1, falsitywhen: 1, roommatei: 1, planon: 1, goodsthe: 1, "prince": 1, spiderwhen: 1, element": 1, hostelit: 1, accidentaly: 1


+ The low frequency words are words with spelling mistake

+ Adjectives, adverbs, past or past participles of a word

+ Very rare words

+ Converting the word `accidentaly` to `accidental` might be helpful (changing words to their root word)

## With NLTK

In [60]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [65]:
all_text  = df['text'].str.cat()
sent_tokenized = sent_tokenize(all_text)

for i, sent in enumerate(sent_tokenized[:2]):
    print(f'#{i} : {sent}')

#0 : On days when I feel close to my partner and other friends.
#1 : When I feel at peace with myself and also experience a close  
contact with people whom I regard greatly.Every time I imagine that someone I love or I could contact a  
serious illness, even death.When I had been obviously unjustly treated and had no possibility  
of elucidating this.When I think about the short time that we live and relate it to  
the periods of my life when I think that I did not use this  
short time.At a gathering I found myself involuntarily sitting next to two  
people who expressed opinions that I considered very low and  
discriminating.When I realized that I was directing the feelings of discontent  
with myself at my partner and this way was trying to put the blame  
on him instead of sorting out my own feeliings.I feel guilty when when I realize that I consider material things  
more important than caring for my relatives.


In [67]:
word_tokenized = word_tokenize(all_text)
word_tokenized[:20]

['On',
 'days',
 'when',
 'I',
 'feel',
 'close',
 'to',
 'my',
 'partner',
 'and',
 'other',
 'friends',
 '.',
 'When',
 'I',
 'feel',
 'at',
 'peace',
 'with',
 'myself']

In [83]:
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
print(stopwords.words('english')[:10], 'total stopwords: ', len(stopwords.words('english')))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"] total stopwords:  179


In [0]:
def process_text(text):
    text = text if type(text) == str else ''

    # clean the words, remove symbols special chars
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)

    # convert to lowercase
    text = text.lower()

    # first tokenize the text
    word_tokenized = word_tokenize(text)

    # let's remove the stop words
    en_stopwords = stopwords.words('english')
    words_swords_removed = [word for word in word_tokenized if word not in en_stopwords]

    return words_swords_removed

In [93]:
process_text("I don't know. .. what is this ? 😀😀")

['dont', 'know']

In [94]:
r_text = df['text'][0]

r_text_processed = process_text(r_text)

print(f'Original:  {r_text}')
print(f'Processed: {r_text_processed}')

Original:  On days when I feel close to my partner and other friends.   
When I feel at peace with myself and also experience a close  
contact with people whom I regard greatly.
Processed: ['days', 'feel', 'close', 'partner', 'friends', 'feel', 'peace', 'also', 'experience', 'close', 'contact', 'people', 'regard', 'greatly']


In [0]:
all_text_preprocessed = process_text(all_text)

In [0]:
vocab = list(set(all_text_preprocessed))

In [0]:
from collections import Counter

word_freq_nltk = Counter(all_text_preprocessed)

Print top 20 high frequency words in our vocabulary

In [117]:
top_20_most_common = word_freq_nltk.most_common(20)
print(", ".join([f"{w[0]}: {w[1]}" for w in top_20_most_common]))

felt: 872, friend: 787, one: 543, time: 464, got: 428, friends: 388, told: 384, would: 371, home: 362, mother: 360, day: 343, went: 335, school: 303, people: 300, came: 293, first: 292, saw: 291, person: 289, could: 288, go: 276


Print 20 least frequent words

In [120]:
top_20_least_common = word_freq_nltk.most_common(None)[-20:]

print(", ".join([f"{w[0]}: {w[1]}" for w in top_20_least_common]))

mainland: 1, sadduring: 1, britain: 1, accquainted: 1, bbq: 1, departing: 1, personperson: 1, mahjong: 1, decency: 1, halfheartedly: 1, classmatewhen: 1, baptism: 1, pepole: 1, baptised: 1, deepenedwhen: 1, gym: 1, stack: 1, questioning: 1, inserted: 1, randomly: 1


+ The least frequent words are words with spelling mistake, let's fix that.

### Autocorrect for spelling correction

In [108]:
!pip install autocorrect

Collecting autocorrect
[?25l  Downloading https://files.pythonhosted.org/packages/a9/b0/a1d628fa192e8ebf124b4cebc2a42b4e3aa65b8052fdf4888e04fadf3e8d/autocorrect-1.1.0.tar.gz (1.8MB)
[K     |▏                               | 10kB 17.8MB/s eta 0:00:01[K     |▍                               | 20kB 1.7MB/s eta 0:00:02[K     |▌                               | 30kB 2.2MB/s eta 0:00:01[K     |▊                               | 40kB 2.5MB/s eta 0:00:01[K     |█                               | 51kB 2.0MB/s eta 0:00:01[K     |█                               | 61kB 2.2MB/s eta 0:00:01[K     |█▎                              | 71kB 2.5MB/s eta 0:00:01[K     |█▌                              | 81kB 2.8MB/s eta 0:00:01[K     |█▋                              | 92kB 2.9MB/s eta 0:00:01[K     |█▉                              | 102kB 2.8MB/s eta 0:00:01[K     |██                              | 112kB 2.8MB/s eta 0:00:01[K     |██▏                             | 122kB 2.8MB/s eta 0:00

In [0]:
from autocorrect import Speller

spell = Speller(lang='en')

Correct least 100 less common words

In [139]:
top_100_least_common = word_freq_nltk.most_common(None)[-100:]

words_or = []
words_cr = []

for word, _ in top_100_least_common:
    corrected = spell(word)
    if word != corrected:
        words_or.append(word)
        words_cr.append(corrected)

print(f'Original:        {words_or}')
print(f'Spell corrected: {words_cr}')

Original:        ['persoon', 'organizor', 'afterwardsit', 'mrw', 'pmthe', 'dieone', 'starteed', 'beforemy', 'favouron', 'phillipines', 'dawni', 'didcovered', 'thay', 'examonce', 'daymy', 'intolerablei', 'admited', 'numberone', 'besti', 'matesi', 'sexduring', 'organizors', 'leadersone', 'partonce', 'chyne', 'sincereat', 'sadduring', 'accquainted', 'bbq', 'pepole']
Spell corrected: ['person', 'organizer', 'afterwards', 'mrs', 'the', 'ditone', 'started', 'before', 'favour', 'philippines', 'dawn', 'discovered', 'that', 'examine', 'day', 'intolerable', 'admitted', 'numbering', 'best', 'mates', 'securing', 'organizers', 'leaderene', 'patonce', 'chyle', 'sincerest', 'adducing', 'acquainted', 'bbs', 'people']


With pyspellchecker

In [130]:
!pip install pyspellchecker

Collecting pyspellchecker
[?25l  Downloading https://files.pythonhosted.org/packages/04/d1/ec4e830e9f9c1fd788e1459dd09279fdf807bc7a475579fd7192450b879c/pyspellchecker-0.5.4-py2.py3-none-any.whl (1.9MB)
[K     |▏                               | 10kB 18.0MB/s eta 0:00:01[K     |▍                               | 20kB 1.7MB/s eta 0:00:02[K     |▌                               | 30kB 2.2MB/s eta 0:00:01[K     |▊                               | 40kB 2.5MB/s eta 0:00:01[K     |▉                               | 51kB 2.0MB/s eta 0:00:01[K     |█                               | 61kB 2.3MB/s eta 0:00:01[K     |█▏                              | 71kB 2.5MB/s eta 0:00:01[K     |█▍                              | 81kB 2.8MB/s eta 0:00:01[K     |█▌                              | 92kB 2.9MB/s eta 0:00:01[K     |█▊                              | 102kB 2.8MB/s eta 0:00:01[K     |█▉                              | 112kB 2.8MB/s eta 0:00:01[K     |██                              | 12

In [0]:
from spellchecker import SpellChecker

spellc = SpellChecker()

In [141]:
# top_100_least_common = word_freq_nltk.most_common(None)[-100:]

words_or = []
words_cr = []

for word, _ in top_100_least_common:
    corrected = spellc.correction(word)
    if word != corrected:
        words_or.append(word)
        words_cr.append(corrected)

print(f'Original:        {words_or}')
print(f'Spell corrected: {words_cr}')

Original:        ['persoon', 'organizor', 'afterwardsit', 'mrw', 'pmthe', 'dieone', 'sunned', 'starteed', 'beforemy', 'favouron', 'phillipines', 'dawni', 'didcovered', 'thay', 'examonce', 'daymy', 'intolerablei', 'admited', 'numberone', 'besti', 'matesi', 'sexduring', 'organizors', 'partonce', 'chyne', 'sincereat', 'sadduring', 'accquainted', 'bbq', 'mahjong', 'pepole']
Spell corrected: ['person', 'organizer', 'afterwards', 'mr', 'pathe', 'done', 'stunned', 'started', 'before', 'favour', 'philippines', 'dawn', 'discovered', 'that', 'examine', 'day', 'intolerable', 'admitted', 'number-one', 'best', 'mates', 'securing', 'organizers', 'parlance', 'coyne', 'sincerest', 'savouring', 'acquainted', 'bbc', 'mahon', 'people']


Both the spell checker are not doing well. We can try using either one can compare the results later.