# Categorizing Emotion Words

In [66]:
#Packages to help with analysis
import matplotlib.pyplot as plt
import nltk, re, tweepy 
import numpy as np
import pandas as pd
import pyreadstat
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from collections import Counter
import sklearn
from textblob import TextBlob
from sklearn.manifold import TSNE
from textblob import Word
from spellchecker import SpellChecker
%matplotlib inline

## Let's Take a Look at the Data from The COVID-19 Well-Being Dataset

In [94]:
pd.set_option('display.max_columns', None)

In [95]:
df, meta = pyreadstat.read_sav('Emotional Well-Being - CASEL COVID-19 Webinar_Anon.sav')

In [96]:
df.head()

Unnamed: 0,StartDate,EndDate,Progress,Duration__in_seconds_,Finished,RecordedDate,ResponseId,OpenEmotions_1,OpenEmotions_3,OpenEmotions_4,ClosedEmotions_1,ClosedEmotions_2,ClosedEmotions_3,ClosedEmotions_4,ClosedEmotions_5,ClosedEmotions_6,StressCauses_1,StressCauses_12,StressCauses_13,JoyCauses_1,JoyCauses_12,JoyCauses_13,Strats_EffectiveSelf_1,Strats_EffectiveSelf_2,Strats_EffectiveSelf_3,Strats_EffectiveOthe_1,Strats_EffectiveOthe_2,Strats_EffectiveOthe_3,ER_1,ER_2,WellBeingSupport_16,WellBeingSupport_17,WellBeingSupport_18,CASELWebinars_16,CASELWebinars_17,CASELWebinars_18,Age,Race_1,Race_2,Race_3,Race_4,Race_5,Race_6,Race_7,Race_8,Race_8_TEXT,Gender,Gender_3_TEXT,Educator,Role_11,Role_12,Role_1,Role_2,Role_3,Role_4,Role_5,Role_6,Role_7,Role_8,Role_10,Role_10_TEXT,Grade_1,Grade_2,Grade_3,Grade_4,StudentSES_1
0,2020-03-24 16:47:03,2020-03-24 16:52:06,100.0,302.0,1.0,2020-03-24 16:52:06,R_Q64hfjjdCiGnM09,Anxious,Sad,Fearful,6.0,6.0,3.0,5.0,5.0,3.0,uncertainty,News,updates,kids,work,family,stop soical media,running,journaling,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-03-24 16:48:25,2020-03-24 16:52:45,100.0,259.0,1.0,2020-03-24 16:52:45,R_Anw03Hj4s7jxnfH,anxiety,fear,hope,5.0,6.0,3.0,4.0,3.0,4.0,worry about others,worry about self,loss of normal routine,connecting with others,,,yoga,connecting with others,,listen to them,provide facts,provide resources,3.0,4.0,Less cognitive demands,Show grace and understanding,Listen to what people need,,,,39.0,,,,,,,1.0,,,2.0,,2.0,,,,,,,,,,,,,,,,,
2,2020-03-24 16:51:16,2020-03-24 16:52:49,100.0,92.0,1.0,2020-03-24 16:52:49,R_32UYohfSNVrkECi,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-03-24 16:47:38,2020-03-24 16:53:13,100.0,334.0,1.0,2020-03-24 16:53:13,R_24PeWDZ57ldPxMy,Fear,Sadness,Anger,5.0,5.0,4.0,5.0,4.0,3.0,Uncertainty,Change,Finances,Family,Connectedness,Quiet,Breathing,Meditation,Exercise,Calm leadership,Meditation,Optimism,5.0,6.0,Reassurance,Non-education related activities that increase...,"Ways of increasing normalcy, routine, and conn...",,,,36.0,,,,,,,1.0,,,2.0,,1.0,1.0,,,,,,,,,,,,,,,1.0,5.0
4,2020-03-24 16:46:45,2020-03-24 16:53:26,100.0,400.0,1.0,2020-03-24 16:53:26,R_wMCGldjiVHje0o1,Panic,Terror,Depression,3.0,5.0,4.0,4.0,4.0,5.0,Fear for my health,Fear for my family’s health,Social isolation,My son’s smile,My son’s laughter,Old VMs from a friend,Yoga,EFT,Nature,Outside time,Validating feelings,Hugging,5.0,5.0,Acceptance of all feelings,Tools to use in the moment,Space,Access to physical touch,Self reg strategies,EFT,,,,,,,,1.0,,,2.0,,1.0,,,,,,,,,,,1.0,Private SLP,1.0,1.0,,1.0,8.0


### Look specifically at all the emotions people are feeling during the COVID-19 crisis

In [97]:
most_feelings_current_climate = []
aux = np.append(most_feelings_current_climate, df['OpenEmotions_1'])
aux2 = np.append(most_feelings_current_climate, df['OpenEmotions_3'])
aux3 = np.append(most_feelings_current_climate, df['OpenEmotions_4'])
all_strats = pd.Series(pd.Series([x.lower() for x in aux2]))
ordered_counts_strat = all_strats.value_counts()

#### Look at the initial most felt emotions

In [98]:
all_feelings_current_climate = all_strats
ordered_counts = all_feelings_current_climate.value_counts()
print("top ten feelings:\n", ordered_counts[0:10])
print('\n')
print("Number of Unique Feelings:", len(ordered_counts))
print('\n')
print("ten unique feelings:\n", ordered_counts[390:])

top ten feelings:
 fear           308
anxious        296
anxiety        296
overwhelmed    227
sadness        170
worry          160
frustration    147
stress         146
uncertainty    144
worried        133
dtype: int64


Number of Unique Feelings: 576


ten unique feelings:
 guilty          1
uninformed      1
surreal         1
shame           1
disturbed       1
               ..
ok              1
responsible     1
support         1
separate        1
hopefullness    1
Length: 186, dtype: int64


## Clean the data


In [99]:
# replace '-' with ' ', then remove punctuation
all_feelings_current_climate = [re.sub('-', ' ', all_feelings_current_climates) for all_feelings_current_climates in all_feelings_current_climate]
all_feelings_current_climate = [re.sub('[^\w\s]', '', all_feelings_current_climates) for all_feelings_current_climates in all_feelings_current_climate]

# replace multiple spaces by a single space
all_feelings_current_climate = [re.sub('[ ]+', ' ', all_feelings_current_climates) for all_feelings_current_climates in all_feelings_current_climate]

len(all_feelings_current_climate)

5161

In [100]:
transcript_tokenized = []
for i in np.arange(len(all_feelings_current_climate)):
    processed = all_feelings_current_climate[i].lower().strip().split(', ')
    transcript_tokenized.append(processed)
emotion_clean = np.hstack(transcript_tokenized)
len(emotion_clean)

5161

In [101]:
def is_numeric(string): ##Removes numerical responses
    for char in string:
        if char.isdigit():
            return True
    return False

def empty_string(string): ##Removes Empty Strings
    return string == ''

def remove_string(string):
    return is_numeric(string) | empty_string(string)

In [102]:
##removes empty spaces and numbers
temp = []
for token in emotion_clean:
    if not remove_string(token):
        temp.append(token)

emotion_token = temp
emotion_token

['sad',
 'fear',
 'sadness',
 'terror',
 'concern',
 'overwhelm',
 'love',
 'uncertainty',
 'joy',
 'isolated',
 'isolated',
 'fear',
 'anxious',
 'nervous',
 'empathy',
 'worried',
 'anxious',
 'anxiety',
 'worry',
 'insecurity',
 'overwhelmed',
 'fatigue',
 'vulnerable',
 'exhaustion',
 'sadness',
 'helpless',
 'exasperated',
 'fear',
 'scared',
 'boredom',
 'grateful',
 'anxious',
 'drained',
 'gratitude',
 'helping',
 'sadness',
 'nervous tension',
 'anxiety',
 'sad',
 'sad',
 'sadness',
 'anxious',
 'frustration',
 'anxiety',
 'loving to others',
 'concerned',
 'stressed',
 'fear',
 'excitement',
 'anger',
 'gratitude',
 'wonder',
 'compassion',
 'hopeful',
 'vigilant',
 'stress',
 'irritated',
 'sadness',
 'overwhelmed',
 'uncertain',
 'uncertainty',
 'worry',
 'relaxed',
 'anxiety',
 'reflective',
 'togetherness',
 'stress',
 'isolated',
 'frustrated',
 'unsure',
 'anxious',
 'frustrated',
 'anxious',
 'scared',
 'scared',
 'weariness',
 'worried for students',
 'uncertainty',
 

In [87]:
def single_and_multi(emotion_token):
    long_resp =[]
    short_resp = []
    for i in emotion_token:
            if len(i.split()) > 1:
                long_resp = np.append(long_resp, i)  
            else:
                short_resp = np.append(short_resp, i)
                
    #Returns a list of to lists short and long responses
    return [short_resp, long_resp]
                


In [88]:
len(single_and_multi(emotion_token)[0])

4938

In [89]:
single = single_and_multi(emotion_token)
def misspelled(single_words):
    spell = SpellChecker()

    misspelled = spell.unknown(single_words)
    correct = spell.known(single_words)
    list_misspelled = list(misspelled)
    return list_misspelled


wrong_words = list(misspelled(single[0]))

def corrections(misspelled):
    correction_list = []
    for word in misspelled:
        # Get the one `most likely` answer
        correction = spell.correction(word)
        correction_list = np.append(correction_list, correction)
    #returns two lists, one of the wrong words, one the correct
    return [misspelled, correction_list]

corrections(wrong_words)
correct_single_words = spell.known(single[0])
corrected_single_words = corrections(wrong_words)[1]
long_responses = single[1]

type(long_responses)
#new_word_list = correct_single_words + corrected_single_words + long_responses



numpy.ndarray

In [90]:
# correction = corrections(wrong_words)

# d = {'misspelled' : corrections[0],
#      'correction_list': corrections[1]}
# pd.DataFrame(d)
corrections(wrong_words)

[['ovewhelmed',
  'hopefullness',
  'selfconfidence',
  'compasion',
  'nervouse',
  'unmmotivated',
  'wowed',
  'reiliance',
  'disagrement',
  'disappointmemt',
  'jou',
  'curiousity',
  'discombobulated',
  'coraje',
  'sadnesz',
  'intruiged',
  'mindfulness',
  'greif',
  'aprehension',
  'oberwhelmed',
  'acceptence',
  'vigor',
  'saddness',
  'helplessnes',
  'frustation',
  'overstimulated',
  'scatterdness',
  'underappreciated',
  'preocuppied',
  'gratitud',
  'emotionalteary',
  'gloominess',
  'stressuncertainty',
  'notconsentingtothissurvey',
  'gradititude',
  'overwhlemed',
  'disappoinment',
  'sureal',
  'pensiveness',
  'lonliness',
  'cinfused',
  'optimisim',
  'wortied',
  'overthinking',
  'unuseful',
  'uncertaint',
  'fired_up',
  'overwhelemed',
  'uncentered',
  'lonley',
  'gratefulness',
  'helplesss',
  'disconnectedness',
  'micromanaged',
  'stressfull',
  'disapppointment',
  'meh',
  'releived',
  'bummed',
  'uncertainity',
  'sleepliness',
  'fru

In [103]:
d = pd.Series(emotion_token).value_counts()

df = pd.DataFrame(d)
df.to_excel("Open Emotion First Response Data Second.xlsx", sheet_name='second response')  
    

In [104]:


new_word_list = np.concatenate((list(correct_single_words), corrected_single_words, long_responses), axis = None)


len(new_word_list)


691

## Creates a list of all properly spelled words 

In [12]:
def properlyspelled(cleaned_emotion_tokens):
    long_resp = []
    short_resp = []
    for i in cleaned_emotion_tokens:
        if len(i.split()) > 1:
            long_resp = np.append(long_resp, i)  
        else:
            short_resp = np.append(short_resp, i)
    spell = SpellChecker()
    misspelled = spell.unknown(short_resp)
    list_misspelled = list(misspelled)

    list_misspelled = np.append(list_misspelled, 'disorientate')
    list_misspelled = np.append(list_misspelled, 'vunerable')

    #Remove misspelled words
    list1 = [ele for ele in short_resp if ele not in list_misspelled]
    return list1

In [25]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
   
ps = PorterStemmer() 
  
# choose some words to be stemmed 
words = properlyspelled(emotion_token)

new_words = []
for w in words:
    new_words = np.append(new_words, ps.stem(w))
    
test = pd.Series(new_words)
    
open_emotion = pd.Series(properlyspelled(emotion_token))

ordered_counts_test = test.value_counts()
ordered_counts = open_emotion.value_counts()
#print("top ten feelings:\n", ordered_counts_test[30:90])
len(words)

14651

In [20]:
##Compresses Repeated Values, so every value is unique
unique_emotion_tokens = list(set(emotion_token))
print("There are", len(unique_emotion_tokens),"unique emotion words in the emotion data")


There are 1070 unique emotion words in the emotion data


## So far, we have cleaned our data down to 1070 unique tokens form an original 15,000+ responses.

**Next Steps**
- Split the tokens into single word and multiword responses 
- Look at the misspelled words in the data
- Lemmetize the data and compress to unique values again
-
-
-
-

**There are misspelling in the data. Let deal with that by looking at the misspelled words**

### Step 1: Splits the tokens into single word and multiword responses
Calls single word responses short_resp and multiword responses long_resp

In [21]:
long_resp = []
short_resp = []
for i in unique_emotion_tokens:
    if len(i.split()) > 1:
        long_resp = np.append(long_resp, i)  
    else:
        short_resp = np.append(short_resp, i)
len(long_resp)

138

### Step 2: Looks at the misspelled words in the data
Removes misspelled words from the data

# FLAG
Should the misspelled data be completely removed?

In [22]:
spell = SpellChecker()
misspelled = spell.unknown(short_resp)
list_misspelled = list(misspelled)

list_misspelled = np.append(list_misspelled, 'disorientate')
list_misspelled = np.append(list_misspelled, 'vunerable')

#Remove misspelled words
list1 = [ele for ele in short_resp if ele not in list_misspelled]
len(list1)

767