# Categorizing Emotion Words

In [1]:
#Packages to help with analysis
import matplotlib.pyplot as plt
import nltk, re, tweepy 
import numpy as np
import pandas as pd
import pyreadstat
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
from collections import Counter
import sklearn
from textblob import TextBlob
from sklearn.manifold import TSNE
from textblob import Word
from spellchecker import SpellChecker
%matplotlib inline


## Let's Take a Look at the Data from The COVID-19 Well-Being Dataset

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
df, meta = pyreadstat.read_sav('Emotional Well-Being - CASEL COVID-19 Webinar_Anon.sav')

In [4]:
df.head()

Unnamed: 0,StartDate,EndDate,Progress,Duration__in_seconds_,Finished,RecordedDate,ResponseId,OpenEmotions_1,OpenEmotions_3,OpenEmotions_4,ClosedEmotions_1,ClosedEmotions_2,ClosedEmotions_3,ClosedEmotions_4,ClosedEmotions_5,ClosedEmotions_6,StressCauses_1,StressCauses_12,StressCauses_13,JoyCauses_1,JoyCauses_12,JoyCauses_13,Strats_EffectiveSelf_1,Strats_EffectiveSelf_2,Strats_EffectiveSelf_3,Strats_EffectiveOthe_1,Strats_EffectiveOthe_2,Strats_EffectiveOthe_3,ER_1,ER_2,WellBeingSupport_16,WellBeingSupport_17,WellBeingSupport_18,CASELWebinars_16,CASELWebinars_17,CASELWebinars_18,Age,Race_1,Race_2,Race_3,Race_4,Race_5,Race_6,Race_7,Race_8,Race_8_TEXT,Gender,Gender_3_TEXT,Educator,Role_11,Role_12,Role_1,Role_2,Role_3,Role_4,Role_5,Role_6,Role_7,Role_8,Role_10,Role_10_TEXT,Grade_1,Grade_2,Grade_3,Grade_4,StudentSES_1
0,2020-03-24 16:47:03,2020-03-24 16:52:06,100.0,302.0,1.0,2020-03-24 16:52:06,R_Q64hfjjdCiGnM09,Anxious,Sad,Fearful,6.0,6.0,3.0,5.0,5.0,3.0,uncertainty,News,updates,kids,work,family,stop soical media,running,journaling,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-03-24 16:48:25,2020-03-24 16:52:45,100.0,259.0,1.0,2020-03-24 16:52:45,R_Anw03Hj4s7jxnfH,anxiety,fear,hope,5.0,6.0,3.0,4.0,3.0,4.0,worry about others,worry about self,loss of normal routine,connecting with others,,,yoga,connecting with others,,listen to them,provide facts,provide resources,3.0,4.0,Less cognitive demands,Show grace and understanding,Listen to what people need,,,,39.0,,,,,,,1.0,,,2.0,,2.0,,,,,,,,,,,,,,,,,
2,2020-03-24 16:51:16,2020-03-24 16:52:49,100.0,92.0,1.0,2020-03-24 16:52:49,R_32UYohfSNVrkECi,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-03-24 16:47:38,2020-03-24 16:53:13,100.0,334.0,1.0,2020-03-24 16:53:13,R_24PeWDZ57ldPxMy,Fear,Sadness,Anger,5.0,5.0,4.0,5.0,4.0,3.0,Uncertainty,Change,Finances,Family,Connectedness,Quiet,Breathing,Meditation,Exercise,Calm leadership,Meditation,Optimism,5.0,6.0,Reassurance,Non-education related activities that increase...,"Ways of increasing normalcy, routine, and conn...",,,,36.0,,,,,,,1.0,,,2.0,,1.0,1.0,,,,,,,,,,,,,,,1.0,5.0
4,2020-03-24 16:46:45,2020-03-24 16:53:26,100.0,400.0,1.0,2020-03-24 16:53:26,R_wMCGldjiVHje0o1,Panic,Terror,Depression,3.0,5.0,4.0,4.0,4.0,5.0,Fear for my health,Fear for my family’s health,Social isolation,My son’s smile,My son’s laughter,Old VMs from a friend,Yoga,EFT,Nature,Outside time,Validating feelings,Hugging,5.0,5.0,Acceptance of all feelings,Tools to use in the moment,Space,Access to physical touch,Self reg strategies,EFT,,,,,,,,1.0,,,2.0,,1.0,,,,,,,,,,,1.0,Private SLP,1.0,1.0,,1.0,8.0


### Look specifically at all the emotions people are feeling during the COVID-19 crisis

In [5]:
most_feelings_current_climate = []
aux = np.append(most_feelings_current_climate, df['OpenEmotions_1'])
aux1 = np.append(aux, df['OpenEmotions_3'])
aux2strat = np.append(aux1, df['OpenEmotions_4'])

all_strats = pd.Series(pd.Series([x.lower() for x in aux2strat]))
ordered_counts_strat = all_strats.value_counts()

#### Look at the initial most felt emotions

In [7]:
all_feelings_current_climate = all_strats
ordered_counts = all_feelings_current_climate.value_counts()
ordered_counts

anxiety               1317
anxious               1103
fear                   743
overwhelmed            697
uncertainty            481
                      ... 
sleep deprived           1
lack of connection       1
compassion fatigue       1
withdrawal               1
ambiguous                1
Length: 1080, dtype: int64

## Clean the data


In [7]:
# replace '-' with ' ', then remove punctuation
all_feelings_current_climate = [re.sub('-', ' ', all_feelings_current_climates) for all_feelings_current_climates in all_feelings_current_climate]
all_feelings_current_climate = [re.sub('[^\w\s]', '', all_feelings_current_climates) for all_feelings_current_climates in all_feelings_current_climate]

# replace multiple spaces by a single space
all_feelings_current_climate = [re.sub('[ ]+', ' ', all_feelings_current_climates) for all_feelings_current_climates in all_feelings_current_climate]

len(all_feelings_current_climate)

15483

In [8]:
transcript_tokenized = []
for i in np.arange(len(all_feelings_current_climate)):
    processed = all_feelings_current_climate[i].lower().strip().split(', ')
    transcript_tokenized.append(processed)
emotion_clean = np.hstack(transcript_tokenized)
emotion_clean

array(['anxious', 'anxiety', '', ..., 'frustration', 'anxious', 'anxiety'],
      dtype='<U77')

In [9]:
def is_numeric(string): ##Removes numerical responses
    for char in string:
        if char.isdigit():
            return True
    return False

def empty_string(string): ##Removes Empty Strings
    return string == ''

def remove_string(string):
    return is_numeric(string) | empty_string(string)

In [10]:
##removes empty spaces and numbers
temp = []
for token in emotion_clean:
    if not remove_string(token):
        temp.append(token)

emotion_token = temp
emotion_token

['anxious',
 'anxiety',
 'fear',
 'panic',
 'fear',
 'anxiety',
 'uncertainty',
 'anxiety',
 'excitement',
 'fear',
 'anxious',
 'sadness',
 'overwhelmed',
 'thankful',
 'anxiety',
 'stressed',
 'thoughtful',
 'stress',
 'stress',
 'uncertainty',
 'anxious',
 'exasperation',
 'anxious',
 'anxiety',
 'anxiety',
 'stress',
 'doubtful',
 'uncertainty',
 'anxiety',
 'anxiety',
 'present',
 'overwhelm',
 'exhausted',
 'empathy',
 'concern',
 'worry',
 'anxiety',
 'stress',
 'overwhelmed',
 'worried',
 'anxiety',
 'overwhelmed',
 'anxiousness',
 'fear',
 'exciting',
 'anxious',
 'anxious',
 'anxiety',
 'anxiety',
 'anxiety',
 'anxiety',
 'worry',
 'confusion',
 'anxious',
 'anxious',
 'anxiety',
 'worried',
 'anxiety',
 'anxious',
 'confused',
 'anxiety',
 'anxiety',
 'anxious',
 'fear',
 'creative',
 'anxiety',
 'anxiety',
 'overwhlemed',
 'overwhelmed',
 'anxious',
 'uncertain',
 'stressed',
 'overwhelmed',
 'worry',
 'confused',
 'stress',
 'anxious',
 'freedom',
 'uncertainty',
 'stresse

## Creates a list of all properly spelled words 

In [12]:

short_resp = []
long_resp = []
for i in emotion_token:
    if len(i.split()) > 1:
        long_resp = np.append(long_resp, i)  
    else:
        short_resp = np.append(short_resp, i)


spell = SpellChecker()
misspelled = spell.unknown(short_resp)
list_misspelled = list(misspelled)

len(list_misspelled)

corrected_words = []
for word in list_misspelled:
    # Get the one `most likely` answer
    corrected_words = np.append(corrected_words, spell.correction(word) )
    
    

In [19]:
new_group = np.append(corrected_words, short_resp)
len(new_group)

15113

### Lemmatizes data, returning a normalized version of words (ex: "cats" returns "cat")
This should help group some words together 

In [17]:
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

processed1 = []
for token in new_group:
    processed1.append(lemmatizer.lemmatize(token))

unique_emotion_data = processed1

In [27]:
y = pd.Series(unique_emotion_data)

y2 = y.value_counts()

y3 = y2.to_frame()

y3.to_excel("Open Survey.xlsx")  






In [28]:
y3

Unnamed: 0,0
anxiety,1322
anxious,1105
fear,744
overwhelmed,707
uncertainty,486
...,...
non-productive,1
resourceful,1
releived,1
regretful,1


In [25]:
sheet_to_df_map = pd.read_excel(y3, sheet_name="Open Survey")

ValueError: Invalid file path or buffer object type: <class 'method'>

### Step 1: Splits the tokens into single word and multiword responses
Calls single word responses short_resp and multiword responses long_resp

In [21]:
long_resp = []
short_resp = []
for i in unique_emotion_tokens:
    if len(i.split()) > 1:
        long_resp = np.append(long_resp, i)  
    else:
        short_resp = np.append(short_resp, i)
len(long_resp)

138