## Data Wrangling

### Libraries:

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

#Statistics:
import scipy as scipy
from scipy import stats
from scipy.stats import chi2_contingency

### Functions:

In [2]:
def total_emotions(df, list_of_emotions):
    """Add a row-wise emotion count to ``df`` and return a short preview.

    The columns named in ``list_of_emotions`` are summed across each row and
    the result is stored in a ``total_emotions`` column. ``df`` itself is
    modified (the column is added in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one column per emotion.
    list_of_emotions : list of str
        Names of the emotion columns to add up.

    Returns
    -------
    pandas.DataFrame
        The first three rows of the updated frame, meant as a sanity check.
    """
    # Select the emotion columns, then add them up across each row
    emotion_values = df[list_of_emotions]
    df["total_emotions"] = emotion_values.sum(axis="columns")

    return df.head(n=3)

In [3]:
def emotion_binarizer(df, list_of_emotions):
    """Turn emotion counts into 0/1 flags in place and return a short preview.

    Every column named in ``list_of_emotions`` is overwritten so that any
    value greater than or equal to 1 becomes 1 and everything else becomes 0.
    ``df`` itself is modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one numeric column per emotion.
    list_of_emotions : list of str
        Names of the emotion columns to binarize.

    Returns
    -------
    pandas.DataFrame
        The first three rows of the updated frame, meant as a sanity check.
    """
    for emotion in list_of_emotions:
        # Vectorised comparison: same 0/1 result as a per-cell lambda, without
        # calling a Python function once for every row.
        df[emotion] = (df[emotion] >= 1).astype("int64")

    return df.head(3)

### Uploading the documents:

In [4]:
#Downloading the files
# The three parts of the GoEmotions full dataset are read straight from their public
# storage.googleapis.com URLs, so this cell needs network access every time it runs.
raw_df1 = pd.read_csv('https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv')
raw_df2 = pd.read_csv('https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv')
raw_df3 = pd.read_csv('https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv')

In [5]:
# Stack the three parts into one DF, renumbering the rows from 0
raw_df4 = pd.concat([raw_df1, raw_df2, raw_df3], ignore_index=True)

# Sanity check:
raw_df4.head(n=3)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Remove the column-width limit so the full comments are displayed
pd.options.display.max_colwidth = None

In [7]:
# Every column present in the combined DF
column_list = list(raw_df4.columns)

# Columns that describe the comment or the rating rather than an emotion
columns_to_exclude = (
    "text",
    "author",
    "subreddit",
    "link_id",
    "parent_id",
    "created_utc",
    "rater_id",
    "id",
    "example_very_unclear",
)

# Whatever is left is an emotion label
emotions_list = [name for name in column_list if name not in columns_to_exclude]

# Review:
print(emotions_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


### Reviewing duplicates and NaNs

In [8]:
#High level info: column names, non-null counts and dtypes of the combined DF
raw_df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211225 entries, 0 to 211224
Data columns (total 37 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   text                  211225 non-null  object 
 1   id                    211225 non-null  object 
 2   author                211225 non-null  object 
 3   subreddit             211225 non-null  object 
 4   link_id               211225 non-null  object 
 5   parent_id             211225 non-null  object 
 6   created_utc           211225 non-null  float64
 7   rater_id              211225 non-null  int64  
 8   example_very_unclear  211225 non-null  bool   
 9   admiration            211225 non-null  int64  
 10  amusement             211225 non-null  int64  
 11  anger                 211225 non-null  int64  
 12  annoyance             211225 non-null  int64  
 13  approval              211225 non-null  int64  
 14  caring                211225 non-null  int64  
 15  

In [9]:
# Count missing cells over the whole DF and rows that are exact duplicates
nan_total = raw_df4.isna().sum().sum()
duplicate_total = raw_df4.duplicated().sum()

print(f"""
      - Number of NaN values: {nan_total}
      - Number of duplicate values: {duplicate_total}
      """)


      - Number of NaN values: 0
      - Number of duplicate values: 0
      


### Deleting the "example_very_unclear" rows and column

In [10]:
# Reviewing the comments flagged as "example very unclear".
# The flag column is boolean, so it can be used directly as the row mask.
unclear_mask = raw_df4["example_very_unclear"]
raw_df4[unclear_mask]

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
1,">sexuality shouldn’t be a grouping category It makes you different from othet ppl so imo it fits the definition of ""grouping""",eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1.548084e+09,37,True,0,...,0,0,0,0,0,0,0,0,0,0
62,Oi us cancers don’t want him either! We respectfully pass his birthday to April,ee0ai3t,schwiftypants39,Justfuckmyshitup,t3_afgkgb,t1_edzkirq,1.547418e+09,22,True,0,...,0,0,0,0,0,0,0,0,0,0
195,I unfortunately can not afford a lawyer. Luckily the only thing that we shared is our daughter,ed0rtl7,Renissancelady,Divorce,t3_abjqns,t1_ed0rm5b,1.546364e+09,12,True,0,...,0,0,0,0,0,0,0,0,0,0
262,OMG THOSE TINY SHOES! *desire to boop snoot intensifies*,eei8tja,I_69_WITH_DOGS,chicago,t3_ahu4m4,t3_ahu4m4,1.547963e+09,61,True,0,...,0,0,0,0,0,0,0,0,0,0
264,"I would pay money for [NAME] to not be my nurse. She will probably ""accudentally"" squeeze my nuts. Crazy bitch.",ee8zjas,-Azwel-,SoulCalibur,t3_agt3qw,t3_agt3qw,1.547697e+09,22,True,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211037,_____ is ruining _____ because kids these days.,edrbjwb,LeastCoordinatedJedi,moviescirclejerk,t3_aejtjn,t1_edqe5al,1.547164e+09,57,True,0,...,0,0,0,0,0,0,0,0,0,0
211082,"yea I agree, and [NAME] is way younger so has trade value. [NAME] is still an offensive powerhouse but his DWAR is slumping.",ed99rjp,DukeofPoundtown,Dodgers,t3_ackdki,t1_ed98zg9,1.546640e+09,37,True,0,...,0,0,0,0,0,0,0,0,0,0
211113,"Huh, assumed he was more of a grindr fella .....",efecc3d,Supreme_Dear_Leader,90dayfianceuncensored,t3_alicxq,t3_alicxq,1.548890e+09,61,True,0,...,0,0,0,0,0,0,0,0,0,0
211214,Well when you’ve imported about a gazillion of them I or your country it’s gets serious.,ef28nod,5inchloser,nottheonion,t3_ak26t3,t3_ak26t3,1.548553e+09,61,True,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Keep only the rows that are not flagged as very unclear, then remove the
# flag column itself since it won't add information after the filter
is_clear = ~raw_df4["example_very_unclear"]
raw_df5 = raw_df4[is_clear].drop(columns="example_very_unclear")

# Sanity check:
raw_df5.head(n=3)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,admiration,amusement,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,0,0,...,1,0,0,0,0,0,0,0,0,0


In [12]:
#Binarizing the non-aggregate DF

#Creating a new raw DF (a copy, so raw_df5 keeps its original values)
raw_df6 = raw_df5.copy()

#Binarizing it: emotion_binarizer overwrites the emotion columns of raw_df6 in place
#and returns only the first three rows, which is what this cell displays
emotion_binarizer(raw_df6, emotions_list)


Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,admiration,amusement,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,0,0,...,1,0,0,0,0,0,0,0,0,0


In [13]:
#Adding the total emotions for the non-aggregate DF
#(total_emotions adds the "total_emotions" column to raw_df6 in place; the displayed value is its 3-row preview)
total_emotions(raw_df6, emotions_list)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,admiration,amusement,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,total_emotions
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,0,0,...,0,0,0,0,0,0,0,0,1,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,0,0,...,0,0,0,0,0,0,0,0,0,1


### Consolidating DF

In [14]:
# Repeated and unique IDs: compare the number of distinct comment ids with the number of rows
comment_ids = raw_df5["id"]
print(f"""
- Unique comments: {comment_ids.nunique()}
- Total comments: {comment_ids.count()}
      """)


- Unique comments: 58009
- Total comments: 207814
      


In [15]:
# List of sentiment columns
# All columns of the filtered DF
cols = list(raw_df5.columns)

# Columns left out of the aggregation ("id" is not listed, so it stays in the result)
cols_to_exclude = (
    "text",
    "author",
    "subreddit",
    "link_id",
    "parent_id",
    "created_utc",
    "rater_id",
)

# Final list of columns: the id followed by the emotion labels
final_cols = [name for name in cols if name not in cols_to_exclude]

# Sanity check
print(final_cols)

['id', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [16]:
# Aggregate DF: an independent copy holding only the columns in final_cols
agg_df = raw_df5.loc[:, final_cols].copy()

# Sanity check:
agg_df.head(n=3)

Unnamed: 0,id,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,ed2mah1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [17]:
# Grouping the emotions: add up every emotion column over the rows that share
# an id (the id becomes the index of the result)
agg_df = agg_df.groupby("id").agg("sum")

# Sanity check:
agg_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58009 entries, eczazk6 to efhcuxi
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   admiration      58009 non-null  int64
 1   amusement       58009 non-null  int64
 2   anger           58009 non-null  int64
 3   annoyance       58009 non-null  int64
 4   approval        58009 non-null  int64
 5   caring          58009 non-null  int64
 6   confusion       58009 non-null  int64
 7   curiosity       58009 non-null  int64
 8   desire          58009 non-null  int64
 9   disappointment  58009 non-null  int64
 10  disapproval     58009 non-null  int64
 11  disgust         58009 non-null  int64
 12  embarrassment   58009 non-null  int64
 13  excitement      58009 non-null  int64
 14  fear            58009 non-null  int64
 15  gratitude       58009 non-null  int64
 16  grief           58009 non-null  int64
 17  joy             58009 non-null  int64
 18  love            58009 n

In [18]:
# Selecting the non-emotion columns that describe each comment, so they can be
# joined with the agg DF later on
comment_info_cols = ["id", "text", "author", "subreddit", "link_id", "parent_id", "created_utc"]
agg_df2 = raw_df5[comment_info_cols]

# Sanity check:
agg_df2.head(n=3)

Unnamed: 0,id,text,author,subreddit,link_id,parent_id,created_utc
0,eew5j0j,That game hurt.,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0
3,eeibobj,Man I love reddit.,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0


In [19]:
# Collapsing agg_df2 to one row per id by keeping the first value of each column
# within every id group; this choice has to be reviewed during the EDA
first_per_id = agg_df2.groupby("id").first()

# Turn the id index back into a regular column
df_agg2_grouped = first_per_id.reset_index()

# Sanity check:
df_agg2_grouped.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58009 entries, 0 to 58008
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           58009 non-null  object 
 1   text         58009 non-null  object 
 2   author       58009 non-null  object 
 3   subreddit    58009 non-null  object 
 4   link_id      58009 non-null  object 
 5   parent_id    58009 non-null  object 
 6   created_utc  58009 non-null  float64
dtypes: float64(1), object(6)
memory usage: 3.1+ MB


In [20]:
# Merging both aggregate DFs: attach the summed emotion columns (keyed by id)
# to the one-row-per-id comment information
agg_df3 = df_agg2_grouped.merge(agg_df, how="left", on="id")

# Sanity check:
agg_df3.head(n=10)

Unnamed: 0,id,text,author,subreddit,link_id,parent_id,created_utc,admiration,amusement,anger,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eczazk6,Fast as [NAME] will carry me. Seriously uptown to downtown via Claiborne or broad isn’t that bad,Uptownorbust,NewOrleans,t3_abbbq3,t1_eczavr9,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,eczb07q,You blew it. They played you like a fiddle.,ima_coder,self,t3_abca3b,t3_abca3b,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,eczb4bm,TL;DR No more Superbowls for [NAME]. Get ready for another winning season that ends in disappointment.,social_psycho,steelers,t3_ab8t8l,t3_ab8t8l,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,eczb527,So much time saved. Not.,TouristsOfNiagara,Roadcam,t3_abal7g,t3_abal7g,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,eczb6r7,"Emotes have a ridiculous amount of effort put into them, which only makes the fact that nobody uses them sadder",Reggiardito,Blackops4,t3_abasbo,t1_ecz9n7h,1546301000.0,0,0,0,...,0,0,0,0,1,0,0,1,0,2
5,eczb770,Just life.. I feel like i'm just a parasite that should be removed.,itsuharo,depression,t3_abcbco,t1_ecz729w,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,2,0,1
6,eczbbmd,Thanks for the advice man ! Have a great New Year’s Eve,hypersito,dxm,t3_ab5vxi,t1_ecz71ll,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,eczbbwt,*they were not* 😂 perfect. I’m sure they were - that mental image is pretty awesome lol,LadyKingsella,AskWomen,t3_ab9xxa,t1_ecz9qh5,1546301000.0,0,2,0,...,0,0,0,0,1,0,0,0,0,1
8,eczbdg4,Glad to hear it. You deserve your best life without that abuse and negativity.,Jehosheba,raisedbynarcissists,t3_ab5yay,t1_ecz8ca7,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,eczbe89,To be fair you haev to have a very high iq to understand this meme,Unidan_nadinU,4PanelCringe,t3_ab6q9o,t1_ecyu248,1546301000.0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [21]:
#Binarizing the emotions: the grouped sums can be larger than 1, so each emotion
#column of agg_df3 is overwritten in place with 1 (value >= 1) or 0
emotion_binarizer(agg_df3, emotions_list)



Unnamed: 0,id,text,author,subreddit,link_id,parent_id,created_utc,admiration,amusement,anger,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eczazk6,Fast as [NAME] will carry me. Seriously uptown to downtown via Claiborne or broad isn’t that bad,Uptownorbust,NewOrleans,t3_abbbq3,t1_eczavr9,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,eczb07q,You blew it. They played you like a fiddle.,ima_coder,self,t3_abca3b,t3_abca3b,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,eczb4bm,TL;DR No more Superbowls for [NAME]. Get ready for another winning season that ends in disappointment.,social_psycho,steelers,t3_ab8t8l,t3_ab8t8l,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
#New column "total_emotions": added to agg_df3 in place as the row-wise sum of the
#emotion columns; the displayed value is the helper's 3-row preview
total_emotions(agg_df3, emotions_list)

Unnamed: 0,id,text,author,subreddit,link_id,parent_id,created_utc,admiration,amusement,anger,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,total_emotions
0,eczazk6,Fast as [NAME] will carry me. Seriously uptown to downtown via Claiborne or broad isn’t that bad,Uptownorbust,NewOrleans,t3_abbbq3,t1_eczavr9,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,eczb07q,You blew it. They played you like a fiddle.,ima_coder,self,t3_abca3b,t3_abca3b,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,eczb4bm,TL;DR No more Superbowls for [NAME]. Get ready for another winning season that ends in disappointment.,social_psycho,steelers,t3_ab8t8l,t3_ab8t8l,1546301000.0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# Reviewing the total emotions: how many comments have each total
emotion_totals = agg_df3["total_emotions"]
print(emotion_totals.value_counts())

# Reviewing the maximum number:
max_emotions = emotion_totals.max()

# List of the texts with max emotions:
agg_df3[emotion_totals == max_emotions]


2     20224
3     12348
4      9054
1      8512
5      5143
6      1946
7       601
8       129
9        34
10       10
11        4
13        2
12        2
Name: total_emotions, dtype: int64


Unnamed: 0,id,text,author,subreddit,link_id,parent_id,created_utc,admiration,amusement,anger,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,total_emotions
10096,edelx9y,Two or three anti depressants before I told them a lie about how I tried my moms valium and it worked,Bluechimp1,benzodiazepines,t3_acxq4h,t1_ede5mxn,1546802000.0,1,0,0,...,1,1,1,1,0,1,0,0,1,13
11414,edgmob1,"Welcome to racial integration, where your civility standards are racist and you better shut up or else.",5GcZiD42zhg8byjEpbPI,TrueOffMyChest,t3_ad9ju5,t3_ad9ju5,1546856000.0,1,0,1,...,1,0,0,0,0,0,1,0,0,13


In [24]:
#Checkpoint of the aggregate DF: agg_df4 is a copy, so changes made to it leave agg_df3 untouched
agg_df4 = agg_df3.copy()

### Cleaning created UTC

- Since the IDs were consolidated into one row per comment, `created_utc` no longer makes sense in the aggregate DF, so I will drop it there.
- In the non-aggregate DF, I will convert `created_utc` to a datetime to make it more readable for the EDA in the next notebook.

In [25]:
# Deleting the utc from the aggregate.
# agg_df4 is rebuilt from agg_df3 with a non-mutating drop so this cell can be
# re-run safely: an in-place drop on agg_df4 raises a KeyError on the second
# run because the column is already gone.
agg_df4 = agg_df3.drop(columns="created_utc")

# Sanity check:
agg_df4.head(3)

Unnamed: 0,id,text,author,subreddit,link_id,parent_id,admiration,amusement,anger,annoyance,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,total_emotions
0,eczazk6,Fast as [NAME] will carry me. Seriously uptown to downtown via Claiborne or broad isn’t that bad,Uptownorbust,NewOrleans,t3_abbbq3,t1_eczavr9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,eczb07q,You blew it. They played you like a fiddle.,ima_coder,self,t3_abca3b,t3_abca3b,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,eczb4bm,TL;DR No more Superbowls for [NAME]. Get ready for another winning season that ends in disappointment.,social_psycho,steelers,t3_ab8t8l,t3_ab8t8l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# Changing the datatype of the non aggregated DF: the values are read as
# seconds (unit="s") and converted to datetimes
created_at = pd.to_datetime(raw_df6["created_utc"], unit="s")
raw_df6["created_utc"] = created_at

# Sanity check:
raw_df6.head(n=2)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,admiration,amusement,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,total_emotions
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,2019-01-25 01:50:39,1,0,0,...,0,0,0,0,0,0,1,0,0,1
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,2019-01-02 11:15:44,37,0,0,...,0,0,0,0,0,0,0,0,1,1


### Saving the file

In [27]:
#Saving the aggregate DF
# NOTE(review): no `index=` argument is passed, so the DataFrame index is presumably written
# as an extra, unnamed first column — confirm the next notebook expects that when reading the file.
agg_df4.to_csv('../data/aggregate_post_wrangling.csv')


In [28]:
#Saving the non-aggregate DF
# NOTE(review): no `index=` argument is passed, so the DataFrame index is presumably written
# as an extra, unnamed first column — confirm the next notebook expects that when reading the file.
raw_df6.to_csv('../data/raw_post_wrangling.csv')