In [20]:
# The Social Chemistry dataset is used for another task: yes/no QA.
# Rules of thumb (rots) are used, and the judgement is either yes or no,
# so the rot is either true (yes) or false (no).
# A judgement can be something like 'yes, it's okay', 'no, it's okay', 'no, it's bad' or 'yes, it's bad'.

In [21]:
import pandas as pd

In [22]:
# Read the complete Social Chemistry 101 release. The file is tab-separated
# and its first line carries the column names.
raw_tsv = 'social-chem-101.v1.0.tsv'
df = pd.read_csv(raw_tsv, header=0, sep='\t')
df

Unnamed: 0,area,m,split,rot-agree,rot-categorization,rot-moral-foundations,rot-char-targeting,rot-bad,rot-judgment,action,...,action-char-involved,action-hypothetical,situation,situation-short-id,rot,rot-id,rot-worker-id,breakdown-worker-id,n-characters,characters
0,amitheasshole,1,train,4.0,advice,loyalty-betrayal,char-1,0,it's bad,doing something that causes other people to lo...,...,char-1,hypothetical,losing trust in my friend,reddit/amitheasshole/aypvmz,It's bad to do something that causes other peo...,rot/reddit/amitheasshole/aypvmz/3K5TEWLKGYQFYA...,127,0,2,narrator|my friend
1,amitheasshole,1,dev,3.0,social-norms,loyalty-betrayal,char-0,0,expected,people participating in the big events in thei...,...,char-0,explicit-no,saying no to being a bridesmaid at a friend's ...,reddit/amitheasshole/9tzn0z,People are expected to participate in the big ...,rot/reddit/amitheasshole/9tzn0z/3EG49X351XRR9F...,89,39,3,narrator|a bridesmaid|a friend
2,amitheasshole,1,test,3.0,social-norms,care-harm|loyalty-betrayal,char-1,0,Partners should,Listening to each other's issues.,...,char-1,probable,telling my boyfriend I am bored and unhappy at...,reddit/amitheasshole/a1311q,Partners should listen to each other's issues.,rot/reddit/amitheasshole/a1311q/3JV9LGBJWWT6CZ...,111,145,2,narrator|my boyfriend
3,amitheasshole,1,dev,2.0,advice,loyalty-betrayal,char-0,0,it's okay,needing space from family.,...,char-0,probable,not wanting to be around my family,reddit/amitheasshole/akkcpn,It is okay to need space from family.,rot/reddit/amitheasshole/akkcpn/3R2PKQ87NZNW8N...,30,0,2,narrator|my family
4,amitheasshole,1,train,4.0,advice,care-harm,char-0,0,it's good,keeping things clean.,...,char-0,explicit,washing my cat's bowls in the kitchen sink,reddit/amitheasshole/aof4ml,It's good to keep things clean.,rot/reddit/amitheasshole/aof4ml/3HMIGG0U4OL3DY...,42,49,1,narrator
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355917,rocstories,50,analysis,4.0,description,,char-0,0,it's economical,Making dinner as opposed to eating out.,...,char-0,explicit,"After I cleaned everything, I made dinner and ...",rocstories/ab2bd65a-76b5-4c20-ab03-788deb38991...,Making dinner is economical as opposed to eati...,rot/rocstories/ab2bd65a-76b5-4c20-ab03-788deb3...,94,104,1,narrator
355918,rocstories,50,analysis,3.0,morality-ethics|description,fairness-cheating,char-0,0,is economical,Making dinner as opposed to eating out.,...,char-0,explicit,"After I cleaned everything, I made dinner and ...",rocstories/ab2bd65a-76b5-4c20-ab03-788deb38991...,Making dinner is economical as opposed to eati...,rot/rocstories/ab2bd65a-76b5-4c20-ab03-788deb3...,94,17,1,narrator
355919,rocstories,50,analysis,3.0,social-norms,care-harm|sanctity-degradation,char-0,0,it is,Making dinner is economical as opposed to eati...,...,char-0,explicit,"After I cleaned everything, I made dinner and ...",rocstories/ab2bd65a-76b5-4c20-ab03-788deb38991...,Making dinner is economical as opposed to eati...,rot/rocstories/ab2bd65a-76b5-4c20-ab03-788deb3...,94,23,1,narrator
355920,rocstories,50,analysis,4.0,description,,char-0,0,economical,Making dinner as opposed to eating out.,...,char-0,explicit,"After I cleaned everything, I made dinner and ...",rocstories/ab2bd65a-76b5-4c20-ab03-788deb38991...,Making dinner is economical as opposed to eati...,rot/rocstories/ab2bd65a-76b5-4c20-ab03-788deb3...,94,44,1,narrator


In [23]:
# Keep only the three columns this task needs:
#   rot          - the rule of thumb itself
#   rot-judgment - the judgement phrase (what comes after the 'yes'/'no')
#   rot-agree    - how many people would agree with the rot, i.e. whether the answer is yes or no
needed_columns = ['rot', 'rot-agree', 'rot-judgment']
df = df.loc[:, needed_columns]
df

Unnamed: 0,rot,rot-agree,rot-judgment
0,It's bad to do something that causes other peo...,4.0,it's bad
1,People are expected to participate in the big ...,3.0,expected
2,Partners should listen to each other's issues.,3.0,Partners should
3,It is okay to need space from family.,2.0,it's okay
4,It's good to keep things clean.,4.0,it's good
...,...,...,...
355917,Making dinner is economical as opposed to eati...,4.0,it's economical
355918,Making dinner is economical as opposed to eati...,3.0,is economical
355919,Making dinner is economical as opposed to eati...,3.0,it is
355920,Making dinner is economical as opposed to eati...,4.0,economical


In [24]:
# The rot-agree values mean: 0 = < 1% of people agree, 1 = 5-25%, 2 = 50%, 3 = 70-95%, 4 = >99%.
# To keep the classes clearly separated (there is nothing between yes and no), we drop 2, which means 50% of people agree.
# 0 and 1 then become no; 3 and 4 become yes.

In [25]:
# Shorter working names for the two annotation columns. Note the spelling
# change: the source column is 'rot-judgment', the new name is 'judgement'.
column_renames = {'rot-agree': 'agreement', 'rot-judgment': 'judgement'}
df = df.rename(columns=column_renames)

In [26]:
# Discard the rules rated 2 (about half of people agree): they are neither a
# clear yes nor a clear no.
ambivalent_rows = df.index[df['agreement'] == 2.0]
df = df.drop(index=ambivalent_rows)

In [27]:
# Check the column dtypes before recoding 'agreement' into yes/no labels.
df.dtypes

rot           object
agreement    float64
judgement     object
dtype: object

In [28]:
# Recode the agreement rating as a yes/no label: 0 and 1 -> 'no', 3 and 4 -> 'yes'
# (the rating 2 is dropped in an earlier cell).
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df["agreement"]: that chained form acts on an
# intermediate Series, triggers a FutureWarning in recent pandas and leaves df
# unchanged once Copy-on-Write is in effect.
# NOTE(review): rows whose rating is missing (NaN), if there are any, keep NaN
# here and would reach the exported splits without a label - confirm.
agreement_labels = {0: "no", 1: "no", 3: 'yes', 4: 'yes'}
df["agreement"] = df["agreement"].replace(agreement_labels)

In [29]:
# Spot-check ten random rows after the recoding. The sample is unseeded, so a
# different set of rows is shown on every run.
df.sample(n=10)
# 'no' rows are rare, so a sample of ten may contain few or none of them;
# let's check the label counts.

Unnamed: 0,rot,agreement,judgement
133086,It is ok to be sad when your boyfriend passes.,yes,It is ok
56126,It's wrong to tell your partner how much you s...,yes,it's wrong
307189,It's good to wish people a good morning.,yes,It's good
34865,You should take your partner's advice.,yes,you should
299300,It is wrong to touch something that isn't yours.,yes,wrong
102280,It's understandable to be frustrated by an abu...,no,It's understandable
54761,"It's expected that people are kind, understand...",yes,It's expected
62524,People should keep their possession neat and t...,yes,People should
172220,It's wrong to make an argument out of somethin...,no,{}
178508,It's okay to travel without your spouse.,yes,It's okay


In [30]:
# Count how often each label occurs.
df.agreement.value_counts()
# The labels are mostly 'yes'. Next step: create the train, dev and test sets.

yes    293743
no       8547
Name: agreement, dtype: int64

In [31]:
import numpy as np  # no longer needed for the split; kept because other cells may use np

# Shuffle all rows reproducibly (fixed seed), then cut the shuffled frame into
# 60% train / 20% dev / 20% test.
# Plain positional slicing replaces np.split(DataFrame, ...): np.split relies on
# DataFrame.swapaxes, which pandas has deprecated, so it warns on recent
# versions; the slices below pick exactly the same rows.
# NOTE(review): the split is not stratified although 'no' is rare - confirm
# this is acceptable.
df_shuffled = df.sample(frac=1, random_state=42)
trn_end = int(.6*len(df))
dev_end = int(.8*len(df))
df_trn = df_shuffled.iloc[:trn_end]
df_dev = df_shuffled.iloc[trn_end:dev_end]
df_tst = df_shuffled.iloc[dev_end:]

In [32]:
# Alias the training split under the name used by the CSV export cell.
socialchemistry_yesno_train = df_trn

In [33]:
# Alias the dev split under the name used by the CSV export cell.
socialchemistry_yesno_dev = df_dev

In [34]:
# Alias the test split under the name used by the CSV export cell.
socialchemistry_yesno_test = df_tst

In [35]:
# Write each split to its own CSV file in the yes/no output folder.
# to_csv is called with its defaults, so the row index is written as well.
# NOTE(review): the folder is an absolute, machine-specific path - consider
# making it configurable.
export_dir = r'C:\Users\maret/Thesis/prepped_csv/yesno'
for split_name, split_frame in (
    ('train', socialchemistry_yesno_train),
    ('dev', socialchemistry_yesno_dev),
    ('test', socialchemistry_yesno_test),
):
    split_frame.to_csv(f'{export_dir}/socialchemistry_yesno_{split_name}.csv')