In [1]:
import pandas as pd
import json
import glob
import itertools

To do:
- Mihalcea's multimodal deception: https://web.eecs.umich.edu/~mihalcea/downloads.html#OpenDeception
  

## Open Deception

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her open-domain deception paper.

In [26]:
# Load the Open Deception statements. Fields in this CSV are quoted with single
# quotes and use a backslash as the escape character.
input_file = 'data/OpenDeception/7Truth7LiesDataset.csv'
df = pd.read_csv(
    input_file,
    quotechar="'",
    escapechar="\\",
)
df.head(5)

Unnamed: 0,id,_gender,age,education,country,text,class
0,1_f_l_1,Female,26,Bachelors degree,Canada,There is a great deal of truth to the anti-vax...,lie
1,1_f_l_2,Female,26,Bachelors degree,Canada,Jenny mccarthy is a learned doctor who deserve...,lie
2,1_f_l_3,Female,26,Bachelors degree,Canada,Driving doesn't really require any practice.,lie
3,1_f_l_4,Female,26,Bachelors degree,Canada,Drinking and driving is a winning and safe com...,lie
4,1_f_l_5,Female,26,Bachelors degree,Canada,Good hygiene isn't really important or attract...,lie


In [27]:
# Split the statements by their `class` column. Per the original notes, 'lie'
# is the positive class and 'truth' the negative one.
data = {
    label: list(df.loc[df['class'] == label, 'text'])
    for label in ('lie', 'truth')
}

In [28]:
# Persist the label -> statements mapping as JSON.
output_file = 'output/open_deception.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Fake News

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her fake news paper.

In [29]:
# Collect every article exactly one directory below the dataset root. Without
# `recursive=True`, glob treats `**` like `*`, so this pattern matches
# `<root>/<subdir>/<name>.txt` and nothing deeper.
# sorted(): glob returns paths in arbitrary, filesystem-dependent order; sorting
# keeps the order of the exported lists reproducible across runs and machines.
files = sorted(glob.glob('data/fakeNewsDatasets/fakeNewsDataset/**/*.txt'))
legit, fake = [], []
for file in files:
    with open(file, 'r') as f:
        contents = f.read()
    # The label is inferred from the path: any path containing 'legit' is
    # legitimate news, everything else is treated as fake.
    if 'legit' in file:
        legit.append(contents)
    else:
        fake.append(contents)

In [30]:
# Bundle both article lists under their class names for export.
data = {
    'legit': legit,
    'fake': fake,
}

In [31]:
# Write the legit/fake article lists to disk as JSON.
output_file = 'output/fake_news.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Real Life Deception

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her real life deception paper.

In [32]:
# Collect every transcript exactly one directory below `Transcription/`.
# Without `recursive=True`, glob treats `**` like `*`, so nothing deeper (or
# directly inside `Transcription/`) is matched.
# sorted(): glob returns paths in arbitrary, filesystem-dependent order; sorting
# keeps the order of the exported lists reproducible across runs and machines.
files = sorted(glob.glob('data/RealLifeDeception/Transcription/**/*.txt'))
truth, lie = [], []
for file in files:
    with open(file, 'r') as f:
        contents = f.read()
    # The label is inferred from the path (case-sensitive): any path containing
    # 'truth' is a truthful transcript, everything else is treated as a lie.
    if 'truth' in file:
        truth.append(contents)
    else:
        lie.append(contents)

In [33]:
# Bundle the transcripts under the label that matches their content. The
# previous inline comments had the two labels swapped relative to the code.
data = {
    'truth': truth,  # transcripts whose path contains 'truth'
    'lie': lie,      # all remaining transcripts
}

In [35]:
# Write the truth/lie transcript lists to disk as JSON.
output_file = 'output/real_life_deception.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Football

Taken from Merullo et al.'s Football Commentary [dataset](https://arxiv.org/abs/1909.03343).

In [38]:
# Parse the football commentary JSON into memory.
input_file = 'data/football/football_15.json'
with open(input_file) as source:
    js_data = json.load(source)

In [39]:
# Split the commentary by the player's race label. Each record's `mention`
# tokens are joined with single spaces into one string.
# NOTE(review): every label other than the exact string 'white' ends up in
# `nonwhite`; confirm the dataset has no unknown/other label values.
white, nonwhite = [], []
for record in js_data.values():
    is_white = record['label']['race'] == 'white'
    text = ' '.join(record['mention'])
    (white if is_white else nonwhite).append(text)

In [40]:
# Bundle both commentary lists under their group names and write them as JSON.
data = {
    'white': white,
    'nonwhite': nonwhite,
}
output_file = 'output/football.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Professor Gender

Pulled from RateMyProfessor.

In [43]:
# Load the professor review data (comma-separated despite the .txt extension,
# since it is read with read_csv defaults).
input_file = 'data/profgender/full-data.txt'
df = pd.read_csv(filepath_or_buffer=input_file)

In [44]:
# Group review comments by the `Professor Gender` code: 'W' rows are exported
# under 'woman' and 'M' rows under 'man'. Rows with any other code are left out.
# (The previous inline comments were copied from the deception cells and
# described lie/truth labels that do not apply here.)
data = {
    'woman': list(df.loc[df['Professor Gender'] == 'W', 'Comment Text']),  # professors coded 'W'
    'man': list(df.loc[df['Professor Gender'] == 'M', 'Comment Text']),    # professors coded 'M'
}

In [45]:
# Write the woman/man comment lists to disk as JSON.
output_file = 'output/prof_gender.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Parenting

Read sentences pulled from various parenting topics from [Gao et al.](https://dl.acm.org/doi/10.1145/3411764.3445203).

In [46]:
# Load the parenting posts and preview the first rows.
input_file = 'data/parenting/0527_reddit_1300_parenting_clean.csv'
df = pd.read_csv(filepath_or_buffer=input_file)
df.head(5)

Unnamed: 0,text,label,topics
0,So I participated in the survey re: exclusive ...,1,breastfeeding
1,I've started researching what pumps my insuran...,1,breastfeeding
2,Three and a half year old while listening to E...,1,
3,"About a week ago, my 2 1/2 year old started co...",1,"economy,child product"
4,When is it positive to say your kid does not l...,1,


In [47]:
# Keep only rows with no missing field; rows without topics cannot be bucketed.
df = df.dropna()
# Parse each comma-separated `topics` cell into a list of topic names once.
topic_lists = df['topics'].str.split(',')
topics = set(itertools.chain.from_iterable(topic_lists))
data = {}
# sorted(): set iteration order is not stable between runs, so iterate in a
# deterministic order to keep the exported key order reproducible.
for topic in sorted(topics):
    # Exact membership test on the parsed list. `str.contains(topic)` would do
    # a regex substring match, so a row would also be counted for any topic
    # whose name merely appears inside (or regex-matches) its topics string.
    has_topic = topic_lists.apply(lambda names: topic in names)
    data[topic] = list(df[has_topic]['text'])

In [48]:
# Write the topic -> posts mapping to disk as JSON.
output_file = 'output/parenting.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Polarization

From Jerry Wei's news slant [dataset](https://github.com/JerryWei03/NewB).

In [50]:
# Read the liberal and conservative sentence files into one dict.
# The first two characters of every line are dropped -- presumably a label and
# separator prefix; TODO confirm against the NewB file format.
# NOTE(review): each entry keeps its trailing newline, as lines are not stripped.
data = {}
lib_file = 'data/NewB-master/liberal.txt'
con_file = 'data/NewB-master/conservative.txt'
for key, path in (('lib', lib_file), ('con', con_file)):
    with open(path, 'r') as handle:
        data[key] = [line[2:] for line in handle.readlines()]


In [51]:
# Write the lib/con sentence lists to disk as JSON.
output_file = 'output/news_slant.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Ukraine Bias

Annotated data from [Färber et al.](https://github.com/michaelfaerber/ukraine-news-bias) on news coverage during the Ukraine crisis.

In [53]:
# Load the annotated sentences. The file has no header row, so the columns get
# the integer names 0 (sentence) and 1 (binary label).
input_file = 'data/ukraine/sentences-with-binary-labels-bias.csv'
df = pd.read_csv(
    input_file,
    header=None,
)
df.head(5)

Unnamed: 0,0,1
0,Russia claims thousands fleeing Ukraine,0
1,"Russia says 143,000 Ukrainians have already le...",0
2,Thousands of Ukrainians are fleeing across the...,0
3,"According to the border services, since the be...",0
4,The head of the citizenship department of the ...,0


In [56]:
# Column 0 holds the sentence and column 1 its binary label:
# 1 is exported under 'bias', 0 under 'no bias'.
data = {
    'bias': list(df[0][df[1] == 1]),
    'no bias': list(df[0][df[1] == 0]),
}

In [57]:
# Write the bias / no-bias sentence lists to disk as JSON.
output_file = 'output/ukraine.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)

## Essays

Automated essay scoring [data](https://www.kaggle.com/c/asap-aes/data?select=training_set_rel3.xlsx) from the Hewlett Foundation.

In [2]:
# Load the essay training set: tab-separated and decoded as latin-1.
input_file = 'data/papers/training_set_rel3.tsv'
df = pd.read_csv(
    input_file,
    sep='\t',
    encoding='latin-1',
)
df.head(5)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [16]:
# Bucket essays by `domain1_score` into four half-open bands:
#   bad < 15 <= mediocre < 25 <= good < 35 <= great
# Rows with a missing score match none of the masks and are left out.
# NOTE(review): the same thresholds are applied across every `essay_set`;
# confirm that the score scales are comparable between sets.
score = df['domain1_score']
essays = df['essay']
data = {
    'bad': list(essays[score < 15]),
    'mediocre': list(essays[(score >= 15) & (score < 25)]),
    'good': list(essays[(score >= 25) & (score < 35)]),
    'great': list(essays[score >= 35]),
}

In [22]:
# Write the score-band -> essays mapping to disk as JSON.
output_file = 'output/paper.json'
with open(output_file, mode='w') as sink:
    json.dump(data, sink)