In [97]:
import pandas as pd
import json
import glob
import itertools

To do:
- Mihalcea's multimodal deception: https://web.eecs.umich.edu/~mihalcea/downloads.html#OpenDeception
  

## Open Deception

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her open-domain deception paper.

In [13]:
# Load the Open Deception statements. The CSV is parsed with a single quote as
# the quoting character and a backslash as the escape character.
input_file = 'data/OpenDeception/7Truth7LiesDataset.csv'
df = pd.read_csv(
    input_file,
    quotechar="'",
    escapechar="\\",
)
# Preview the first five rows (the default for head()).
df.head(5)

Unnamed: 0,id,_gender,age,education,country,text,class
0,1_f_l_1,Female,26,Bachelors degree,Canada,There is a great deal of truth to the anti-vax...,lie
1,1_f_l_2,Female,26,Bachelors degree,Canada,Jenny mccarthy is a learned doctor who deserve...,lie
2,1_f_l_3,Female,26,Bachelors degree,Canada,Driving doesn't really require any practice.,lie
3,1_f_l_4,Female,26,Bachelors degree,Canada,Drinking and driving is a winning and safe com...,lie
4,1_f_l_5,Female,26,Bachelors degree,Canada,Good hygiene isn't really important or attract...,lie


In [24]:
# Bucket the statement texts by their `class` label.
is_lie = df['class'] == 'lie'
is_truth = df['class'] == 'truth'
data = {
    'pos': df.loc[is_lie, 'text'].tolist(),    # positive denotes lie
    'neg': df.loc[is_truth, 'text'].tolist(),  # negative denotes truth
}

In [25]:
# Serialize the pos/neg lists to a single JSON file.
output_file = 'output/open_deception.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Fake News

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her fake news paper.

In [44]:
# NOTE: glob is called without recursive=True, so `**` behaves like a plain `*`
# and only matches .txt files exactly one directory below fakeNewsDataset/.
files = glob.glob('data/fakeNewsDatasets/fakeNewsDataset/**/*.txt')
pos, neg = [], []
for file in files:
    with open(file, 'r') as handle:
        contents = handle.read()
    # Paths containing 'legit' go to neg; every other file goes to pos.
    bucket = neg if 'legit' in file else pos
    bucket.append(contents)

In [47]:
# Package the two lists collected from the fake news files for export.
data = {
    'pos': pos,
    'neg': neg,
}

In [48]:
# Serialize the pos/neg lists to a single JSON file.
output_file = 'output/fake_news.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Real Life Deception

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her real life deception paper.

In [49]:
# NOTE: glob is called without recursive=True, so `**` behaves like a plain `*`
# and only matches .txt files exactly one directory below Transcription/.
files = glob.glob('data/RealLifeDeception/Transcription/**/*.txt')
pos, neg = [], []
for file in files:
    with open(file, 'r') as handle:
        contents = handle.read()
    # Paths containing the (case-sensitive) substring 'truth' go to neg;
    # every other transcript goes to pos.
    bucket = neg if 'truth' in file else pos
    bucket.append(contents)

In [52]:
# Package the two transcript lists for export.
data = {
    'pos': pos,  # lie
    'neg': neg,  # truth
}

In [53]:
# Serialize the pos/neg lists to a single JSON file.
output_file = 'output/real_life_deception.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Football

Taken from Merullo et al.'s Football Commentary [dataset](https://arxiv.org/abs/1909.03343).

In [59]:
# Read the football commentary JSON file into memory.
input_file = 'data/football/football_15.json'
with open(input_file, mode='r') as source:
    js_data = json.loads(source.read())

In [67]:
# Split the commentary by the race label attached to each instance.
pos, neg = [], []
for instance in js_data.values():
    race = instance['label']['race']
    # `mention` is joined with single spaces into one commentary string.
    commentary = ' '.join(instance['mention'])
    # 'white' goes to pos; any other label value goes to neg.
    bucket = pos if race == 'white' else neg
    bucket.append(commentary)

In [69]:
# Package the two commentary lists and write them out as JSON.
data = {
    'pos': pos,  # white
    'neg': neg,  # nonwhite
}
output_file = 'output/football.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Professor Gender

Pulled from RateMyProfessor.

In [73]:
# The file has a .txt extension but is parsed with read_csv's defaults
# (comma-separated, first row used as the header).
input_file = 'data/profgender/full-data.txt'
df = pd.read_csv(filepath_or_buffer=input_file)

In [74]:
# Split the review comments by the value of the `Professor Gender` column.
# (The earlier comments here said "lie"/"truth", copied from the deception
# cells; this split is by gender code, not by veracity.)
gender = df['Professor Gender']
data = {
    'pos': df.loc[gender == 'W', 'Comment Text'].tolist(),  # positive denotes gender code 'W'
    'neg': df.loc[gender == 'M', 'Comment Text'].tolist(),  # negative denotes gender code 'M'
}

In [75]:
# Serialize the pos/neg lists to a single JSON file.
output_file = 'output/prof_gender.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Parenting

Sentences covering various parenting topics, pulled from [Gao et al.](https://dl.acm.org/doi/10.1145/3411764.3445203).

In [94]:
# Load the parenting sentences with read_csv's default settings.
input_file = 'data/parenting/0527_reddit_1300_parenting_clean.csv'
df = pd.read_csv(filepath_or_buffer=input_file)
# Preview the first five rows (the default for head()).
df.head(5)

Unnamed: 0,text,label,topics
0,So I participated in the survey re: exclusive ...,1,breastfeeding
1,I've started researching what pumps my insuran...,1,breastfeeding
2,Three and a half year old while listening to E...,1,
3,"About a week ago, my 2 1/2 year old started co...",1,"economy,child product"
4,When is it positive to say your kid does not l...,1,


In [126]:
# Drop rows with a missing value in any column; this removes the rows that
# have no topic assigned.
df = df.dropna()
# Parse each row's comma-separated topic string once and reuse it below.
topic_lists = df['topics'].str.split(',')
topics = set(itertools.chain.from_iterable(topic_lists))
data = {}
# Iterate in sorted order so the key order of the exported JSON is the same
# on every run (plain set iteration order can differ between runs).
for topic in sorted(topics):
    # Exact membership test on the parsed topic list. `str.contains(topic)`
    # would treat the topic as a regular expression and would also match it
    # as a substring of longer topic names, pulling in unrelated rows.
    has_topic = topic_lists.apply(lambda row_topics: topic in row_topics)
    data[topic] = list(df[has_topic]['text'])

In [129]:
# Serialize the topic -> sentences mapping to a single JSON file.
output_file = 'output/parenting.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Earnings Call

Pulled from Li et al.'s [earnings call dataset](https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction).