In [23]:
import pandas as pd
import json
import glob
import itertools
import nltk
import numpy as np
from collections import defaultdict

In [2]:
# Fetch NLTK's 'punkt' package; per the recorded output this is a no-op when
# the package is already up to date.
# NOTE(review): no other visible cell uses nltk — confirm this download is
# still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/peterzhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

To do:
  - https://github.com/sfu-discourse-lab/SOCC
  - https://nextit-public.s3-us-west-2.amazonaws.com/rsics.html#all95data95by95thresholdcsv

## Open Deception

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her open-domain deception paper.

In [32]:
# This CSV quotes fields with single quotes and escapes with a backslash.
input_file = 'data/OpenDeception/7Truth7LiesDataset.csv'
csv_options = {'quotechar': "'", 'escapechar': "\\"}
df = pd.read_csv(input_file, **csv_options)
df.head()

Unnamed: 0,id,_gender,age,education,country,text,class
0,1_f_l_1,Female,26,Bachelors degree,Canada,There is a great deal of truth to the anti-vax...,lie
1,1_f_l_2,Female,26,Bachelors degree,Canada,Jenny mccarthy is a learned doctor who deserve...,lie
2,1_f_l_3,Female,26,Bachelors degree,Canada,Driving doesn't really require any practice.,lie
3,1_f_l_4,Female,26,Bachelors degree,Canada,Drinking and driving is a winning and safe com...,lie
4,1_f_l_5,Female,26,Bachelors degree,Canada,Good hygiene isn't really important or attract...,lie


In [33]:
# Bucket each statement's text by the value of its 'class' column.
is_lie = df['class'] == 'lie'
is_truth = df['class'] == 'truth'
data = {
    'lie': df.loc[is_lie, 'text'].tolist(),
    'truth': df.loc[is_truth, 'text'].tolist(),
}

In [34]:
# Serialize `data` to a single JSON document.
output_file = 'output/open_deception.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Fake News

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her fake news paper.

In [35]:
# Read every article and bucket it by whether its path contains 'legit';
# anything else is treated as fake.
files = glob.glob('data/fakeNewsDatasets/fakeNewsDataset/**/*.txt')
legit, fake = [], []
for file in files:
    with open(file, 'r') as f:
        contents = f.read()
    bucket = legit if 'legit' in file else fake
    bucket.append(contents)

In [36]:
# Collect the two article lists under their label names.
data = {'legit': legit, 'fake': fake}

In [37]:
# Serialize `data` to a single JSON document.
output_file = 'output/fake_news.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Real Life Deception

Downloaded from Rada Mihalcea's [website](https://web.eecs.umich.edu/~mihalcea/downloads.html), based on her real life deception paper.

In [38]:
def read_transcripts(paths):
    """Read trial transcripts and split them into truthful and deceptive lists.

    Args:
        paths: iterable of transcript file paths. A path containing the
            substring 'truth' is treated as truthful; every other path is
            treated as a lie.

    Returns:
        (truth, lie): two lists holding one string per transcript, with the
        '<player>' placeholder replaced by 'the player'.
    """
    truth, lie = [], []
    for path in paths:
        with open(path, 'r') as f:
            contents = f.read().replace('<player>', 'the player')
        # append, not extend: extending a list with a str adds one element per
        # character, which would shred each transcript into single letters.
        if 'truth' in path:
            truth.append(contents)
        else:
            lie.append(contents)
    return truth, lie


files = glob.glob('data/RealLifeDeception/Transcription/**/*.txt')
truth, lie = read_transcripts(files)

In [39]:
# Each key holds the list of the same name: truthful vs. deceptive transcripts.
data = {'truth': truth, 'lie': lie}

In [40]:
# Serialize `data` to a single JSON document.
output_file = 'output/real_life_deception.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Football

Taken from Merullo et al.'s Football Commentary [dataset](https://arxiv.org/abs/1909.03343).

In [41]:
# Parse the whole commentary file into a Python object.
input_file = 'data/football/football_15.json'
with open(input_file, 'r') as source:
    js_data = json.loads(source.read())

In [42]:
# Join each instance's mention tokens into one string and bucket it by the
# labelled race: 'white' vs. everything else.
white, nonwhite = [], []
for instance in js_data.values():
    race = instance['label']['race']
    commentary = ' '.join(instance['mention'])
    bucket = white if race == 'white' else nonwhite
    bucket.append(commentary)

In [43]:
# Collect both commentary lists under their label names and write them as JSON.
data = {'white': white, 'nonwhite': nonwhite}
output_file = 'output/football.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Professor Gender

Pulled from RateMyProfessor.

In [44]:
# The file has a .txt extension but is read with read_csv's default settings.
input_file = 'data/profgender/full-data.txt'
df = pd.read_csv(filepath_or_buffer=input_file)

In [45]:
# Bucket comment texts by the 'Professor Gender' code: 'W' -> woman, 'M' -> man.
gender = df['Professor Gender']
data = {
    'woman': df.loc[gender == 'W', 'Comment Text'].tolist(),
    'man': df.loc[gender == 'M', 'Comment Text'].tolist(),
}

In [46]:
# Serialize `data` to a single JSON document.
output_file = 'output/prof_gender.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Parenting

Sentences from Reddit posts on various parenting topics, from [Gao et al.](https://dl.acm.org/doi/10.1145/3411764.3445203).

In [47]:
# Load the labelled parenting posts and preview the first rows.
input_file = 'data/parenting/0527_reddit_1300_parenting_clean.csv'
df = pd.read_csv(filepath_or_buffer=input_file)
df.head()

Unnamed: 0,text,label,topics
0,So I participated in the survey re: exclusive ...,1,breastfeeding
1,I've started researching what pumps my insuran...,1,breastfeeding
2,Three and a half year old while listening to E...,1,
3,"About a week ago, my 2 1/2 year old started co...",1,"economy,child product"
4,When is it positive to say your kid does not l...,1,


In [48]:
# Drop rows with a missing value in any column (e.g. posts with no topics).
df = df.dropna()
# Each 'topics' cell is a comma-separated list such as "economy,child product".
topic_lists = df['topics'].str.split(',')
topics = set(itertools.chain.from_iterable(topic_lists))
data = {}
for topic in topics:
    # Exact membership in the split list. The previous str.contains(topic) was
    # a regex substring match, so a topic whose name is contained in another
    # topic's name (or that holds regex metacharacters) picked up wrong rows.
    has_topic = topic_lists.apply(lambda post_topics: topic in post_topics)
    data[topic] = list(df[has_topic]['text'])

In [49]:
# Serialize `data` to a single JSON document.
output_file = 'output/parenting.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Polarization

From Jerry Wei's news slant [dataset](https://github.com/JerryWei03/NewB).

In [50]:
# Read both headline files, dropping the first two characters of every line
# (presumably a label prefix — confirm against the dataset format).
# Only the front of each line is trimmed, so trailing newlines are kept.
data = {}
lib_file = 'data/NewB-master/liberal.txt'
with open(lib_file, 'r') as file:
    data['lib'] = [line[2:] for line in file]
con_file = 'data/NewB-master/conservative.txt'
with open(con_file, 'r') as file:
    data['con'] = [line[2:] for line in file]


In [51]:
# Serialize `data` to a single JSON document.
output_file = 'output/news_slant.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Ukraine Bias

Annotated data from [Farber et al.](https://github.com/michaelfaerber/ukraine-news-bias) on news coverage during the Ukraine crisis.

In [52]:
# header=None means the columns are addressed by position: 0 is the sentence,
# 1 is the binary label (see the preview).
input_file = 'data/ukraine/sentences-with-binary-labels-bias.csv'
df = pd.read_csv(filepath_or_buffer=input_file, header=None)
df.head()

Unnamed: 0,0,1
0,Russia claims thousands fleeing Ukraine,0
1,"Russia says 143,000 Ukrainians have already le...",0
2,Thousands of Ukrainians are fleeing across the...,0
3,"According to the border services, since the be...",0
4,The head of the citizenship department of the ...,0


In [53]:
# Column 1 is the binary label, column 0 the sentence text.
labels = df[1]
data = {
    'bias': df.loc[labels == 1, 0].tolist(),
    'no bias': df.loc[labels == 0, 0].tolist(),
}

In [54]:
# Serialize `data` to a single JSON document.
output_file = 'output/ukraine.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Essays

Automated essay scoring [data](https://www.kaggle.com/c/asap-aes/data?select=training_set_rel3.xlsx) from the Hewlett Foundation.

In [55]:
# Tab-separated file, decoded as latin-1.
input_file = 'data/papers/training_set_rel3.tsv'
tsv_options = {'sep': '\t', 'encoding': 'latin-1'}
df = pd.read_csv(input_file, **tsv_options)
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [56]:
# Bin essays by domain1_score into four half-open ranges:
# [.., 15), [15, 25), [25, 35), [35, ..]. Rows with a missing score fall in no bin.
# NOTE(review): the same cut-offs are applied across every essay_set — confirm
# that the sets share a comparable score scale.
score = df['domain1_score']
essay = df['essay']
data = {
    'bad': essay[score < 15].tolist(),
    'mediocre': essay[(score >= 15) & (score < 25)].tolist(),
    'good': essay[(score >= 25) & (score < 35)].tolist(),
    'great': essay[score >= 35].tolist(),
}

In [57]:
# Serialize `data` to a single JSON document.
output_file = 'output/essays.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Diplomacy

From the [Diplomacy project](https://sites.google.com/view/qanta/projects/diplomacy) dataset.

In [58]:
def load_diplomacy_messages(path):
    """Flatten one Diplomacy JSON-lines file into a message/label frame.

    Args:
        path: path to a JSON-lines file whose records carry parallel
            'messages' and 'sender_labels' lists.

    Returns:
        DataFrame with one row per message and columns 'message' and 'label'.
    """
    games = pd.read_json(path, lines=True)
    messages = list(itertools.chain.from_iterable(games['messages']))
    labels = list(itertools.chain.from_iterable(games['sender_labels']))
    return pd.DataFrame({'message': messages, 'label': labels})


files = glob.glob('data/diplomacy/*.txt')
data = {'truth': [], 'lie': []}
for file in files:
    # Read the file of this iteration. The previous code re-read files[0]
    # every time, duplicating the first file and ignoring all the others.
    df = load_diplomacy_messages(file)
    data['truth'].extend(list(df[df['label'] == True]['message']))
    data['lie'].extend(list(df[df['label'] == False]['message']))

In [59]:
# Serialize `data` to a single JSON document.
output_file = 'output/diplomacy.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## News Headlines

From the ABC "million news headlines" [dataset](https://www.kaggle.com/therohk/million-headlines), "spam clickbait catalog" [dataset](https://www.kaggle.com/therohk/examine-the-examiner), and India news headlines [dataset](https://www.kaggle.com/therohk/india-headlines-news-dataset).

In [60]:
# (input CSV, output JSON) pairs; every dataset is processed the same way.
headline_files = [
    ('data/abc_headlines/abcnews-date-text.csv', 'output/abc_headlines.json'),
    ('data/examiner_headlines/examiner-date-text.csv', 'output/examiner_headlines.json'),
    ('data/india_headlines/india-news-headlines.csv', 'output/india_headlines.json')
]
for input_file, output_file in headline_files:
    df = pd.read_csv(input_file)
    # The year is the first four characters of publish_date.
    df['year'] = df['publish_date'].astype(str).str[:4].astype(int)
    # Group headline text by year, keyed by the year as a string.
    data = {
        str(year): df.loc[df['year'] == year, 'headline_text'].tolist()
        for year in df['year'].unique()
    }
    with open(output_file, mode='w') as sink:
        json.dump(data, fp=sink)

## Search Relevance

From the "Home Depot Product Search Relevance" [dataset](https://www.kaggle.com/c/home-depot-product-search-relevance/data?select=train.csv.zip).

In [61]:
# Load the labelled search queries and preview the first rows.
input_file = 'data/search_relevance/train.csv'
df = pd.read_csv(filepath_or_buffer=input_file)
df.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [62]:
# Split search terms at a relevance score of 2.5 (the boundary counts as high).
# Both masks are explicit so rows with a missing score land in neither bucket.
relevance = df['relevance']
data = {
    'low relevance': df.loc[relevance < 2.5, 'search_term'].tolist(),
    'high relevance': df.loc[relevance >= 2.5, 'search_term'].tolist(),
}

In [63]:
# Serialize `data` to a single JSON document.
output_file = 'output/search_relevance.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Product Reviews

Reviews of products from Jianmo Ni's [Amazon Review Data](https://nijianmo.github.io/amazon/index.html#complete-data) dataset.

In [64]:
# (input JSON-lines, output JSON) pairs, one per product category.
review_files = [
    ('data/amazon_reviews/AMAZON_FASHION_5.json','output/amazon_fashion_reviews.json'),
    ('data/amazon_reviews/All_Beauty_5.json','output/beauty_reviews.json'),
    ('data/amazon_reviews/Appliances_5.json','output/appliances_reviews.json'),
    ('data/amazon_reviews/Arts_Crafts_and_Sewing_5.json','output/arts_crafts_reviews.json'),
    ('data/amazon_reviews/Cell_Phones_and_Accessories_5.json','output/phone_reviews.json'),
    ('data/amazon_reviews/Automotive_5.json','output/automotive_reviews.json')
]
for input_file, output_file in review_files:
    print(input_file)
    df = pd.read_json(input_file, lines=True)
    # Cast to str so every entry is a string (missing text becomes 'nan').
    review_text = df['reviewText'].astype(str)
    # Group review text by star rating, keyed by the rating as a string.
    data = {
        str(rating): review_text[df['overall'] == rating].tolist()
        for rating in df['overall'].unique()
    }
    with open(output_file, mode='w') as sink:
        json.dump(data, fp=sink)

data/amazon_reviews/AMAZON_FASHION_5.json
data/amazon_reviews/All_Beauty_5.json
data/amazon_reviews/Appliances_5.json
data/amazon_reviews/Arts_Crafts_and_Sewing_5.json
data/amazon_reviews/Cell_Phones_and_Accessories_5.json


## Mafia Deception

In [11]:
# Load the gzip-compressed pickle of game documents.
# NOTE(review): unpickling can execute arbitrary code — only load docs.pkl
# from a trusted source.
input_file = 'data/mafia/docs.pkl'
df = pd.read_pickle(input_file, compression='gzip')
df.head()

Unnamed: 0,game_id,author,content,inserted_at,updated_at,scum,slot_id,words,wc,punc,...,neg_em_ratio,not_ratio,anger_ratio,sensory_ratio,cog_ratio,insight_ratio,motion_ratio,tent_ratio,spp_ratio,quant_ratio
0,28480,ThAdmiral,yep /!@ \nWhat if he is scum? This option woul...,2013-05-15 01:44:00,2013-07-22 00:17:00,False,240,"[yep, what, if, he, is, scum, this, option, wo...",2688,"[/!@, ?, ', ., :, ', ., /!@, ., ., ., /!@, (, ...",...,0.013393,0.018601,0.005952,0.017857,0.081101,0.024926,0.004836,0.029762,0.012649,0.013393
1,5845,Khelvaster,"[s]Gah...I don't know anyone. Vote: SPAG, sinc...",2007-08-02 16:33:00,2007-08-27 20:07:00,False,5210,"[s, gah, i, don, t, know, anyone, vote, spag, ...",863,"[[, ], ..., ', ., :, ,, ', .[/, ], ,, ', ..., ...",...,0.006952,0.031286,0.001159,0.016222,0.077636,0.017381,0.006952,0.034762,0.02781,0.015064
2,30779,Elyse,/confirm /!@ ANNOUNCEMENT\nThere are NO jester...,2013-08-13 15:28:00,2013-10-11 00:52:00,False,447,"[confirm, announcement, there, are, no, jester...",3194,"[/, /!@, ,, ., ', !, /!@, ., :, /!@, ', /!@, '...",...,0.017533,0.0335,0.003444,0.028491,0.100188,0.027552,0.005322,0.022855,0.02943,0.011271
3,10744,Simpor,/confirm /!@ Vote: AWA\n\nFor being the last o...,2009-03-04 21:42:00,2009-03-14 21:23:00,True,6238,"[confirm, vote, awa, for, being, the, last, on...",154,"[/, /!@, :, .., /!@, ,, ?, !, /!@, ', ,, ., ',...",...,0.006494,0.025974,0.006494,0.012987,0.090909,0.019481,0.0,0.032468,0.012987,0.038961
4,10617,blakebowling,"Confirm /!@ Vote: blakebowling /!@ Unvote, Vot...",2009-02-20 02:08:00,2009-03-01 03:10:00,False,6200,"[confirm, vote, blakebowling, unvote, vote, at...",239,"[/!@, :, /!@, ,, :, ., ', ., /!@, /!@, ,, .[/,...",...,0.025105,0.037657,0.008368,0.016736,0.050209,0.012552,0.020921,0.020921,0.016736,0.0


In [12]:
# Use the boolean 'scum' column directly as the row mask.
is_scum = df['scum']
data = {
    'scum': df.loc[is_scum, 'content'].tolist(),
    'not scum': df.loc[~is_scum, 'content'].tolist(),
}

In [15]:
# Serialize `data` to a single JSON document.
output_file = 'output/mafia_deception.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Stocks Reddit

From ["Daily News for Stock Market Prediction."](https://www.kaggle.com/datasets/aaron7sun/stocknews)

In [16]:
# One row per date: a 0/1 Label plus the Top1..Top25 headline columns.
input_file = 'data/stocks_reddit/Combined_News_DJIA.csv'
df = pd.read_csv(filepath_or_buffer=input_file)
df.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


In [17]:
# Pool the headlines of every "Top*" column by the day's Label:
# 0 goes to 'down', 1 goes to 'up'.
# NOTE(review): the preview shows headlines stored with a literal b'...' /
# b"..." wrapper; they are copied as-is — confirm whether it should be stripped.
top_columns = [col for col in df.columns if "Top" in col]
went_down = df['Label'] == 0
went_up = df['Label'] == 1
data = {'down': [], 'up': []}
for col in top_columns:
    data['down'].extend(df.loc[went_down, col].tolist())
    data['up'].extend(df.loc[went_up, col].tolist())

In [21]:
# Serialize `data` to a single JSON document.
output_file = 'output/reddit_stocks.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)

## Unhealthy Conversations

From the [Unhealthy Comments Corpus](https://github.com/conversationai/unhealthy-conversations).

In [27]:
# For every attribute column, pool comments flagged 1 under the attribute name
# and comments flagged 0 under 'not_<attribute>', across all CSV files.
files = glob.glob('data/unhealthy_convo/*.csv')
data = defaultdict(list)
attributes = ['antagonize', 'condescending', 'dismissive', 'generalisation', 'generalisation_unfair', 'healthy', 'hostile', 'sarcastic']
for file in files:
    df = pd.read_csv(file)
    for attr in attributes:
        flag = df[attr]
        data[attr].extend(df.loc[flag == 1, 'comment'].tolist())
        data['not_' + attr].extend(df.loc[flag == 0, 'comment'].tolist())

In [31]:
# Serialize `data` to a single JSON document.
output_file = 'output/unhealthy_convo.json'
with open(output_file, mode='w') as sink:
    json.dump(data, fp=sink)