Convert all datasets to a standard 2-columns format ('text' and 'label')

In [16]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups

## 1. Fake News datasets

In [2]:
os.chdir('datasets/fake_news')

In [34]:
#Fake and Real News dataset
#TO DO 
#   - keep the title or just the text content ?
real_news = pd.read_csv('fake_and_real_news/True.csv',usecols = ['title','text'])
fake_news = pd.read_csv('fake_and_real_news/Fake.csv',usecols = ['title','text'])

real_news['label'] = np.full(shape=(real_news.shape[0],1),fill_value=1)
fake_news['label'] = np.zeros(shape=(fake_news.shape[0],1))

fake_real_news = pd.concat([real_news,fake_news])

In [35]:
fake_real_news.head() #Train Test Split needs to be done manually (guidelines in the paper?)

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1.0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1.0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1.0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1.0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1.0


In [42]:
#FakeNewsNet
# 2 datasets : PolitiFacts (small) and Gossipcop (large)
# Politifacts

def retrieve_text(path):
    text = []
    no_json_count = 0
    for folder in tqdm(os.listdir(path)):
        if os.path.exists(path+folder+"/news content.json"):
            with open(path+folder+"/news content.json")  as f:
                text.append(json.load(f)['text'])
        else:
            no_json_count +=1
            text.append(' ')
    print("No json file for %s folders"%no_json_count)
    return text


real_path = "FakeNewsNet/code/fakenewsnet_dataset/politifact/real/"
fake_path = "FakeNewsNet/code/fakenewsnet_dataset/politifact/fake/"
real_text = retrieve_text(real_path)
fake_text = retrieve_text(fake_path)
print(' ')
print('There are %s real texts'%len(real_text))
print('There are %s fake texts'%len(fake_text))
label = [1] * len(real_text) + [0] * len(fake_text) 
politifact = pd.DataFrame({'text':real_text+fake_text,'label':label})

100%|██████████████████████████████████████████████████████████████████████████████| 624/624 [00:00<00:00, 2492.70it/s]


No json file for 70 folders


100%|██████████████████████████████████████████████████████████████████████████████| 432/432 [00:00<00:00, 2602.30it/s]

No json file for 31 folders
 
There are 624 real texts
There are 432 fake texts





In [31]:
politifact.head()

Unnamed: 0,text,label
0,,1
1,Roll Call Vote 111th Congress - 1st Session\n\...,1
2,At a press conference addressing Carrie Prejea...,1
3,,1
4,,1


In [41]:
politifact['text'].value_counts()[0:2]
#TO DO 
#Understand why there are 101 news folder without a json file at all
#Understand why there are 143 files without text while the paper says its 108

     143
     101
Name: text, dtype: int64

In [17]:
#LIAR
liar_train = pd.read_csv('liar/train.csv',
                         usecols=['label','statement']).rename(columns={'statement':'text'})
liar_val = pd.read_csv('liar/val.csv',
                         usecols=['label','statement']).rename(columns={'statement':'text'})
liar_test = pd.read_csv('liar/testn.csv',
                         usecols=['label','statement']).rename(columns={'statement':'text'})

In [19]:
liar_train.head()

Unnamed: 0,label,text
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,2,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...


## 2. Topic Modelling datasets

In [26]:
os.chdir('../topic')

In [31]:
#20Newsgroup  
#https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset
#Files are downloaded once on the computer, after that they are loaded much faster
twentynews_train = fetch_20newsgroups(subset='train',
                           remove=('headers', 'footers', 'quotes') #option to remove metadata 
                          )
twentynews_test = fetch_20newsgroups(subset='test',
                           remove=('headers', 'footers', 'quotes') #option to remove metadata  
                          )    

In [34]:
twentynews_train_df = pd.DataFrame({'label':twentynews_train['target'],
                                     'text':twentynews_train['data']})
twentynews_test_df = pd.DataFrame({'label':twentynews_test['target'],
                                     'text':twentynews_test['data']})                                   

In [35]:
twentynews_train_df.head()

Unnamed: 0,label,text
0,7,I was wondering if anyone out there could enli...
1,4,A fair number of brave souls who upgraded thei...
2,4,"well folks, my mac plus finally gave up the gh..."
3,1,\nDo you have Weitek's address/phone number? ...
4,14,"From article <C5owCB.n3p@world.std.com>, by to..."


In [40]:
#AG News
agnews_train = pd.read_csv('ag_news/train.csv')
agnews_test = pd.read_csv('ag_news/test.csv')

In [41]:
agnews_train.head()

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


In [44]:
#Yahoo answers
#Text consists of the question title, question content and best answer. which elements should we keep?
#TO DO : merge the text
col_dict = {0:'label',1:'title',2:'question',3:'answer'}
yahoo_train = pd.read_csv('yahoo_answers/train.csv',
                          header=None).rename(columns=col_dict)
yahoo_test = pd.read_csv('yahoo_answers/test.csv',
                          header=None).rename(columns=col_dict)

In [60]:
yahoo_train.head()

Unnamed: 0,label,title,question,answer
0,5,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...
1,6,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...
2,3,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...
3,7,How many planes Fedex has?,I heard that it is the largest airline in the ...,according to the www.fedex.com web site:\nAir ...
4,7,"In the san francisco bay area, does it make se...",the prices of rent and the price of buying doe...,renting vs buying depends on your goals. <br /...


## 3. Sentiments analysis datasets

### 3.1. Emotion

In [46]:
os.chdir('../sentiment/emotion')

In [57]:
#Tweet Eval : Emotion dection
#Labels and text  are in separate  text files
eval_emotion_train = pd.DataFrame()
eval_emotion_val = pd.DataFrame()
eval_emotion_test = pd.DataFrame()
eval_emotion_train['label'] = pd.read_table('tweetEval/datasets/train_labels.txt',header=None)
eval_emotion_val['label'] = pd.read_table('tweetEval/datasets/val_labels.txt',header=None)
eval_emotion_test['label'] = pd.read_table('tweetEval/datasets/test_labels.txt',header=None)
eval_emotion_train['text']= pd.read_table('tweetEval/datasets/train_text.txt',header=None)
eval_emotion_val['text']= pd.read_table('tweetEval/datasets/val_text.txt',header=None)
eval_emotion_test['text'] = pd.read_table('tweetEval/datasets/test_text.txt',header=None)

In [59]:
eval_emotion_train.head()

Unnamed: 0,label,text
0,2,“Worry is a down payment on a problem you may ...
1,0,My roommate: it's okay that we can't spell bec...
2,1,No but that's so cute. Atsu was probably shy a...
3,0,Rooneys fucking untouchable isn't he? Been fuc...
4,3,it's pretty depressing when u hit pan on ur fa...


In [64]:
#CARER Emotion
carer_train = pd.read_csv('CARER/train.csv')
carer_val = pd.read_csv('CARER/val.csv')
carer_test = pd.read_csv('CARER/test.csv')

In [63]:
carer_train.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [68]:
#silicone (Daily Dialog Act)
silicone_train = pd.read_csv('silicone/train.csv',
                             usecols=['Utterance','Label']).rename(columns={'Utterance':'text','Label':'label'})
siliconer_val = pd.read_csv('silicone/val.csv',
                             usecols=['Utterance','Label']).rename(columns={'Utterance':'text','Label':'label'})
silicone_test = pd.read_csv('silicone/test.csv',
                             usecols=['Utterance','Label']).rename(columns={'Utterance':'text','Label':'label'})

In [69]:
silicone_train.head()

Unnamed: 0,text,label
0,"say , jim , how about going for a few beers af...",4
1,you know that is tempting but is really not go...,4
2,what do you mean ? it will help us to relax .,4
3,do you really think so ? i don't . it will jus...,4
4,i guess you are right.but what shall we do ? i...,4


In [74]:
#IMDb 
imdb_train = pd.read_csv('IMDb/train.csv')
imdb_test = pd.read_csv('IMDb/test.csv')

In [73]:
imdb_train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


### 3.2. Polarity

In [75]:
os.chdir('../polarity')

In [76]:
#YELP
yelp_train = pd.read_csv('yelp/train.csv')
yelp_test = pd.read_csv('yelp/test.csv')

In [77]:
yelp_train.head()

Unnamed: 0,text,label
0,"Unfortunately, the frustration of being Dr. Go...",0
1,Been going to Dr. Goldberg for over 10 years. ...,1
2,I don't know what Dr. Goldberg was like before...,0
3,I'm writing this review to give you a heads up...,0
4,All the food is great here. But the best thing...,1


In [82]:
#sst2 
sst2_train = pd.read_csv('sst2/train.csv',usecols=['sentence','label']).rename(columns={'sentence':'text'})
sst2_val = pd.read_csv('sst2/val.csv',usecols=['sentence','label']).rename(columns={'sentence':'text'})
sst2_test = pd.read_csv('sst2/test.csv',usecols=['sentence','label']).rename(columns={'sentence':'text'})

In [83]:
sst2_train.head()

Unnamed: 0,text,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


###  3.3. Sarcasm

In [84]:
os.chdir('../sarcasm')

In [92]:
#Tweet Eval : Irony
#Parser Error with the train text file, the parsers skips three lines as a result. Further investigation needed
eval_irony_train = pd.DataFrame()
eval_irony_test = pd.DataFrame()
eval_irony_train['label'] = pd.read_table('tweetEval/train_labels.txt',header=None)
eval_irony_test['label'] = pd.read_table('tweetEval/test_labels.txt',header=None)
eval_irony_train['text']= pd.read_table('tweetEval/train_text.txt',header=None,error_bad_lines=False)
eval_irony_test['text'] = pd.read_table('tweetEval/test_text.txt',header=None)



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 1296: expected 1 fields, saw 21\nSkipping line 1754: expected 1 fields, saw 5\nSkipping line 2774: expected 1 fields, saw 5\n'


In [90]:
eval_irony_train.head() 

Unnamed: 0,label,text
0,1,seeing ppl walking w/ crutches makes me really...
1,0,"look for the girl with the broken smile, ask h..."
2,1,Now I remember why I buy books online @user #s...
3,1,@user @user So is he banded from wearing the c...
4,1,Just found out there are Etch A Sketch apps. ...


In [112]:
#SemEval 2018
#Task A Binary, Task B Multiclass (4)
semeval_train_taskA = pd.read_table('SemEval/datasets/train/SemEval2018-T3-train-taskA.txt',
                                    usecols=['Label','Tweet text']).rename(columns = {'Label':'label','Tweet text':'text'})
semeval_train_taskB = pd.read_table('SemEval/datasets/train/SemEval2018-T3-train-taskB.txt',
                                    usecols=['Label','Tweet text']).rename(columns = {'Label':'label','Tweet text':'text'})
semeval_test_taskA = pd.DataFrame()
semeval_test_taskB = pd.DataFrame()
semeval_test_taskA['text'] = pd.read_table('SemEval/datasets/test_TaskA/SemEval2018-T3_input_test_taskA.txt',
                                    usecols=['tweet text'])
semeval_test_taskB['text'] = pd.read_table('SemEval/datasets/test_TaskB/SemEval2018-T3_input_test_taskB.txt',
                                    usecols=['tweet text'])
semeval_test_taskA['label'] = pd.read_table('SemEval/datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt',
                                    usecols=['Label'])
semeval_test_taskB['label'] = pd.read_table('SemEval/datasets/goldtest_TaskB/SemEval2018-T3_gold_test_taskB_emoji.txt',
                                    usecols=['Label'])

In [137]:
semeval_train_taskA.head()

Unnamed: 0,label,text
0,1,Sweet United Nations video. Just in time for C...
1,1,@mrdahl87 We are rumored to have talked to Erv...
2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,0,3 episodes left I'm dying over here
4,1,I can't breathe! was chosen as the most notabl...


In [138]:
semeval_train_taskB.head()

Unnamed: 0,label,text
0,1,Sweet United Nations video. Just in time for C...
1,1,@mrdahl87 We are rumored to have talked to Erv...
2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,0,3 episodes left I'm dying over here
4,2,I can't breathe! was chosen as the most notabl...


In [115]:
#SARC V1.0
#balanced version. There is also a much larger unbalanced version (several GBy)
sarc_train = pd.read_csv('SARC/train-balanced.csv',
                    sep='\t',
                    header=None,
                    usecols=[0,1]).rename(columns={0:'label',1:'text'})
sarc_test = pd.read_csv('SARC/test-balanced.csv',
                    sep='\t',
                    header=None,
                    usecols=[0,1]).rename(columns={0:'label',1:'text'})

In [116]:
sarc_train.head()

Unnamed: 0,label,text
0,0,"Dang dog, thanks"
1,0,to summon the powers of the flying spaghetti m...
2,0,i did that 3rd last 1 by accident last night
3,0,"He's insane, used him in DC, better than Blake..."
4,0,"Forgot about him, he's a pretty pointless card..."
