Convert all datasets to a standard 2-columns format ('text' and 'label')

In [1]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups
from util.dataloader import DataLoader

In [2]:
dl = DataLoader()

In [3]:
%%time
data_dict = dl.load()

100%|██████████████████████████████████████████████████████████████████████████████| 624/624 [00:00<00:00, 1278.24it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 432/432 [00:00<00:00, 1376.49it/s]


  """Entry point for launching an IPython kernel.
b'Skipping line 1296: expected 1 fields, saw 21\nSkipping line 1754: expected 1 fields, saw 5\nSkipping line 2774: expected 1 fields, saw 5\n'


In [5]:
data_dict['liar']['train']

Unnamed: 0,label,text
0,0,Says the Annies List political group supports ...
1,1,When did the decline of coal start? It started...
2,2,"Hillary Clinton agrees with John McCain ""by vo..."
3,0,Health care reform legislation is likely to ma...
4,1,The economic turnaround started at the end of ...
...,...,...
10264,2,There are a larger number of shark attacks in ...
10265,2,Democrats have now become the party of the [At...
10266,1,Says an alternative to Social Security that op...
10267,0,On lifting the U.S. Cuban embargo and allowing...


In [6]:
data_dict['sarc']['train']

Unnamed: 0,label,text
0,0,"Dang dog, thanks"
1,0,to summon the powers of the flying spaghetti m...
2,0,i did that 3rd last 1 by accident last night
3,0,"He's insane, used him in DC, better than Blake..."
4,0,"Forgot about him, he's a pretty pointless card..."
...,...,...
1125673,1,So *that's* why I can point my finger and have...
1125674,1,"If the IDF said it, it must be true."
1125675,1,That's why you have to read widely from Syria ...
1125676,1,"Yeah, the Palestinian rocket actually left som..."


## 1. Fake News datasets

In [None]:
os.chdir('datasets/fake_news')

In [None]:
#Fake and Real News dataset
#TO DO 
#   - keep the title or just the text content ?
real_news = pd.read_csv('fake_and_real_news/True.csv',usecols = ['title','text'])
fake_news = pd.read_csv('fake_and_real_news/Fake.csv',usecols = ['title','text'])

real_news['label'] = np.full(shape=(real_news.shape[0],1),fill_value=1)
fake_news['label'] = np.zeros(shape=(fake_news.shape[0],1))

fake_real_news = pd.concat([real_news,fake_news])

In [None]:
fake_real_news.head() #Train Test Split needs to be done manually (guidelines in the paper?)

In [None]:
#FakeNewsNet
# 2 datasets : PolitiFacts (small) and Gossipcop (large)
# Politifacts

def retrieve_text(path):
    text = []
    no_json_count = 0
    for folder in tqdm(os.listdir(path)):
        if os.path.exists(path+folder+"/news content.json"):
            with open(path+folder+"/news content.json")  as f:
                text.append(json.load(f)['text'])
        else:
            no_json_count +=1
            text.append(' ')
    print("No json file for %s folders"%no_json_count)
    return text


real_path = "FakeNewsNet/code/fakenewsnet_dataset/politifact/real/"
fake_path = "FakeNewsNet/code/fakenewsnet_dataset/politifact/fake/"
real_text = retrieve_text(real_path)
fake_text = retrieve_text(fake_path)
print(' ')
print('There are %s real texts'%len(real_text))
print('There are %s fake texts'%len(fake_text))
label = [1] * len(real_text) + [0] * len(fake_text) 
politifact = pd.DataFrame({'text':real_text+fake_text,'label':label})

In [None]:
politifact.head()

In [None]:
politifact['text'].value_counts()[0:2]
#TO DO 
#Understand why there are 101 news folder without a json file at all
#Understand why there are 143 files without text while the paper says its 108

In [None]:
#LIAR
liar_train = pd.read_csv('liar/train.csv',
                         usecols=['label','statement']).rename(columns={'statement':'text'})
liar_val = pd.read_csv('liar/val.csv',
                         usecols=['label','statement']).rename(columns={'statement':'text'})
liar_test = pd.read_csv('liar/test.csv',
                         usecols=['label','statement']).rename(columns={'statement':'text'})

In [None]:
liar_train.head()

## 2. Topic Modelling datasets

In [None]:
os.chdir('../topic')

In [None]:
#20Newsgroup  
#https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset
#Files are downloaded once on the computer, after that they are loaded much faster
twentynews_train = fetch_20newsgroups(subset='train',
                           remove=('headers', 'footers', 'quotes') #option to remove metadata 
                          )
twentynews_test = fetch_20newsgroups(subset='test',
                           remove=('headers', 'footers', 'quotes') #option to remove metadata  
                          )    

In [None]:
twentynews_train_df = pd.DataFrame({'label':twentynews_train['target'],
                                     'text':twentynews_train['data']})
twentynews_test_df = pd.DataFrame({'label':twentynews_test['target'],
                                     'text':twentynews_test['data']})                                   

In [None]:
twentynews_train_df.head()

In [None]:
#AG News
agnews_train = pd.read_csv('ag_news/train.csv')
agnews_test = pd.read_csv('ag_news/test.csv')

In [None]:
agnews_train.head()

In [None]:
#Yahoo answers
#Text consists of the question title, question content and best answer. which elements should we keep?
#TO DO : merge the text
col_dict = {0:'label',1:'title',2:'question',3:'answer'}
yahoo_train = pd.read_csv('yahoo_answers/train.csv',
                          header=None).rename(columns=col_dict)
yahoo_test = pd.read_csv('yahoo_answers/test.csv',
                          header=None).rename(columns=col_dict)

In [None]:
yahoo_train.head()

## 3. Sentiments analysis datasets

### 3.1. Emotion

In [None]:
os.chdir('../sentiment/emotion')

In [None]:
#Tweet Eval : Emotion dection
#Labels and text  are in separate  text files
eval_emotion_train = pd.DataFrame()
eval_emotion_val = pd.DataFrame()
eval_emotion_test = pd.DataFrame()
eval_emotion_train['label'] = pd.read_table('tweetEval/datasets/train_labels.txt',header=None)
eval_emotion_val['label'] = pd.read_table('tweetEval/datasets/val_labels.txt',header=None)
eval_emotion_test['label'] = pd.read_table('tweetEval/datasets/test_labels.txt',header=None)
eval_emotion_train['text']= pd.read_table('tweetEval/datasets/train_text.txt',header=None)
eval_emotion_val['text']= pd.read_table('tweetEval/datasets/val_text.txt',header=None)
eval_emotion_test['text'] = pd.read_table('tweetEval/datasets/test_text.txt',header=None)

In [None]:
eval_emotion_train.head()

In [None]:
#CARER Emotion
carer_train = pd.read_csv('CARER/train.csv')
carer_val = pd.read_csv('CARER/val.csv')
carer_test = pd.read_csv('CARER/test.csv')

In [None]:
carer_train.head()

In [None]:
#silicone (Daily Dialog Act)
silicone_train = pd.read_csv('silicone/train.csv',
                             usecols=['Utterance','Label']).rename(columns={'Utterance':'text','Label':'label'})
siliconer_val = pd.read_csv('silicone/val.csv',
                             usecols=['Utterance','Label']).rename(columns={'Utterance':'text','Label':'label'})
silicone_test = pd.read_csv('silicone/test.csv',
                             usecols=['Utterance','Label']).rename(columns={'Utterance':'text','Label':'label'})

In [None]:
silicone_train.head()

In [None]:
#IMDb 
imdb_train = pd.read_csv('IMDb/train.csv')
imdb_test = pd.read_csv('IMDb/test.csv')

In [None]:
imdb_train.head()

### 3.2. Polarity

In [None]:
os.chdir('../polarity')

In [None]:
#YELP
yelp_train = pd.read_csv('yelp/train.csv')
yelp_test = pd.read_csv('yelp/test.csv')

In [None]:
yelp_train.head()

In [None]:
#sst2 
sst2_train = pd.read_csv('sst2/train.csv',usecols=['sentence','label']).rename(columns={'sentence':'text'})
sst2_val = pd.read_csv('sst2/val.csv',usecols=['sentence','label']).rename(columns={'sentence':'text'})
sst2_test = pd.read_csv('sst2/test.csv',usecols=['sentence','label']).rename(columns={'sentence':'text'})

In [None]:
sst2_train.head()

###  3.3. Sarcasm

In [None]:
os.chdir('../sarcasm')

In [11]:
#Tweet Eval : Irony
#Parser Error with the train text file, the parsers skips three lines as a result. Further investigation needed
eval_irony_train = pd.DataFrame()
eval_irony_test = pd.DataFrame()
eval_irony_train['label'] = pd.read_table('tweetEval/train_labels.txt',header=None)
eval_irony_test['label'] = pd.read_table('tweetEval/test_labels.txt',header=None)
eval_irony_train['text']= pd.read_table('tweetEval/train_text.txt',header=None,error_bad_lines=False)
eval_irony_test['text'] = pd.read_table('tweetEval/test_text.txt',header=None)

b'Skipping line 1296: expected 1 fields, saw 21\nSkipping line 1754: expected 1 fields, saw 5\nSkipping line 2774: expected 1 fields, saw 5\n'


In [None]:
eval_irony_train.head() 

In [6]:
#SemEval 2018
#Task A Binary, Task B Multiclass (4)
semeval_train_taskA = pd.read_table('SemEval/datasets/train/SemEval2018-T3-train-taskA.txt',
                                    usecols=['Label','Tweet text']).rename(columns = {'Label':'label','Tweet text':'text'})
semeval_train_taskB = pd.read_table('SemEval/datasets/train/SemEval2018-T3-train-taskB.txt',
                                    usecols=['Label','Tweet text']).rename(columns = {'Label':'label','Tweet text':'text'})
semeval_test_taskA = pd.DataFrame()
semeval_test_taskB = pd.DataFrame()
semeval_test_taskA['text'] = pd.read_table('SemEval/datasets/test_TaskA/SemEval2018-T3_input_test_taskA.txt',
                                    usecols=['tweet text'])
semeval_test_taskB['text'] = pd.read_table('SemEval/datasets/test_TaskB/SemEval2018-T3_input_test_taskB.txt',
                                    usecols=['tweet text'])
semeval_test_taskA['label'] = pd.read_table('SemEval/datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt',
                                    usecols=['Label'])
semeval_test_taskB['label'] = pd.read_table('SemEval/datasets/goldtest_TaskB/SemEval2018-T3_gold_test_taskB_emoji.txt',
                                    usecols=['Label'])

In [7]:
semeval_train_taskA.head()

Unnamed: 0,label,text
0,1,Sweet United Nations video. Just in time for C...
1,1,@mrdahl87 We are rumored to have talked to Erv...
2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,0,3 episodes left I'm dying over here
4,1,I can't breathe! was chosen as the most notabl...


In [8]:
semeval_train_taskB.head()

Unnamed: 0,label,text
0,1,Sweet United Nations video. Just in time for C...
1,1,@mrdahl87 We are rumored to have talked to Erv...
2,1,Hey there! Nice to see you Minnesota/ND Winter...
3,0,3 episodes left I'm dying over here
4,2,I can't breathe! was chosen as the most notabl...


In [4]:
#SARC V1.0
#balanced version. There is also a much larger unbalanced version (several GBy)
sarc_train = pd.read_csv('SARC/train-balanced.csv',
                    sep='\t',
                    header=None,
                    usecols=[0,1]).rename(columns={0:'label',1:'text'})
sarc_test = pd.read_csv('SARC/test-balanced.csv',
                    sep='\t',
                    header=None,
                    usecols=[0,1]).rename(columns={0:'label',1:'text'})

In [5]:
sarc_train.head()

Unnamed: 0,label,text
0,0,"Dang dog, thanks"
1,0,to summon the powers of the flying spaghetti m...
2,0,i did that 3rd last 1 by accident last night
3,0,"He's insane, used him in DC, better than Blake..."
4,0,"Forgot about him, he's a pretty pointless card..."
