In [1]:
# import relevant packages
import pandas as pd
import csv
import re
import fasttext
import emoji
import unicodedata

from html import unescape
from sklearn.model_selection import train_test_split

## Load Unlabelled Gab Corpus

In [2]:
%%time

# load posts created in 2018 or later from the unlabelled corpus into lists

# initialise empty lists --> faster than appending to dict
texts = []
dates = []

# initialise counter var for counting iterations
counter = 0
sample_freq = 1 # keep every n-th row with n = sample_freq (1 = keep every row)

print_freq = 1000000 # print progress every n posts with n = print_freq

# iterate over each line
# - encoding is pinned so reading does not depend on the platform's default locale
#   (assumes the file is UTF-8 encoded -- TODO confirm)
# - newline='' is what the csv module asks for when it is fed from a file, so that
#   line breaks inside quoted fields reach the parser untranslated
with open('../../0_data/raw/gabposts_clean_170221.csv', 'r', encoding='utf-8', newline='') as read_obj:
    # strip NUL characters from every line before it reaches the csv parser
    csv_dict_reader = csv.DictReader(x.replace('\0', '') for x in read_obj)
    for row in csv_dict_reader:
        # plain string comparison: keeps rows whose created_at string sorts at or after '2018'
        if (row['created_at'] >= '2018') and (counter % sample_freq) == 0:
            texts.append(row['text'])
            dates.append(row['created_at'])
        counter+=1
        if counter % print_freq == 0:
            print(counter)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
CPU times: user 3min 1s, sys: 6.85 s, total: 3min 8s
Wall time: 3min 10s


In [3]:
%%time

# create dataframe from lists
texts = pd.Series(texts, name = 'text')
dates = pd.Series(dates, name = 'created_at')
sample_df = pd.concat([texts, dates], axis=1)

# clear out RAM
del texts
del dates

# convert dtypes
# (the unit is spelled out because pandas >= 2.0 rejects the unit-less 'datetime64' alias)
sample_df['created_at']= sample_df.created_at.astype('datetime64[ns]')
sample_df['text']= sample_df.text.astype('string')

# print finished df
sample_df

CPU times: user 33.7 s, sys: 2min 52s, total: 3min 26s
Wall time: 4min 23s


Unnamed: 0,text,created_at
0,watching American wierwolf in London hahah bet...,2018-01-01 00:00:01
1,Happy New Year everybody https://hooktube.com/...,2018-01-01 00:00:02
2,Praying! Thank God she's alive.,2018-01-01 00:00:04
3,,2018-01-01 00:00:04
4,Trump Vindicated Again! https://www.youtube.co...,2018-01-01 00:00:04
...,...,...
20360810,"Just one,'' I'm better than you'' idiots opini...",2018-10-29 03:01:47
20360811,Very true.. #BigPharma #corruption,2018-10-29 03:02:17
20360812,"""Finkelstein recently studied millions of comm...",2018-10-29 03:03:58
20360813,https://thedailycoin.org/2018/10/28/movie-prop...,2018-10-29 03:05:45


## Perform Additional Text Cleaning

In [4]:
# work on an independent (deep) copy so that sample_df stays untouched and never has to be reloaded
text_df = sample_df.copy(deep=True)

In [5]:
%%time

# Define function to clean text
def clean(text):
    """Normalise one raw post.

    Steps, in order: unescape HTML entities, replace @-mentions with '[USER]',
    replace URLs with '[URL]', replace every single character that the emoji
    package lists as an emoji with '[EMOJI]', strip surrounding whitespace and
    finally apply Unicode NFKD normalisation.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The cleaned text.
    """
    # emoji < 2.0 exposes its lookup table as UNICODE_EMOJI['en']; emoji >= 2.0
    # removed that attribute in favour of EMOJI_DATA. Both are keyed by the emoji
    # string, so a membership test works the same way on either.
    try:
        emoji_table = emoji.UNICODE_EMOJI['en']
    except AttributeError:
        emoji_table = emoji.EMOJI_DATA

    # convert html
    text = unescape(text)

    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USER]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    # the check is per character, so an emoji made of several code points is not matched as one unit
    text = ''.join('[EMOJI]' if (char in emoji_table) else char for char in text).strip()

    # clean misformatting (e.g. "\xa0")
    text = unicodedata.normalize("NFKD", text)

    return text

# add a clean_text column holding the cleaned version of every post
text_df['clean_text'] = text_df['text'].apply(clean)

CPU times: user 7min 18s, sys: 1min 27s, total: 8min 46s
Wall time: 11min 29s


In [6]:
%%time 

# remember how many posts there are before the empty ones are removed
n_docs = len(text_df)

# keep only the rows whose cleaned text is not the empty string
has_content = text_df['clean_text'].values != ""
text_df = text_df[has_content]

print(f'{n_docs} posts, of which {n_docs - len(text_df)} were dropped for empty string content')
print(f'{len(text_df)} posts remain. \n')

20360815 posts, of which 1187729 were dropped for empty string content
19173086 posts remain. 

CPU times: user 10.7 s, sys: 1min 32s, total: 1min 43s
Wall time: 4min 32s


In [7]:
%%time

# remember how many posts there are before the placeholder-only ones are removed
n_docs = len(text_df)

# a post is placeholder-only when its cleaned text is exactly one of the special tokens
placeholder_only = text_df['clean_text'].isin(["[URL]", "[EMOJI]", "[USER]"])
text_df = text_df[~placeholder_only]

print(f'{n_docs} posts, of which {n_docs - len(text_df)} were dropped for being just [URL], [EMOJI] or [USER]')
print(f'{len(text_df)} posts remain. \n')

19173086 posts, of which 1598858 were dropped for being just [URL], [EMOJI] or [USER]
17574228 posts remain. 

CPU times: user 16.1 s, sys: 1min 48s, total: 2min 5s
Wall time: 5min 46s


In [8]:
%%time

# load the fastText model that check_language uses for its predictions
lid_model_path = '../../0_models/lang_detect/lid.176.bin'
fmodel = fasttext.load_model(lid_model_path)

def check_language(text):
    """Label a post as 'English' or 'non-English' from the top-3 predictions of fmodel.

    Parameters
    ----------
    text : str
        Post text; line breaks are replaced by spaces before prediction.

    Returns
    -------
    str
        'non-English' if the top label is not English and its probability is
        above 0.50; otherwise 'English' if English is among the top 3 labels;
        otherwise 'non-English'.
    """
    # fastText predicts one line at a time and raises ValueError when the input
    # contains '\n', so line breaks are collapsed to spaces first
    labels, probabilities = fmodel.predict(text.replace('\n', ' '), k=3)

    # if top prediction is certain and not English, return non-English
    if (labels[0]!='__label__en') and (probabilities[0]>0.50):
        return 'non-English'

    # else if English is one of top 3 predictions, return English
    elif '__label__en' in labels:
        return 'English'

    # else return non-English
    else:
        return 'non-English'

# remember how many posts there are before the non-English ones are removed
n_docs = len(text_df)

# classify the raw text of every post and keep only the rows labelled English
is_english = text_df['text'].apply(lambda post: check_language(post) == 'English')
text_df = text_df[is_english]

print(f'{n_docs} posts, of which {n_docs - len(text_df)} were dropped for (most likely) not being in English.')
print(f'{len(text_df)} posts remain. \n')



17574228 posts, of which 1423284 were dropped for (most likely) not being in English.
16150944 posts remain. 

CPU times: user 13min 34s, sys: 2min 28s, total: 16min 2s
Wall time: 20min 52s


## Write to Text Files

In [9]:
%%time

# create monthyear column for easier sorting
# (the vectorised .dt accessor yields the same monthly Period per row as calling
#  Timestamp.to_period('M') row by row, without the slow Python-level loop)
text_df['monthyear'] = text_df.created_at.dt.to_period('M')

CPU times: user 5min 14s, sys: 1min 44s, total: 6min 58s
Wall time: 8min 30s


In [10]:
%%time

## monthly train and test sets

TRAIN_SIZE = 1000000
TEST_SIZE = 10000

# both dicts map a month (Period) to the Series of cleaned posts selected for it
export_train = {}
export_test = {}

for my in pd.unique(text_df.monthyear):
    # draw TRAIN_SIZE+TEST_SIZE posts from this month (sample() raises ValueError if the
    # month holds fewer posts), then split them into disjoint train and test sets
    month_sample = text_df[text_df.monthyear==my].sample(TRAIN_SIZE+TEST_SIZE, random_state=123).clean_text
    export_train[my], export_test[my] = train_test_split(month_sample,
                                                 train_size = TRAIN_SIZE, test_size = TEST_SIZE, random_state=123)

    # encoding is pinned so the files do not depend on the platform's default locale
    # (posts may contain emoji and other non-ASCII characters)
    with open(f'../../0_data/clean/unlabelled_pushshift/month_splits/train_{my}_{int(TRAIN_SIZE/1000000)}m.txt', 'w', encoding='utf-8') as write_obj:
        for text in export_train[my]:
            write_obj.write(text + "\n \n")

    with open(f'../../0_data/clean/unlabelled_pushshift/month_splits/test_{my}_{int(TEST_SIZE/1000)}k.txt', 'w', encoding='utf-8') as write_obj:
        for text in export_test[my]:
            write_obj.write(text + "\n \n")

CPU times: user 24.4 s, sys: 52.8 s, total: 1min 17s
Wall time: 2min 46s


In [12]:
%%time

# random total train and test sets stratified by time
# (reads the per-month Series stored in export_train / export_test, so the cell
#  that builds those dicts has to run first)

TRAIN_SIZES = [1000000, 2000000, 5000000, 10000000]
TEST_SIZE = 10000

# every month contributes an equal share, so the share is derived from the number
# of months actually present instead of a hard-coded 10
months = pd.unique(text_df.monthyear)
n_months = len(months)

# train sets
for size in TRAIN_SIZES:

    # one concat over all monthly samples: Series.append was removed in pandas 2.0
    # and re-copied the growing Series on every iteration
    export_train['rand'] = pd.concat(
        [export_train[my].sample(size // n_months, random_state=123) for my in months],
        ignore_index=True,
    )

    # encoding is pinned so the files do not depend on the platform's default locale
    with open(f'../../0_data/clean/unlabelled_pushshift/month_splits/total/train_rand_{int(size/1000000)}m.txt', 'w', encoding='utf-8') as write_obj:
        for text in export_train['rand']:
            write_obj.write(text + "\n \n")


# test set

export_test['rand'] = pd.concat(
    [export_test[my].sample(TEST_SIZE // n_months, random_state=123) for my in months],
    ignore_index=True,
)

with open(f'../../0_data/clean/unlabelled_pushshift/month_splits/total/test_rand_{int(TEST_SIZE/1000)}k.txt', 'w', encoding='utf-8') as write_obj:
    for text in export_test['rand']:
        write_obj.write(text + "\n \n")

CPU times: user 30 s, sys: 21.7 s, total: 51.7 s
Wall time: 1min 1s
