In [1]:
# import relevant packages
import pandas as pd
import csv
import re
import fasttext
import emoji
import unicodedata

from html import unescape
from sklearn.model_selection import train_test_split

## Load Unlabelled Gab Corpus

In [3]:
%%time

# load texts and timestamps from the unlabelled corpus into lists

# collect into plain lists first --> faster than appending to dict
texts = []
dates = []

# number of data rows read so far (stays 0 if the file has no data rows)
counter = 0

sample_freq = 2 # sample every n-th post with n = sample_freq
print_freq = 1000000 # print progress every n posts with n = print_freq

# newline='' is what the csv module asks for, so that line breaks inside quoted
# fields reach the parser untranslated; the explicit encoding keeps decoding
# independent of the platform default
with open('../../0_data/raw/gabposts_clean_170221.csv', 'r', encoding='utf-8', newline='') as read_obj:
    # remove NUL characters from every line before the csv parser sees it
    csv_dict_reader = csv.DictReader(line.replace('\0', '') for line in read_obj)
    for counter, row in enumerate(csv_dict_reader, start=1):
        # counter is 1-based, so the 1st, (1 + sample_freq)-th, ... rows are kept
        if (counter - 1) % sample_freq == 0:
            texts.append(row['text'])
            dates.append(row['created_at'])
        # progress report
        if counter % print_freq == 0:
            print(counter)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000


NameError: name 'clean_texts' is not defined

In [7]:
%%time

# create dataframe straight from the two parallel lists (one row per sampled post)
sample_df = pd.DataFrame({'text': texts, 'created_at': dates})

# clear out RAM: the raw lists are not needed once the frame exists
del texts
del dates

# convert dtypes
# NOTE: the datetime unit must be spelled out -- a bare 'datetime64' is rejected by pandas >= 2.0
sample_df['created_at'] = sample_df['created_at'].astype('datetime64[ns]')
sample_df['text'] = sample_df['text'].astype('string')

# print finished df
sample_df

Unnamed: 0,text,created_at
0,Hello world!,2016-08-10 06:58:37
1,test post for repost,2016-08-10 07:26:18
2,All censorships exist to prevent anyone from c...,2016-08-10 07:59:06
3,H E L L O,2016-08-10 11:03:39
4,Gab is to speak as ________ is to ________ #fr...,2016-08-10 11:44:54
...,...,...
17116624,I guess you will have to go to jail. If that i...,2018-10-29 03:00:43
17116625,https://www.youtube.com/watch?v=bMK0MIwWzHI Re...,2018-10-29 03:00:45
17116626,"Thank you, been trolling these cunts all day.",2018-10-29 03:01:45
17116627,Very true.. #BigPharma #corruption,2018-10-29 03:02:17


## Perform Additional Text Cleaning

In [8]:
# work on an independent (deep) copy so that sample_df does not have to be reloaded
text_df = sample_df.copy(deep=True)

In [9]:
%%time

# Define function to clean text
def _emoji_table():
    """Return the emoji package's lookup table, whose keys are emoji strings."""
    # emoji >= 2.0 dropped UNICODE_EMOJI in favour of EMOJI_DATA; support both layouts
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is None:
        table = emoji.UNICODE_EMOJI['en']
    return table


def clean(text):
    """Normalise a single post for downstream use.

    Steps, in order:
      1. decode HTML entities (e.g. "&amp;" -> "&"),
      2. replace @-mentions with "[USER]" and anything starting with "http"
         (up to the next whitespace) with "[URL]",
      3. replace every character found in the emoji table with "[EMOJI]"
         (the check is per character, so only single-character entries match),
      4. strip leading/trailing whitespace,
      5. apply Unicode NFKD normalisation (turns e.g. "\\xa0" into a plain space).

    Parameters
    ----------
    text : str
        Raw post text. Any non-string value (None, pd.NA, ...) is treated as
        an empty post.

    Returns
    -------
    str
        The cleaned text; "" for non-string input.
    """
    # missing values cannot be cleaned; report them as empty text
    if not isinstance(text, str):
        return ""

    # convert html
    text = unescape(text)

    # replace mentions and URLs with special tokens
    text = re.sub(r"@[A-Za-z0-9_-]+", '[USER]', text)
    text = re.sub(r"http\S+", '[URL]', text)

    # replace emojis with special token; fetch the table once per post, not once per character
    emoji_chars = _emoji_table()
    text = ''.join('[EMOJI]' if char in emoji_chars else char for char in text).strip()

    # clean misformatting (e.g. "\xa0")
    text = unicodedata.normalize("NFKD", text)

    return text

# run the cleaner over every raw post and keep the result in a new clean_text column
text_df['clean_text'] = text_df['text'].apply(clean)

CPU times: user 6min 18s, sys: 1min 8s, total: 7min 27s
Wall time: 9min 9s


In [10]:
%%time 

# remember the row count so the number of dropped posts can be reported
n_docs = len(text_df)

# keep only the rows whose cleaned text is not the empty string
non_empty = text_df['clean_text'].values != ""
text_df = text_df.loc[non_empty]

print(f'{n_docs} posts, of which {n_docs - text_df.shape[0]} were dropped for empty string content')
print(f'{text_df.shape[0]} posts remain. \n')

17116629 posts, of which 695435 were dropped for empty string content
16421194 posts remain. 

CPU times: user 11.4 s, sys: 1min 13s, total: 1min 24s
Wall time: 4min 4s


In [11]:
%%time

# remember the row count so the number of dropped posts can be reported
n_docs = len(text_df)

# a post carries no usable content if its cleaned text is exactly one placeholder token
placeholder_only = text_df['clean_text'].isin(["[URL]", "[EMOJI]", "[USER]"])
text_df = text_df.loc[~placeholder_only]

print(f'{n_docs} posts, of which {n_docs - text_df.shape[0]} were dropped for being just [URL], [EMOJI] or [USER]')
print(f'{text_df.shape[0]} posts remain. \n')

16421194 posts, of which 1424791 were dropped for being just [URL], [EMOJI] or [USER]
14996403 posts remain. 

CPU times: user 17 s, sys: 1min 23s, total: 1min 40s
Wall time: 4min 16s


In [12]:
%%time

# check language
# load the fastText model file used for language detection once, at cell level;
# check_language() reads it through the global name `fmodel`
fmodel = fasttext.load_model('../../0_models/lang_detect/lid.176.bin')

def check_language(text):
    """Classify a post as 'English' or 'non-English' using the global `fmodel`.

    The model is asked for its top 3 language predictions. The post counts as
    English unless the top prediction is a different language with probability
    above 0.50, or English does not appear among the returned labels at all.

    Parameters
    ----------
    text : str
        Post text. Non-string values (None, pd.NA, ...) are classified as
        'non-English'.

    Returns
    -------
    str
        Either 'English' or 'non-English'.
    """
    # a missing value has no language to detect
    if not isinstance(text, str):
        return 'non-English'

    # fastText's predict() handles one line at a time and raises ValueError on
    # embedded newlines, so flatten them to spaces first
    labels, probs = fmodel.predict(text.replace('\n', ' '), k=3)

    # nothing predicted (e.g. empty input): there is no evidence for English
    if len(labels) == 0:
        return 'non-English'

    # if top prediction is certain and not English, return non-English
    if labels[0] != '__label__en' and probs[0] > 0.50:
        return 'non-English'

    # otherwise English only if it is among the returned predictions
    return 'English' if '__label__en' in labels else 'non-English'

# remember the row count so the number of dropped posts can be reported
n_docs = len(text_df)

# flag every post whose raw text is classified as English, then keep only those rows
english_mask = text_df['text'].apply(lambda post: check_language(post) == 'English')
text_df = text_df.loc[english_mask]

print(f'{n_docs} posts, of which {n_docs - text_df.shape[0]} were dropped for (most likely) not being in English.')
print(f'{text_df.shape[0]} posts remain. \n')



14996403 posts, of which 1028437 were dropped for (most likely) not being in English.
13967966 posts remain. 

CPU times: user 11min 10s, sys: 1min 57s, total: 13min 7s
Wall time: 17min 4s


## Write to Text File

In [17]:
%%time

TRAIN_SIZES = [1000000, 2000000, 5000000, 10000000]
TEST_SIZE = 50000


def _write_posts(path, posts):
    """Write every post in `posts` to `path`, each followed by newline, space, newline."""
    # explicit UTF-8 so the export does not depend on the platform's default encoding
    with open(path, 'w', encoding='utf-8') as write_obj:
        for text in posts:
            write_obj.write(text + "\n \n")


# hold out TEST_SIZE posts for evaluation; everything else is the pool for the training sets
export_train_base, export_eval = train_test_split(text_df.clean_text, test_size = TEST_SIZE, random_state = 123)

_write_posts(f'../../0_data/clean/unlabelled_pushshift/eval_rand_{int(TEST_SIZE/1000)}k.txt', export_eval)

# draw one training set per requested size from the pool (the remainder of each split is discarded)
for size in TRAIN_SIZES:
    export_train, _ = train_test_split(export_train_base, train_size = size, random_state = 123)

    _write_posts(f'../../0_data/clean/unlabelled_pushshift/train_rand_{int(size/1000000)}m.txt', export_train)


CPU times: user 55.6 s, sys: 1min 17s, total: 2min 13s
Wall time: 2min 56s
