In [28]:
# import relevant packages
import pandas as pd
import csv
import re
import fasttext
import emoji

from html import unescape
from sklearn.model_selection import train_test_split

## Load Unlabelled Gab Corpus

In [12]:
%%time

# Load a systematic sample (every `sample_freq`-th row) of the unlabelled corpus
# into a dataframe with one `text` and one `created_at` column.

# collect the sampled values in plain lists and build the dataframe once at the end
texts = []
dates = []

# number of CSV rows read so far
counter = 0

sample_freq = 10000 # sample every n-th post with n = sample_freq
print_freq = 1000000 # print progress every n posts with n = print_freq
max_counter = 1000000 # NOTE(review): never read by the loop below, so no cap is applied -- confirm whether one was intended

# stream the file row by row so that the full corpus never has to fit in memory
with open('../0_data/gabposts_clean_170221.csv', 'r') as read_obj:
    # NUL characters are removed from every line before it reaches the csv parser
    csv_dict_reader = csv.DictReader(line.replace('\0', '') for line in read_obj)
    for row in csv_dict_reader:
        # rows 0, sample_freq, 2 * sample_freq, ... are kept
        if counter % sample_freq == 0:
            texts.append(row['text'])
            dates.append(row['created_at'])
        counter += 1
        if counter % print_freq == 0:
            print(counter)

# create dataframe from lists
sample_df = pd.DataFrame({'text': texts, 'created_at': dates})

# clear out RAM
del texts
del dates

# convert dtypes
# pd.to_datetime instead of astype('datetime64'): pandas 2.x rejects the unit-less cast
sample_df['created_at'] = pd.to_datetime(sample_df['created_at'])
sample_df['text'] = sample_df['text'].astype('string')

# print finished df
sample_df

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
CPU times: user 2min 58s, sys: 5.28 s, total: 3min 4s
Wall time: 3min 8s


Unnamed: 0,text,created_at
0,Hello world!,2016-08-10 06:58:37
1,@a or @e: do you have a stays page with number...,2016-08-21 22:58:34
2,#100followers is this in reference to Lord Kek...,2016-08-24 03:19:39
3,#BoycottTarget is costing more than anyone exp...,2016-08-25 09:58:27
4,All those white tears mugs are filled with thi...,2016-08-26 18:53:31
...,...,...
3419,Antwort: TERROR!,2018-10-28 15:36:00
3420,Fraude fake - Cuidado! 17 voto nulo. Bolsonaro...,2018-10-28 18:24:57
3421,Wrong as usual little Kiwi. You monkeys lose a...,2018-10-28 21:11:21
3422,https://www.youtube.com/watch?v=SGWizajL7tA,2018-10-28 23:39:25


## Perform Additional Text Cleaning

In [88]:
%%time

# Patterns used by clean(), compiled once instead of being looked up on every call
_MENTION_PATTERN = re.compile(r"@[A-Za-z0-9_-]+")
_URL_PATTERN = re.compile(r"http\S+")


def _emoji_table():
    """Return the emoji package's lookup table under whichever name this version exposes."""
    # emoji >= 2.0 removed UNICODE_EMOJI in favour of EMOJI_DATA; older releases only have the former
    table = getattr(emoji, 'EMOJI_DATA', None)
    if table is None:
        table = emoji.UNICODE_EMOJI['en']
    return table


# Define function to clean text
def clean(text):
    """Normalise one post for export.

    HTML entities are unescaped, then mentions, URLs and emoji characters are
    replaced by the special tokens [USER], [URL] and [EMOJI], and surrounding
    whitespace is stripped.

    Args:
        text: the raw post. Anything that is not a str (e.g. None or pd.NA
            from a row with a missing field) is treated as an empty post.

    Returns:
        The cleaned string; "" for empty or missing input.
    """
    # missing values cannot be unescaped; returning "" lets the caller drop them as empty posts
    if not isinstance(text, str):
        return ""

    # convert html
    text = unescape(text)

    # replace mentions and URLs with special tokens (mentions first, as before)
    text = _MENTION_PATTERN.sub('[USER]', text)
    text = _URL_PATTERN.sub('[URL]', text)

    # Emoji are matched one character at a time. No single ASCII character is an
    # emoji on its own, so pure-ASCII posts skip the lookup entirely.
    if not text.isascii():
        known_emoji = _emoji_table()
        text = ''.join('[EMOJI]' if char in known_emoji else char for char in text)

    return text.strip()

# add the cleaned version of every post as a new column
sample_df['clean_text'] = sample_df['text'].apply(clean)

# remember how many posts there were before the empty ones are removed
n_docs = len(sample_df)

# keep only the posts whose cleaned text is non-empty
has_content = sample_df['clean_text'] != ""
sample_df = sample_df[has_content]

print(f'{n_docs} posts, of which {n_docs - sample_df.shape[0]} were dropped for empty string content')
print(f'{sample_df.shape[0]} posts remain. \n')

3281 posts, of which 0 were dropped for empty string content
3281 posts remain. 

CPU times: user 77.1 ms, sys: 2.15 ms, total: 79.3 ms
Wall time: 78.1 ms


In [91]:
# work on an independent (deep) copy, so that sample_df does not have to be reloaded
text_df = sample_df.copy(deep=True)

In [92]:
%%time

# load the fastText model that check_language uses for language detection
lang_model_path = '../0_models/lang_detect/lid.176.bin'
fmodel = fasttext.load_model(lang_model_path)

def check_language(text, model=None, min_confidence=0.50):
    """Classify a post as 'English' or 'non-English' from the model's top-3 predictions.

    Args:
        text: the post to classify. Non-string values (e.g. None or pd.NA) are
            classified as 'non-English'.
        model: object with a fastText-style ``predict(text, k=...)`` method that
            returns ``(labels, probabilities)``. Defaults to the module-level
            ``fmodel``.
        min_confidence: probability above which a non-English top prediction is
            accepted outright.

    Returns:
        'English' or 'non-English'.
    """
    if model is None:
        model = fmodel

    if not isinstance(text, str):
        return 'non-English'

    # fastText's predict() processes one line at a time and raises on text containing '\n'
    single_line = text.replace('\n', ' ')
    labels, probabilities = model.predict(single_line, k=3)

    # no prediction at all: the post cannot be confirmed as English
    if len(labels) == 0:
        return 'non-English'

    # if top prediction is certain and not English, return non-English
    if labels[0] != '__label__en' and probabilities[0] > min_confidence:
        return 'non-English'

    # else if English is one of top 3 predictions, return English
    if '__label__en' in labels:
        return 'English'

    # else return non-English
    return 'non-English'

# remember the number of posts before the language filter is applied
n_docs = len(text_df)

# flag every post by whether check_language classifies it as English
# NOTE(review): detection runs on the raw `text` column rather than `clean_text` -- confirm this is intended
is_english = text_df['text'].apply(lambda post: check_language(post) == 'English')

# keep only the posts flagged as English
text_df = text_df[is_english]

print(f'{n_docs} posts, of which {n_docs - text_df.shape[0]} were dropped for (most likely) not being in English.')
print(f'{text_df.shape[0]} posts remain. \n')

3281 posts, of which 268 were dropped for (most likely) not being in English.
3013 posts remain. 

CPU times: user 319 ms, sys: 199 ms, total: 518 ms
Wall time: 632 ms




## Write to Text File

In [27]:
# number of posts written to the train and eval files respectively
TRAIN_SIZE = 100
TEST_SIZE = 100

def _write_documents(path, documents):
    """Write every document to `path`, each followed by a line break and a line holding a single space."""
    # explicit UTF-8: posts may contain non-ASCII characters that the platform default encoding cannot represent
    with open(path, 'w', encoding='utf-8') as write_obj:
        write_obj.writelines(text + "\n \n" for text in documents)

def write_to_txt(text_series, train_size, test_size,
                 train_path='../0_data/clean/train.txt',
                 eval_path='../0_data/clean/eval.txt',
                 random_state=0):
    """Split the texts into a train and an eval sample and write each to a text file.

    Args:
        text_series: the texts to export (strings).
        train_size: passed to train_test_split as `train_size`.
        test_size: passed to train_test_split as `test_size`.
        train_path: output file for the train sample.
        eval_path: output file for the eval sample.
        random_state: seed for the split, fixed by default so that re-running
            the notebook exports the same posts. Pass None for a fresh random
            split on every call.

    Existing files at both paths are overwritten.
    """
    export_train, export_eval = train_test_split(
        text_series,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
    )

    _write_documents(train_path, export_train)
    _write_documents(eval_path, export_eval)

write_to_txt(text_df.clean_text, TRAIN_SIZE, TEST_SIZE)