In [6]:
# import relevant packages
import pandas as pd
import csv
import re
import fasttext
import emoji

from html import unescape
from sklearn.model_selection import train_test_split

## Load Unlabelled Gab Corpus

In [7]:
%%time

# load a systematic sample of posts from the unlabelled corpus into a dataframe

# collect the sampled values in plain lists and build the dataframe once at the end
texts = []
dates = []

# initialise counter var for counting iterations
counter = 0

sample_freq = 10 # sample every n-th post with n = sample_freq
print_freq = 1000000 # print progress every n posts with n = print_freq

# iterate over each line
# the encoding is pinned so the read does not depend on the platform's locale default
# NOTE(review): assumes the export is UTF-8 -- confirm against the data source
with open('../0_data/raw/gabposts_clean_170221.csv', 'r', encoding='utf-8') as read_obj:
    # NUL characters are removed from every line before it reaches the csv parser
    csv_dict_reader = csv.DictReader(x.replace('\0', '') for x in read_obj)
    for row in csv_dict_reader:
        # rows 0, sample_freq, 2*sample_freq, ... are kept
        if counter % sample_freq == 0:
            texts.append(row['text'])
            dates.append(row['created_at'])
        counter+=1
        if counter % print_freq == 0:
            print(counter)

# create dataframe from lists
sample_df = pd.DataFrame({'text': texts, 'created_at': dates})

# clear out RAM
del texts
del dates

# convert dtypes
# the unit is spelled out because pandas >= 2.0 rejects the unit-less 'datetime64' alias
sample_df['created_at'] = sample_df['created_at'].astype('datetime64[ns]')
sample_df['text'] = sample_df['text'].astype('string')

# print finished df
sample_df

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
CPU times: user 3min 1s, sys: 7.42 s, total: 3min 8s
Wall time: 3min 11s


Unnamed: 0,text,created_at
0,Hello world!,2016-08-10 06:58:37
1,"Welcome to Gab, @Vince",2016-08-10 17:23:03
2,Testing a link: http://regated.com/2016/08/fat...,2016-08-10 22:58:35
3,just setting up my gabr.,2016-08-11 04:16:16
4,Death to Islam!,2016-08-11 17:20:52
...,...,...
3423321,Hmmm. That does indeed seem super sketchy. No ...,2018-10-29 03:00:02
3423322,So all 3 were in line for the Rabbi? Did they ...,2018-10-29 03:00:15
3423323,ela apoiou o Doria e o Major Olimpo foi com o ...,2018-10-29 03:00:21
3423324,Military people understand gun safety and the ...,2018-10-29 03:00:35


## Perform Additional Text Cleaning

In [12]:
# work on an independent (deep) copy so that sample_df stays intact and does not have to be reloaded
text_df = sample_df.copy(deep=True)

In [13]:
%%time

# Define function to clean text
def clean(text):
    """Normalise a raw post for downstream use.

    Steps, in order:
      1. HTML character references (e.g. ``&amp;``) are converted to plain characters.
      2. ``@``-mentions are replaced with the token ``[USER]``.
      3. Anything starting with ``http`` up to the next whitespace is replaced with ``[URL]``.
      4. Every single character that the ``emoji`` package lists as an emoji is
         replaced with ``[EMOJI]`` (one token per character).
      5. Leading and trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        The raw post text.

    Returns
    -------
    str
        The cleaned text; may be the empty string.
    """
    # emoji < 2.0 exposes a per-language UNICODE_EMOJI table; emoji >= 2.0 removed it
    # and keeps all emoji as keys of EMOJI_DATA instead. Look the table up once per
    # call rather than once per character.
    legacy_table = getattr(emoji, 'UNICODE_EMOJI', None)
    emoji_chars = legacy_table['en'] if legacy_table is not None else emoji.EMOJI_DATA

    # convert html
    text = unescape(text)

    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USER]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    text = ''.join('[EMOJI]' if (char in emoji_chars) else char for char in text).strip()

    return text

# add the cleaned version of every post as a new column
text_df['clean_text'] = text_df['text'].apply(clean)


# remember how many posts there are before the empty ones are removed
n_docs = len(text_df)

# keep only the posts whose cleaned text is not the empty string
has_content = text_df['clean_text'] != ""
text_df = text_df[has_content]

print(f'{n_docs} posts, of which {n_docs - text_df.shape[0]} were dropped for empty string content')
print(f'{text_df.shape[0]} posts remain. \n')


# remember how many posts there are before the placeholder-only ones are removed
n_docs = len(text_df)

# remove posts whose cleaned text consists of exactly one placeholder token
placeholder_only = text_df['clean_text'].isin(["[URL]", "[EMOJI]", "[USER]"])
text_df = text_df[~placeholder_only]

print(f'{n_docs} posts, of which {n_docs - text_df.shape[0]} were dropped for being just [URL], [EMOJI] or [USER]')
print(f'{text_df.shape[0]} posts remain. \n')

3423326 posts, of which 139269 were dropped for empty string content
3284057 posts remain. 

3284057 posts, of which 284647 were dropped for empty string content
2999410 posts remain. 

CPU times: user 1min 11s, sys: 1.24 s, total: 1min 12s
Wall time: 1min 13s


In [14]:
%%time

# load the fastText model that is used for the language check
lang_model_path = '../0_models/lang_detect/lid.176.bin'
fmodel = fasttext.load_model(lang_model_path)

def check_language(text):
    """Label a post as 'English' or 'non-English' using the module-level model `fmodel`.

    The three most likely language labels are requested from the model. The
    post is labelled 'English' when '__label__en' is among them, unless the
    top label is a different language with a probability above 0.50.

    Parameters
    ----------
    text : str
        The post to classify. Line breaks are replaced with spaces before
        prediction; the text is otherwise passed to the model unchanged.

    Returns
    -------
    str
        Either 'English' or 'non-English'.
    """
    # fastText's predict() works on a single line and raises ValueError when the
    # input contains a newline character, so line breaks are flattened first
    labels, probs = fmodel.predict(text.replace('\n', ' '), k=3)

    # no label returned at all --> the post cannot be confirmed as English
    if len(labels) == 0:
        return 'non-English'

    # if top prediction is certain and not English, return non-English
    if (labels[0] != '__label__en') and (probs[0] > 0.50):
        return 'non-English'

    # else if English is one of top 3 predictions, return English
    if '__label__en' in labels:
        return 'English'

    # else return non-English
    return 'non-English'

# remember the number of posts before the language filter is applied
n_docs = len(text_df)

# keep only the posts that the language check labels as English (checked on the raw text column)
is_english = text_df['text'].apply(lambda post: check_language(post) == 'English')
text_df = text_df[is_english]

print(f'{n_docs} posts, of which {n_docs - text_df.shape[0]} were dropped for (most likely) not being in English.')
print(f'{text_df.shape[0]} posts remain. \n')



2999410 posts, of which 205802 were dropped for (most likely) not being in English.
2793608 posts remain. 

CPU times: user 2min 3s, sys: 2 s, total: 2min 5s
Wall time: 2min 6s


## Write to Text File

In [15]:
# number of posts that go into the training file and the evaluation file
TRAIN_SIZE, TEST_SIZE = 1_000_000, 10_000

def _write_documents(path, texts):
    """Write every text in `texts` to `path`, each followed by the separator "\\n \\n"."""
    # UTF-8 is set explicitly so that emoji and other non-ASCII characters can be
    # written regardless of the platform's default encoding
    with open(path, 'w', encoding='utf-8') as write_obj:
        write_obj.writelines(text + "\n \n" for text in texts)


def write_to_txt(text_series, train_size, test_size, random_state=42,
                 train_path='../0_data/clean/train.txt',
                 eval_path='../0_data/clean/eval.txt'):
    """Split the texts into a training and an evaluation sample and write both to disk.

    Parameters
    ----------
    text_series : sequence of str
        The texts to sample from (e.g. a pandas Series).
    train_size, test_size : int or float
        Passed on to `train_test_split` as the sizes of the two samples.
    random_state : int or None, default 42
        Seed passed on to `train_test_split`, so that re-running the notebook
        writes the same two files. Pass None for an unseeded split.
    train_path, eval_path : str
        Destination files; the defaults are the locations used by this notebook.

    Returns
    -------
    None
        The two files are (over)written as a side effect.
    """
    export_train, export_eval = train_test_split(
        text_series,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
    )

    _write_documents(train_path, export_train)
    _write_documents(eval_path, export_eval)

# export the cleaned posts as train / eval text files
write_to_txt(text_df['clean_text'], train_size=TRAIN_SIZE, test_size=TEST_SIZE)