# Clean and Split Monthly Unlabelled Reddit (News) Corpora

In [1]:
import datetime as dt
import csv
import re
import fasttext
import emoji
import pandas as pd
import unicodedata
import os

from html import unescape
from sklearn.model_selection import train_test_split

# Helper functions

In [2]:
def load_csv_to_df(filepath):
    """Load a raw Reddit csv dump into a DataFrame and drop malformed rows.

    The first csv row is used as the column names. Rows with fewer fields
    than the header end up with missing values and are dropped; a short
    summary of how many rows were dropped is printed.

    Args:
        filepath: path to the csv file to read.

    Returns:
        DataFrame with one row per well-formed post. All columns hold strings,
        except ``created_utc``, which is converted from an epoch timestamp to
        a datetime.
    """

    # read from the path that was passed in (not from notebook-level globals)
    with open(filepath, 'r', newline='') as f:
        reader = csv.reader(f)
        header = next(reader)
        df = pd.DataFrame(list(reader), columns=header)

    # small number of csv formatting errors --> delete

    # save number of documents
    n_docs = df.shape[0]

    # drop posts with formatting errors (rows padded with missing values)
    df = df.dropna()

    print(f'{n_docs} posts, of which {n_docs - df.shape[0]} were dropped for csv formatting errors.')
    print(f'{df.shape[0]} posts remain. \n')

    # convert epoch to readable datetime
    # (datetime.fromtimestamp without tz yields a naive datetime in local time)
    df['created_utc'] = df['created_utc'].apply(lambda x: dt.datetime.fromtimestamp(int(x)))

    return df

In [3]:
def clean_text(text, max_length = 1024):
    """Normalise a raw Reddit comment body into a single cleaned line.

    Steps, in order: unescape HTML entities, NFKD-normalise unicode and remove
    zero-width joiners, truncate, turn newlines/tabs into spaces, replace URLs
    and emojis with the ``[URL]`` / ``[EMOJI]`` tokens, remove the
    ``[deleted]`` / ``[removed]`` placeholders, strip leading ``>``
    characters, collapse whitespace, truncate again and strip.

    Args:
        text: raw comment text.
        max_length: maximum number of characters kept, applied both before
            and after token replacement.

    Returns:
        The cleaned string (possibly empty).
    """

    # html entities -> characters, then unicode clean-up
    cleaned = unicodedata.normalize("NFKD", unescape(text))
    cleaned = cleaned.replace('\u200d', '')

    # first truncation, before any tokens are inserted
    cleaned = cleaned[:max_length]

    # newline and tab characters become plain spaces
    for control_char in ('\n', '\t'):
        cleaned = cleaned.replace(control_char, ' ')

    # URLs -> [URL]
    cleaned = re.sub(r"http\S+",'[URL]',cleaned)

    # emojis -> [EMOJI], one character at a time
    # NOTE(review): emoji.UNICODE_EMOJI presumably only exists in older
    # releases of the emoji package -- confirm the pinned version
    pieces = []
    for char in cleaned:
        pieces.append('[EMOJI]' if (char in emoji.UNICODE_EMOJI['en']) else char)
    cleaned = ''.join(pieces).strip()

    # placeholders left behind by deleted / removed posts
    for placeholder in ('[deleted]', '[removed]'):
        cleaned = cleaned.replace(placeholder, '')

    # leading ">" (reddit artifact)
    cleaned = cleaned.lstrip('>')

    # any run of whitespace becomes a single space
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # second truncation (tokens may have lengthened the text), then trim
    return cleaned[:max_length].strip()

In [4]:
def drop_empty(df):
    """Return the rows of ``df`` whose ``clean_text`` is not the empty string.

    Prints how many rows were dropped and how many remain.
    """

    # row count before filtering, for the summary below
    total_before = df.shape[0]

    # keep only rows with non-empty cleaned text
    has_content = df.clean_text.values != ""
    df = df[has_content]

    n_dropped = total_before - df.shape[0]
    print(f'{total_before} posts, of which {n_dropped} were dropped for empty string content')
    print(f'{df.shape[0]} posts remain. \n')

    return df

In [5]:
def drop_url_emoji(df):
    """Return the rows of ``df`` whose ``clean_text`` is more than a lone token.

    Rows whose cleaned text is exactly ``[URL]`` or exactly ``[EMOJI]`` are
    removed. Prints how many rows were dropped and how many remain.
    """

    # row count before filtering, for the summary below
    total_before = df.shape[0]

    # flag rows that consist of nothing but one special token
    only_url = df.clean_text == "[URL]"
    only_emoji = df.clean_text == "[EMOJI]"
    df = df[~(only_url | only_emoji)]

    n_dropped = total_before - df.shape[0]
    print(f'{total_before} posts, of which {n_dropped} were dropped for being just [URL] or [EMOJI]')
    print(f'{df.shape[0]} posts remain. \n')

    return df

In [6]:
def drop_non_english(df):
    """Return the rows of ``df`` whose ``clean_text`` is likely English.

    A fastText language-identification model is loaded from disk and queried
    for the top three predictions per text. A text is kept unless the top
    prediction is a non-English label with probability above 0.50, or English
    does not appear among the top three labels. Prints how many rows were
    dropped and how many remain.
    """

    # language classifier (loaded on every call)
    fmodel = fasttext.load_model('../../0_models/lang_detect/lid.176.bin')

    def is_english(text):
        predictions = fmodel.predict(text, k=3)
        labels = predictions[0]
        probabilities = predictions[1]

        # a confident non-English top prediction rules the text out
        if labels[0] != '__label__en' and probabilities[0] > 0.50:
            return False

        # otherwise English only needs to be among the top 3 predictions
        return '__label__en' in labels

    # row count before filtering, for the summary below
    total_before = df.shape[0]

    # keep only posts classified as English
    english_mask = df.clean_text.apply(is_english)
    df = df[english_mask]

    n_dropped = total_before - df.shape[0]
    print(f'{total_before} posts, of which {n_dropped} were dropped for (most likely) not being in English.')
    print(f'{df.shape[0]} posts remain. \n')

    return df

In [7]:
def drop_dupl(df):
    """Return ``df`` without rows that repeat an earlier ``clean_text``.

    The first occurrence of each text is kept. The input frame is left
    untouched (no in-place modification), matching the other ``drop_*``
    helpers and avoiding writes into a filtered slice of another frame.
    Prints how many rows were dropped and how many remain.

    Args:
        df: DataFrame with a ``clean_text`` column.

    Returns:
        A new DataFrame with duplicate texts removed.
    """

    # save number of documents before dropping duplicates
    n_docs = df.shape[0]

    # drop duplicates without mutating the caller's frame
    df = df.drop_duplicates(subset=['clean_text'])

    print(f'{n_docs} posts, of which {n_docs - df.shape[0]} were dropped for being duplicates.')
    print(f'{df.shape[0]} posts remain. \n')

    return df

In [8]:
# create train and test splits for each month-year
    
def split_export(df, filename):
    """Write subreddit-balanced train and test text files for one month.

    For every distinct value in ``df.subreddit``, 500,000 training and 5,000
    test texts are drawn from ``clean_text`` with a fixed random state. The
    per-subreddit samples are combined, shuffled and written one text per
    entry, each followed by ``"\\n \\n"``.

    Args:
        df: DataFrame with ``subreddit`` and ``clean_text`` columns.
        filename: name of the raw csv; ``filename[5:-4]`` (the name without
            its first five and last four characters) is used in the output
            file names.

    Raises:
        Whatever ``train_test_split`` raises when a subreddit has too few
        rows for the requested sizes.
    """

    train_parts = []
    test_parts = []

    for subreddit in pd.unique(df.subreddit):

        add_train, add_test = train_test_split(df[df.subreddit==subreddit].clean_text,
                                               train_size = 500000, test_size = 5000,
                                               random_state = 123)

        train_parts.append(add_train)
        test_parts.append(add_test)

    # concatenate once at the end: Series.append is gone in pandas 2.0 and
    # copied the growing series on every iteration; pd.concat needs at least
    # one part, so fall back to an empty series when there are no subreddits
    export_train = pd.concat(train_parts) if train_parts else pd.Series(dtype=str)
    export_test = pd.concat(test_parts) if test_parts else pd.Series(dtype=str)

    with open(f'../../0_data/clean/unlabelled_reddit/train_{filename[5:-4]}_1m.txt', 'w') as write_obj:
        # sample(frac=1) shuffles so the subreddits are interleaved
        for text in export_train.sample(frac=1):
            write_obj.write(text + "\n \n")

    with open(f'../../0_data/clean/unlabelled_reddit/test_{filename[5:-4]}_10k.txt', 'w') as write_obj:
        for text in export_test.sample(frac=1):
            write_obj.write(text + "\n \n")

In [22]:
# create train and test splits for each month-year
# used for months with more than 1m comments in total but fewer than 500k in one of the two subreddits;
# the downstream effects of this slight imbalance should be negligible
    
def split_export_unbalanced(df, filename):
    """Write train and test text files for one month, ignoring subreddit balance.

    1,000,000 training and 10,000 test texts are drawn from ``df.clean_text``
    with a fixed random state. Each split is then shuffled and written one
    text per entry, each followed by ``"\\n \\n"``. ``filename[5:-4]`` is used
    in the output file names.
    """

    train_texts, test_texts = train_test_split(
        df.clean_text,
        train_size = 1000000,
        test_size = 10000,
        random_state = 123,
    )

    # (output path, texts) pairs; the train file is written first, then the test file
    outputs = (
        (f'../../0_data/clean/unlabelled_reddit/train_{filename[5:-4]}_1m.txt', train_texts),
        (f'../../0_data/clean/unlabelled_reddit/test_{filename[5:-4]}_10k.txt', test_texts),
    )

    for out_path, texts in outputs:
        with open(out_path, 'w') as write_obj:
            # sample(frac=1) shuffles the split before writing
            for text in texts.sample(frac=1):
                write_obj.write(text + "\n \n")

# Main function

In [26]:
def clean_split(directory, filename):
    """Run the full clean-and-split pipeline for one raw csv file.

    Loads ``filename`` from ``directory``, writes the cleaned ``body`` text to
    a ``clean_text`` column, applies the drop filters in order (empty posts,
    lone [URL]/[EMOJI] posts, non-English posts, duplicates) and exports the
    train and test files.

    Returns:
        A short status message naming the processed file.
    """

    # load df from csv
    df = load_csv_to_df(os.path.join(directory, filename))

    # clean text ("body") and write to column
    df['clean_text'] = df['body'].apply(clean_text)

    # filters, applied in this order: empty string, just [URL] or [EMOJI],
    # likely not English, duplicates
    for drop_step in (drop_empty, drop_url_emoji, drop_non_english, drop_dupl):
        df = drop_step(df)

    # export to train and test file
    split_export_unbalanced(df, filename)

    return f'wrote cleaned train and test file from {filename}'

In [29]:
%%time

# load raw data from csvs, clean it and split it into train and test sets

directory = '../../0_data/raw/unlabelled_reddit'

# months to process; replace with os.listdir(directory) to process every file in the folder
filenames = ["news_2019_02.csv", "news_2019_04.csv", "news_2019_05.csv",
             "news_2019_06.csv", "news_2019_11.csv", "news_2019_12.csv"]

for filename in filenames:
    # skip anything that is not a csv
    if not filename.endswith(".csv"):
        continue

    print(filename.upper(), '\n')
    try:
        clean_split(directory, filename)
    except ValueError as err:
        # train_test_split raises ValueError when fewer rows remain than the
        # requested train + test size
        print(f'not enough data ({err})')
    except Exception as err:
        # keep going with the next file, but report the real failure instead of
        # mislabelling it; KeyboardInterrupt is deliberately not caught
        print(f'failed to process {filename}: {err!r}')

NEWS_2019_02.CSV 

1347478 posts, of which 0 were dropped for csv formatting errors.
1347478 posts remain. 

1347478 posts, of which 196135 were dropped for empty string content
1151343 posts remain. 

1151343 posts, of which 2542 were dropped for being just [URL] or [EMOJI]
1148801 posts remain. 





1148801 posts, of which 7534 were dropped for (most likely) not being in English.
1141267 posts remain. 

1141267 posts, of which 24215 were dropped for being duplicates.
1117052 posts remain. 

NEWS_2019_04.CSV 

1487245 posts, of which 0 were dropped for csv formatting errors.
1487245 posts remain. 

1487245 posts, of which 221194 were dropped for empty string content
1266051 posts remain. 

1266051 posts, of which 2701 were dropped for being just [URL] or [EMOJI]
1263350 posts remain. 





1263350 posts, of which 8042 were dropped for (most likely) not being in English.
1255308 posts remain. 

1255308 posts, of which 26513 were dropped for being duplicates.
1228795 posts remain. 

NEWS_2019_05.CSV 

1380123 posts, of which 0 were dropped for csv formatting errors.
1380123 posts remain. 

1380123 posts, of which 194131 were dropped for empty string content
1185992 posts remain. 

1185992 posts, of which 2415 were dropped for being just [URL] or [EMOJI]
1183577 posts remain. 





1183577 posts, of which 7328 were dropped for (most likely) not being in English.
1176249 posts remain. 

1176249 posts, of which 27958 were dropped for being duplicates.
1148291 posts remain. 

NEWS_2019_06.CSV 

1469294 posts, of which 2 were dropped for csv formatting errors.
1469292 posts remain. 

1469292 posts, of which 224535 were dropped for empty string content
1244757 posts remain. 

1244757 posts, of which 2716 were dropped for being just [URL] or [EMOJI]
1242041 posts remain. 





1242041 posts, of which 7961 were dropped for (most likely) not being in English.
1234080 posts remain. 

1234080 posts, of which 23497 were dropped for being duplicates.
1210583 posts remain. 

NEWS_2019_11.CSV 

1447417 posts, of which 0 were dropped for csv formatting errors.
1447417 posts remain. 

1447417 posts, of which 220980 were dropped for empty string content
1226437 posts remain. 

1226437 posts, of which 2616 were dropped for being just [URL] or [EMOJI]
1223821 posts remain. 





1223821 posts, of which 8201 were dropped for (most likely) not being in English.
1215620 posts remain. 

1215620 posts, of which 24729 were dropped for being duplicates.
1190891 posts remain. 

NEWS_2019_12.CSV 

1613646 posts, of which 3 were dropped for csv formatting errors.
1613643 posts remain. 

1613643 posts, of which 242849 were dropped for empty string content
1370794 posts remain. 

1370794 posts, of which 2554 were dropped for being just [URL] or [EMOJI]
1368240 posts remain. 





1368240 posts, of which 8485 were dropped for (most likely) not being in English.
1359755 posts remain. 

1359755 posts, of which 27331 were dropped for being duplicates.
1332424 posts remain. 

CPU times: user 13min 32s, sys: 26.7 s, total: 13min 59s
Wall time: 14min 9s
