# Clean and Split Monthly Labelled Reddit (News) Corpora

In [2]:
import datetime as dt
import csv
import re
import fasttext
import emoji
import pandas as pd
import unicodedata
import os

from html import unescape
from sklearn.model_selection import train_test_split

# Helper functions

In [3]:
def load_csv_to_df(filepath):
    """Load a raw Reddit CSV export into a DataFrame and drop malformed rows.

    The first CSV row is used as the header. Rows with fewer fields than the
    header are padded with missing values by pandas and are then dropped as
    csv formatting errors. The ``created_utc`` column (epoch seconds) is
    converted to ``datetime`` objects.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        pandas.DataFrame containing only the well-formed rows.

    Raises:
        ValueError: If the file is empty (no header row).
    """
    # Read the file named by the argument rather than relying on
    # module-level ``directory`` / ``filename`` variables.
    with open(filepath, 'r', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is None:
            raise ValueError(f'{filepath} is empty: no header row found')
        df = pd.DataFrame(list(reader), columns=header)

    # small number of csv formatting errors --> delete

    # save number of documents
    n_docs = df.shape[0]

    # drop posts with formatting errors (rows with missing fields)
    df.dropna(inplace=True)

    print(f'{n_docs} posts, of which {n_docs - df.shape[0]} were dropped for csv formatting errors.')
    print(f'{df.shape[0]} posts remain. \n')

    # convert epoch to readable datetime
    # NOTE(review): fromtimestamp() without tz yields naive *local* time even
    # though the column is named created_utc -- confirm this is intended.
    df['created_utc'] = df['created_utc'].apply(lambda ts: dt.datetime.fromtimestamp(int(ts)))

    return df

In [4]:
def _emoji_table():
    """Return a container whose keys are emoji strings, for old and new ``emoji`` releases."""
    # emoji < 2.0 exposes UNICODE_EMOJI[lang]; emoji >= 2.0 removed it in
    # favour of EMOJI_DATA.
    # NOTE(review): the two tables may differ slightly in which single
    # characters they list -- confirm if exact parity with old runs matters.
    legacy = getattr(emoji, 'UNICODE_EMOJI', None)
    if legacy is not None:
        return legacy['en']
    return emoji.EMOJI_DATA


def clean_text(text, max_length = 1024):
    """Normalise a raw Reddit post body into a single cleaned line of text.

    Steps, in order: unescape HTML entities, NFKD-normalise and drop
    zero-width joiners, truncate, replace newlines/tabs with spaces, replace
    URLs with ``[URL]`` and emoji characters with ``[EMOJI]``, remove
    ``[deleted]`` / ``[removed]`` markers, strip leading ``>`` characters,
    blank out posts that contain known bot phrases, collapse whitespace,
    truncate again and strip.

    Args:
        text: Raw post body.
        max_length: Maximum number of characters kept (applied before and
            after the replacements).

    Returns:
        The cleaned text; an empty string for bot posts or posts with no
        remaining content.
    """
    # convert html
    text = unescape(text)

    # clean unicode formatting errors
    text = unicodedata.normalize("NFKD", text)
    text = text.replace('\u200d', '')

    # truncate text to max_length
    text = text[:max_length]

    # remove newline and tab characters
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')

    # replace URLs and emojis with special tokens
    text = re.sub(r"http\S+", '[URL]', text)
    # look the table up once instead of once per character
    emoji_chars = _emoji_table()
    text = ''.join('[EMOJI]' if char in emoji_chars else char for char in text).strip()

    # remove deleted posts
    text = text.replace('[deleted]', '')
    text = text.replace('[removed]', '')

    # remove leading ">" (reddit artifact)
    text = text.lstrip('>')

    # phrases that identify bot posts; any hit discards the whole post
    bot_markers = (
        "the Andromeda Galaxy",   # The_Donald bots
        "This bot was created",   # The_Donald bots
        "I'm a bot",              # politics bots
        "^bot",                   # chapo / libertarian bots
        "transcribing bot",       # chapo / libertarian bots
        "isbot ",                 # chapo / libertarian bots
        "This bot wants to",      # libertarian bots
    )
    if any(marker in text for marker in bot_markers):
        return ""

    # collapse whitespace into single whitespace
    text = re.sub(r'\s+', ' ', text)

    # truncate text to max_length again (replacement tokens may have grown it)
    text = text[:max_length]

    # remove leading and trailing whitespaces
    return text.strip()

In [5]:
def drop_empty(df):
    """Return the rows of ``df`` whose ``clean_text`` is not the empty string.

    Prints how many rows were removed and how many remain.
    """
    total = df.shape[0]

    # boolean mask: True where the cleaned text has any content
    has_content = df.clean_text.values != ""
    kept = df[has_content]
    remaining = kept.shape[0]

    print(f'{total} posts, of which {total - remaining} were dropped for empty string content')
    print(f'{remaining} posts remain. \n')

    return kept

In [6]:
def drop_url_emoji(df):
    """Return the rows of ``df`` whose ``clean_text`` is more than a lone placeholder.

    Rows whose cleaned text is exactly ``[URL]`` or exactly ``[EMOJI]`` are
    removed. Prints how many rows were removed and how many remain.
    """
    total = df.shape[0]

    # rows consisting solely of one placeholder token carry no content
    placeholder_only = df.clean_text.isin(["[URL]", "[EMOJI]"])
    kept = df[~placeholder_only]
    remaining = kept.shape[0]

    print(f'{total} posts, of which {total - remaining} were dropped for being just [URL] or [EMOJI]')
    print(f'{remaining} posts remain. \n')

    return kept

In [7]:
def drop_non_english(df):
    """Return the rows of ``df`` whose ``clean_text`` is judged to be English.

    A post is kept when ``__label__en`` is among the language model's top
    three predictions, unless the top prediction is another language with a
    score above 0.50. Prints how many rows were removed and how many remain.
    """
    english = '__label__en'

    # load language classifier
    fmodel = fasttext.load_model('../../0_models/lang_detect/lid.176.bin')

    def is_english(text):
        labels, scores = fmodel.predict(text, k=3)

        # a confident top prediction for another language overrides everything
        if labels[0] != english and scores[0] > 0.50:
            return False

        # otherwise English only needs to appear among the top 3 predictions
        return english in labels

    total = df.shape[0]

    kept = df[df.clean_text.apply(is_english)]
    remaining = kept.shape[0]

    print(f'{total} posts, of which {total - remaining} were dropped for (most likely) not being in English.')
    print(f'{remaining} posts remain. \n')

    return kept

In [8]:
def drop_dupl(df):
    """Return ``df`` without rows that repeat an earlier ``clean_text`` value.

    The first occurrence of each text is kept. The input frame is left
    untouched (like the other ``drop_*`` helpers), which also avoids
    pandas' SettingWithCopyWarning when ``df`` is a filtered slice.
    Prints how many rows were removed and how many remain.

    Args:
        df: DataFrame with a ``clean_text`` column.

    Returns:
        A new DataFrame with duplicate texts removed.
    """
    # save number of documents before dropping duplicates
    n_docs = df.shape[0]

    # drop duplicates without mutating the caller's frame
    deduped = df.drop_duplicates(subset=['clean_text'])

    print(f'{n_docs} posts, of which {n_docs - deduped.shape[0]} were dropped for being duplicates.')
    print(f'{deduped.shape[0]} posts remain. \n')

    return deduped

In [9]:
# create train and test splits for each month-year
    
def _sample_per_label(frame, frac, seed=123):
    """Draw the same fraction of rows from every label, seeding each label's draw with ``seed``."""
    if frame.empty:
        return frame
    # Sampling group by group (rather than groupby.apply + droplevel) keeps the
    # 'label' column and the original row index regardless of pandas version,
    # while preserving a fresh, identical seed for every label.
    return pd.concat(
        [group.sample(frac=frac, random_state=seed) for _, group in frame.groupby('label')]
    )


def split_export(df, filename, out_dir='../../0_data/clean/labelled_reddit/month_splits'):
    """Write balanced train/test CSV files for one monthly corpus.

    For every subreddit in ``df`` 8000 training and 2000 test posts are drawn
    (``random_state=123``). The combined sets are shuffled and written as the
    ``40k`` train / ``10k`` test files; label-stratified subsamples of
    50%, 10% and 2.5% (train) and 50% (test) are written as the ``20k``,
    ``4k``, ``1k`` and ``5k`` files. The subreddit column is exported as
    ``label``.

    Args:
        df: DataFrame with ``clean_text`` and ``subreddit`` columns.
        filename: Name of the raw input file; ``filename[9:-4]`` is used as
            the tag in the output file names (i.e. a 9-character prefix and
            a 4-character extension are stripped).
        out_dir: Directory the CSV files are written to.

    Raises:
        ValueError: Propagated from ``train_test_split`` when a subreddit
            has fewer than 10000 posts.
    """
    columns = ['clean_text', 'subreddit']
    train_parts, test_parts = [], []

    for subreddit in pd.unique(df.subreddit):
        posts = df.loc[df.subreddit == subreddit, columns]
        add_train, add_test = train_test_split(posts,
                                               train_size = 8000, test_size = 2000,
                                               random_state = 123)
        train_parts.append(add_train)
        test_parts.append(add_test)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    # (this also avoids re-copying the accumulated frame on every iteration).
    export_train = pd.concat(train_parts) if train_parts else pd.DataFrame(columns=columns)
    export_test = pd.concat(test_parts) if test_parts else pd.DataFrame(columns=columns)

    # rename subreddit column
    export_train = export_train.rename(columns={'subreddit': 'label'})
    export_test = export_test.rename(columns={'subreddit': 'label'})

    tag = filename[9:-4]

    def write_shuffled(frame, split, size):
        # shuffle rows so labels are interleaved in the exported file
        frame.sample(frac=1, random_state=123).to_csv(f'{out_dir}/{split}_{tag}_{size}.csv', index=False)

    # export train sets of different sizes
    write_shuffled(export_train, 'train', '40k')
    for frac, size in ((0.5, '20k'), (0.1, '4k'), (0.025, '1k')):
        write_shuffled(_sample_per_label(export_train, frac), 'train', size)

    # export test sets of different sizes
    write_shuffled(export_test, 'test', '10k')
    write_shuffled(_sample_per_label(export_test, 0.5), 'test', '5k')

# Main function

In [10]:
def clean_split(directory, filename):
    """Run the full clean-and-split pipeline for one raw monthly CSV file.

    Loads the file, cleans each post body into a ``clean_text`` column,
    applies the drop filters in order (empty, placeholder-only, non-English,
    duplicates) and exports the train/test splits.

    Returns:
        A short status message naming the processed file.
    """
    # load df from csv
    posts = load_csv_to_df(os.path.join(directory, filename))

    # clean text ("body") and write to column
    posts['clean_text'] = posts.body.apply(clean_text)

    # apply the filters one after another; each returns the surviving rows
    for drop_step in (drop_empty, drop_url_emoji, drop_non_english, drop_dupl):
        posts = drop_step(posts)

    # export to train and test file
    split_export(posts, filename)

    return f'wrote cleaned train and test file from {filename}'

In [11]:
%%time

# Clean and split every raw monthly csv whose name contains the year.

directory = '../../0_data/raw/labelled_reddit'


for year in ["2019"]:
    for filename in sorted(os.listdir(directory)):
        # skip files that belong to other years
        if year not in filename:
            continue
        print(filename.upper(), '\n')
        clean_split(directory, filename)

POLITICS_2019_01.CSV 

4206486 posts, of which 126 were dropped for csv formatting errors.
4206360 posts remain. 

4206360 posts, of which 466636 were dropped for empty string content
3739724 posts remain. 

3739724 posts, of which 12357 were dropped for being just [URL] or [EMOJI]
3727367 posts remain. 





3727367 posts, of which 42634 were dropped for (most likely) not being in English.
3684733 posts remain. 

3684733 posts, of which 157638 were dropped for being duplicates.
3527095 posts remain. 

POLITICS_2019_02.CSV 

3711809 posts, of which 26 were dropped for csv formatting errors.
3711783 posts remain. 

3711783 posts, of which 403423 were dropped for empty string content
3308360 posts remain. 

3308360 posts, of which 10313 were dropped for being just [URL] or [EMOJI]
3298047 posts remain. 





3298047 posts, of which 40023 were dropped for (most likely) not being in English.
3258024 posts remain. 

3258024 posts, of which 135432 were dropped for being duplicates.
3122592 posts remain. 

POLITICS_2019_03.CSV 

3612178 posts, of which 42 were dropped for csv formatting errors.
3612136 posts remain. 

3612136 posts, of which 399530 were dropped for empty string content
3212606 posts remain. 

3212606 posts, of which 9901 were dropped for being just [URL] or [EMOJI]
3202705 posts remain. 





3202705 posts, of which 38331 were dropped for (most likely) not being in English.
3164374 posts remain. 

3164374 posts, of which 133500 were dropped for being duplicates.
3030874 posts remain. 

POLITICS_2019_04.CSV 

3148260 posts, of which 23 were dropped for csv formatting errors.
3148237 posts remain. 

3148237 posts, of which 334577 were dropped for empty string content
2813660 posts remain. 

2813660 posts, of which 8444 were dropped for being just [URL] or [EMOJI]
2805216 posts remain. 





2805216 posts, of which 33763 were dropped for (most likely) not being in English.
2771453 posts remain. 

2771453 posts, of which 115569 were dropped for being duplicates.
2655884 posts remain. 

POLITICS_2019_05.CSV 

3143960 posts, of which 11 were dropped for csv formatting errors.
3143949 posts remain. 

3143949 posts, of which 330687 were dropped for empty string content
2813262 posts remain. 

2813262 posts, of which 7823 were dropped for being just [URL] or [EMOJI]
2805439 posts remain. 





2805439 posts, of which 37134 were dropped for (most likely) not being in English.
2768305 posts remain. 

2768305 posts, of which 126452 were dropped for being duplicates.
2641853 posts remain. 

POLITICS_2019_06.CSV 

3230702 posts, of which 15 were dropped for csv formatting errors.
3230687 posts remain. 

3230687 posts, of which 401270 were dropped for empty string content
2829417 posts remain. 

2829417 posts, of which 8138 were dropped for being just [URL] or [EMOJI]
2821279 posts remain. 





2821279 posts, of which 36408 were dropped for (most likely) not being in English.
2784871 posts remain. 

2784871 posts, of which 121287 were dropped for being duplicates.
2663584 posts remain. 

POLITICS_2019_07.CSV 

3626326 posts, of which 11 were dropped for csv formatting errors.
3626315 posts remain. 

3626315 posts, of which 466657 were dropped for empty string content
3159658 posts remain. 

3159658 posts, of which 9909 were dropped for being just [URL] or [EMOJI]
3149749 posts remain. 





3149749 posts, of which 37183 were dropped for (most likely) not being in English.
3112566 posts remain. 

3112566 posts, of which 133840 were dropped for being duplicates.
2978726 posts remain. 

POLITICS_2019_08.CSV 

3444676 posts, of which 0 were dropped for csv formatting errors.
3444676 posts remain. 

3444676 posts, of which 419196 were dropped for empty string content
3025480 posts remain. 

3025480 posts, of which 8994 were dropped for being just [URL] or [EMOJI]
3016486 posts remain. 





3016486 posts, of which 33286 were dropped for (most likely) not being in English.
2983200 posts remain. 

2983200 posts, of which 191225 were dropped for being duplicates.
2791975 posts remain. 

POLITICS_2019_09.CSV 

3120800 posts, of which 0 were dropped for csv formatting errors.
3120800 posts remain. 

3120800 posts, of which 373181 were dropped for empty string content
2747619 posts remain. 

2747619 posts, of which 7708 were dropped for being just [URL] or [EMOJI]
2739911 posts remain. 





2739911 posts, of which 30268 were dropped for (most likely) not being in English.
2709643 posts remain. 

2709643 posts, of which 178921 were dropped for being duplicates.
2530722 posts remain. 

POLITICS_2019_10.CSV 

3699779 posts, of which 0 were dropped for csv formatting errors.
3699779 posts remain. 

3699779 posts, of which 436860 were dropped for empty string content
3262919 posts remain. 

3262919 posts, of which 9231 were dropped for being just [URL] or [EMOJI]
3253688 posts remain. 





3253688 posts, of which 37992 were dropped for (most likely) not being in English.
3215696 posts remain. 

3215696 posts, of which 203320 were dropped for being duplicates.
3012376 posts remain. 

POLITICS_2019_11.CSV 

3396327 posts, of which 3 were dropped for csv formatting errors.
3396324 posts remain. 

3396324 posts, of which 399089 were dropped for empty string content
2997235 posts remain. 

2997235 posts, of which 8737 were dropped for being just [URL] or [EMOJI]
2988498 posts remain. 





2988498 posts, of which 35722 were dropped for (most likely) not being in English.
2952776 posts remain. 

2952776 posts, of which 184195 were dropped for being duplicates.
2768581 posts remain. 

POLITICS_2019_12.CSV 

3379381 posts, of which 0 were dropped for csv formatting errors.
3379381 posts remain. 

3379381 posts, of which 406572 were dropped for empty string content
2972809 posts remain. 

2972809 posts, of which 7891 were dropped for being just [URL] or [EMOJI]
2964918 posts remain. 





2964918 posts, of which 33710 were dropped for (most likely) not being in English.
2931208 posts remain. 

2931208 posts, of which 165088 were dropped for being duplicates.
2766120 posts remain. 

CPU times: user 1h 1min 58s, sys: 2min 43s, total: 1h 4min 41s
Wall time: 1h 5min 55s


In [12]:
%%time

# Clean and split the raw monthly csvs of the year, leaving out the listed months.

directory = '../../0_data/raw/labelled_reddit'

# file-name fragments of the months that are left out of this run
skip_months = ("_01", "_02")

for year in ["2017"]:
    for filename in sorted(os.listdir(directory)):
        # skip files that belong to other years
        if year not in filename:
            continue
        # skip the excluded months
        if any(month in filename for month in skip_months):
            continue
        print(filename.upper(), '\n')
        clean_split(directory, filename)

POLITICS_2017_03.CSV 

3303014 posts, of which 122 were dropped for csv formatting errors.
3302892 posts remain. 

3302892 posts, of which 292672 were dropped for empty string content
3010220 posts remain. 

3010220 posts, of which 15282 were dropped for being just [URL] or [EMOJI]
2994938 posts remain. 





2994938 posts, of which 42898 were dropped for (most likely) not being in English.
2952040 posts remain. 

2952040 posts, of which 150620 were dropped for being duplicates.
2801420 posts remain. 

POLITICS_2017_04.CSV 

2731936 posts, of which 132 were dropped for csv formatting errors.
2731804 posts remain. 

2731804 posts, of which 242901 were dropped for empty string content
2488903 posts remain. 

2488903 posts, of which 11307 were dropped for being just [URL] or [EMOJI]
2477596 posts remain. 





2477596 posts, of which 34900 were dropped for (most likely) not being in English.
2442696 posts remain. 

2442696 posts, of which 117405 were dropped for being duplicates.
2325291 posts remain. 

POLITICS_2017_05.CSV 

3437578 posts, of which 711 were dropped for csv formatting errors.
3436867 posts remain. 

3436867 posts, of which 310634 were dropped for empty string content
3126233 posts remain. 

3126233 posts, of which 15932 were dropped for being just [URL] or [EMOJI]
3110301 posts remain. 





3110301 posts, of which 41344 were dropped for (most likely) not being in English.
3068957 posts remain. 

3068957 posts, of which 159868 were dropped for being duplicates.
2909089 posts remain. 

POLITICS_2017_06.CSV 

3165048 posts, of which 771 were dropped for csv formatting errors.
3164277 posts remain. 

3164277 posts, of which 278264 were dropped for empty string content
2886013 posts remain. 

2886013 posts, of which 13024 were dropped for being just [URL] or [EMOJI]
2872989 posts remain. 





2872989 posts, of which 37151 were dropped for (most likely) not being in English.
2835838 posts remain. 

2835838 posts, of which 137284 were dropped for being duplicates.
2698554 posts remain. 

POLITICS_2017_07.CSV 

2987901 posts, of which 2037 were dropped for csv formatting errors.
2985864 posts remain. 

2985864 posts, of which 256067 were dropped for empty string content
2729797 posts remain. 

2729797 posts, of which 12225 were dropped for being just [URL] or [EMOJI]
2717572 posts remain. 





2717572 posts, of which 33097 were dropped for (most likely) not being in English.
2684475 posts remain. 

2684475 posts, of which 128233 were dropped for being duplicates.
2556242 posts remain. 

POLITICS_2017_08.CSV 

3154713 posts, of which 395 were dropped for csv formatting errors.
3154318 posts remain. 

3154318 posts, of which 287889 were dropped for empty string content
2866429 posts remain. 

2866429 posts, of which 12488 were dropped for being just [URL] or [EMOJI]
2853941 posts remain. 





2853941 posts, of which 32396 were dropped for (most likely) not being in English.
2821545 posts remain. 

2821545 posts, of which 129894 were dropped for being duplicates.
2691651 posts remain. 

POLITICS_2017_09.CSV 

2572942 posts, of which 554 were dropped for csv formatting errors.
2572388 posts remain. 

2572388 posts, of which 217458 were dropped for empty string content
2354930 posts remain. 

2354930 posts, of which 9210 were dropped for being just [URL] or [EMOJI]
2345720 posts remain. 





2345720 posts, of which 27318 were dropped for (most likely) not being in English.
2318402 posts remain. 

2318402 posts, of which 183248 were dropped for being duplicates.
2135154 posts remain. 

POLITICS_2017_10.CSV 

2943544 posts, of which 104 were dropped for csv formatting errors.
2943440 posts remain. 

2943440 posts, of which 256781 were dropped for empty string content
2686659 posts remain. 

2686659 posts, of which 11715 were dropped for being just [URL] or [EMOJI]
2674944 posts remain. 





2674944 posts, of which 31713 were dropped for (most likely) not being in English.
2643231 posts remain. 

2643231 posts, of which 122072 were dropped for being duplicates.
2521159 posts remain. 

POLITICS_2017_11.CSV 

2897622 posts, of which 83 were dropped for csv formatting errors.
2897539 posts remain. 

2897539 posts, of which 250902 were dropped for empty string content
2646637 posts remain. 

2646637 posts, of which 11121 were dropped for being just [URL] or [EMOJI]
2635516 posts remain. 





2635516 posts, of which 30881 were dropped for (most likely) not being in English.
2604635 posts remain. 

2604635 posts, of which 116869 were dropped for being duplicates.
2487766 posts remain. 

POLITICS_2017_12.CSV 

2960119 posts, of which 12 were dropped for csv formatting errors.
2960107 posts remain. 

2960107 posts, of which 248187 were dropped for empty string content
2711920 posts remain. 

2711920 posts, of which 11636 were dropped for being just [URL] or [EMOJI]
2700284 posts remain. 





2700284 posts, of which 30933 were dropped for (most likely) not being in English.
2669351 posts remain. 

2669351 posts, of which 115320 were dropped for being duplicates.
2554031 posts remain. 

CPU times: user 43min 24s, sys: 34 s, total: 43min 58s
Wall time: 44min 8s


In [13]:
%%time

# Clean and split the raw monthly csvs of the year, leaving out the listed months.

directory = '../../0_data/raw/labelled_reddit'

# file-name fragments of the months that are left out of this run
skip_months = ("_01", "_02", "_03", "_04", "_05")

for year in ["2018"]:
    for filename in sorted(os.listdir(directory)):
        # skip files that belong to other years
        if year not in filename:
            continue
        # skip the excluded months
        if any(month in filename for month in skip_months):
            continue
        print(filename.upper(), '\n')
        clean_split(directory, filename)

POLITICS_2018_06.CSV 

3323822 posts, of which 18 were dropped for csv formatting errors.
3323804 posts remain. 

3323804 posts, of which 323325 were dropped for empty string content
3000479 posts remain. 

3000479 posts, of which 10891 were dropped for being just [URL] or [EMOJI]
2989588 posts remain. 





2989588 posts, of which 34594 were dropped for (most likely) not being in English.
2954994 posts remain. 

2954994 posts, of which 124582 were dropped for being duplicates.
2830412 posts remain. 

POLITICS_2018_07.CSV 

3401185 posts, of which 15 were dropped for csv formatting errors.
3401170 posts remain. 

3401170 posts, of which 343222 were dropped for empty string content
3057948 posts remain. 

3057948 posts, of which 10793 were dropped for being just [URL] or [EMOJI]
3047155 posts remain. 





3047155 posts, of which 36792 were dropped for (most likely) not being in English.
3010363 posts remain. 

3010363 posts, of which 130880 were dropped for being duplicates.
2879483 posts remain. 

POLITICS_2018_08.CSV 

3246577 posts, of which 11 were dropped for csv formatting errors.
3246566 posts remain. 

3246566 posts, of which 326059 were dropped for empty string content
2920507 posts remain. 

2920507 posts, of which 10173 were dropped for being just [URL] or [EMOJI]
2910334 posts remain. 





2910334 posts, of which 33441 were dropped for (most likely) not being in English.
2876893 posts remain. 

2876893 posts, of which 124509 were dropped for being duplicates.
2752384 posts remain. 

POLITICS_2018_09.CSV 

3675475 posts, of which 18 were dropped for csv formatting errors.
3675457 posts remain. 

3675457 posts, of which 360601 were dropped for empty string content
3314856 posts remain. 

3314856 posts, of which 10788 were dropped for being just [URL] or [EMOJI]
3304068 posts remain. 





3304068 posts, of which 39317 were dropped for (most likely) not being in English.
3264751 posts remain. 

3264751 posts, of which 143181 were dropped for being duplicates.
3121570 posts remain. 

POLITICS_2018_10.CSV 

3960055 posts, of which 27 were dropped for csv formatting errors.
3960028 posts remain. 

3960028 posts, of which 424540 were dropped for empty string content
3535488 posts remain. 

3535488 posts, of which 11844 were dropped for being just [URL] or [EMOJI]
3523644 posts remain. 





3523644 posts, of which 45740 were dropped for (most likely) not being in English.
3477904 posts remain. 

3477904 posts, of which 165659 were dropped for being duplicates.
3312245 posts remain. 

POLITICS_2018_11.CSV 

3779792 posts, of which 150 were dropped for csv formatting errors.
3779642 posts remain. 

3779642 posts, of which 394409 were dropped for empty string content
3385233 posts remain. 

3385233 posts, of which 10719 were dropped for being just [URL] or [EMOJI]
3374514 posts remain. 





3374514 posts, of which 42068 were dropped for (most likely) not being in English.
3332446 posts remain. 

3332446 posts, of which 148790 were dropped for being duplicates.
3183656 posts remain. 

POLITICS_2018_12.CSV 

3200995 posts, of which 69 were dropped for csv formatting errors.
3200926 posts remain. 

3200926 posts, of which 334161 were dropped for empty string content
2866765 posts remain. 

2866765 posts, of which 9166 were dropped for being just [URL] or [EMOJI]
2857599 posts remain. 





2857599 posts, of which 34363 were dropped for (most likely) not being in English.
2823236 posts remain. 

2823236 posts, of which 117015 were dropped for being duplicates.
2706221 posts remain. 

CPU times: user 35min 11s, sys: 41.4 s, total: 35min 52s
Wall time: 36min
