# Clean and Split Monthly Labelled Reddit (News) Corpora

In [1]:
import datetime as dt
import csv
import re
import fasttext
import emoji
import pandas as pd
import unicodedata
import os

from html import unescape
from sklearn.model_selection import train_test_split

# Helper functions

In [11]:
def load_csv_to_df(filepath):
    """Load a raw Reddit CSV dump into a DataFrame.

    Rows that contain any missing value are treated as CSV formatting errors
    and dropped; the number of dropped and remaining posts is printed.

    Args:
        filepath: path (or file-like object) handed to ``pd.read_csv``.
            The file must provide a ``created_utc`` column holding epoch
            seconds.

    Returns:
        The DataFrame without incomplete rows, with ``created_utc`` converted
        from epoch seconds to timezone-naive datetimes expressed in UTC.
    """
    df = pd.read_csv(filepath)

    # save number of documents
    n_docs = df.shape[0]

    # small number of csv formatting errors --> drop the affected posts
    # (df is local to this function, so dropping in place is safe)
    df.dropna(inplace=True)

    print(f'{n_docs} posts, of which {n_docs - df.shape[0]} were dropped for csv formatting errors.')
    print(f'{df.shape[0]} posts remain. \n')

    # convert epoch seconds to readable datetime; pd.to_datetime interprets
    # the epoch as UTC, whereas datetime.fromtimestamp() without a tz would
    # silently shift every timestamp into the local timezone of the machine
    df['created_utc'] = pd.to_datetime(df['created_utc'].astype('int64'), unit='s')

    return df

In [3]:
def clean_text(text, max_length = 1024):
    """Normalise one raw post body into a single cleaned line of text.

    Steps, in order: unescape HTML entities, NFKD-normalise and drop
    zero-width joiners, truncate to ``max_length``, turn newlines/tabs into
    spaces, replace URLs with ``[URL]`` and emoji characters with ``[EMOJI]``,
    remove ``[deleted]`` / ``[removed]`` placeholders, strip leading ``>``,
    blank out posts that contain a known bot marker, collapse whitespace,
    truncate again and strip.

    Args:
        text: raw post body.
        max_length: maximum number of characters kept (applied both before
            and after the token replacements).

    Returns:
        The cleaned string; ``""`` when nothing is left or a bot marker
        was found.
    """
    # substrings that identify bot posts
    bot_markers = (
        "the Andromeda Galaxy", "This bot was created",             # The_Donald bots
        "I'm a bot",                                                # politics bots
        "^bot", "transcribing bot", "isbot ",                       # chapo / libertarian bots
        "This bot wants to",                                        # libertarian bots
    )

    # convert html, then clean unicode formatting errors
    cleaned = unicodedata.normalize("NFKD", unescape(text))
    cleaned = cleaned.replace('\u200d', '')

    # truncate text to max_length
    cleaned = cleaned[:max_length]

    # remove newline and tab characters
    for control_char in ('\n', '\t'):
        cleaned = cleaned.replace(control_char, ' ')

    # replace URLs with a special token
    cleaned = re.sub(r"http\S+", '[URL]', cleaned)

    # replace emojis with a special token (checked character by character)
    # NOTE(review): emoji.UNICODE_EMOJI appears to be unavailable in newer
    # releases of the emoji package -- confirm the pinned version
    if cleaned:
        emoji_chars = emoji.UNICODE_EMOJI['en']
        cleaned = ''.join(['[EMOJI]' if symbol in emoji_chars else symbol for symbol in cleaned])
    cleaned = cleaned.strip()

    # remove deleted posts
    for placeholder in ('[deleted]', '[removed]'):
        cleaned = cleaned.replace(placeholder, '')

    # remove leading ">" (reddit artifact)
    cleaned = cleaned.lstrip('>')

    # a post containing any bot marker is discarded entirely
    if any(marker in cleaned for marker in bot_markers):
        return ""

    # collapse whitespace into single whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # truncate to max_length again, then remove leading and trailing whitespaces
    return cleaned[:max_length].strip()

In [4]:
def drop_empty(df):
    """Return the posts whose ``clean_text`` is not the empty string.

    Prints how many posts were dropped and how many remain.

    Args:
        df: DataFrame with a ``clean_text`` column.

    Returns:
        A filtered DataFrame; ``df`` itself is not modified.
    """
    total = len(df)

    # keep only rows that still carry some text
    has_content = df.clean_text.values != ""
    kept = df[has_content]
    dropped = total - len(kept)

    print(f'{total} posts, of which {dropped} were dropped for empty string content')
    print(f'{len(kept)} posts remain. \n')

    return kept

In [5]:
def drop_url_emoji(df):
    """Return the posts whose ``clean_text`` is more than a lone placeholder.

    A post is dropped when its cleaned text is exactly ``[URL]`` or exactly
    ``[EMOJI]``. Prints how many posts were dropped and how many remain.

    Args:
        df: DataFrame with a ``clean_text`` column.

    Returns:
        A filtered DataFrame; ``df`` itself is not modified.
    """
    total = len(df)

    # rows consisting solely of one special token carry no usable content
    placeholder_only = df.clean_text.isin(["[URL]", "[EMOJI]"])
    kept = df[~placeholder_only]
    dropped = total - len(kept)

    print(f'{total} posts, of which {dropped} were dropped for being just [URL] or [EMOJI]')
    print(f'{len(kept)} posts remain. \n')

    return kept

In [6]:
def drop_non_english(df):
    """Return the posts whose ``clean_text`` is (most likely) English.

    Each text is scored with the fastText language-identification model
    loaded from ``../../0_models/lang_detect/lid.176.bin`` (top 3 predictions).
    Prints how many posts were dropped and how many remain.

    Args:
        df: DataFrame with a ``clean_text`` column.

    Returns:
        A filtered DataFrame; ``df`` itself is not modified.
    """
    english_label = '__label__en'

    # load language classifier
    lang_model = fasttext.load_model('../../0_models/lang_detect/lid.176.bin')

    def is_english(text):
        result = lang_model.predict(text, k=3)
        labels, scores = result[0], result[1]

        # English is the top prediction
        if labels[0] == english_label:
            return True

        # top prediction is certain and not English
        if scores[0] > 0.50:
            return False

        # uncertain top prediction: accept if English is among the top 3
        return english_label in labels

    total = len(df)

    # drop non-English posts
    kept = df[df.clean_text.apply(is_english)]
    dropped = total - len(kept)

    print(f'{total} posts, of which {dropped} were dropped for (most likely) not being in English.')
    print(f'{len(kept)} posts remain. \n')

    return kept

In [7]:
def drop_dupl(df):
    """Return the posts with duplicate ``clean_text`` values removed.

    The first occurrence of each text is kept. Prints how many posts were
    dropped and how many remain.

    Args:
        df: DataFrame with a ``clean_text`` column.

    Returns:
        A de-duplicated DataFrame. Like the other ``drop_*`` helpers this
        returns a new frame and leaves ``df`` untouched; dropping in place
        would mutate the caller's frame, which is typically a filtered slice.
    """
    # save number of documents before dropping duplicates
    n_docs = df.shape[0]

    # drop duplicates (keeps the first occurrence of each text)
    deduped = df.drop_duplicates(subset=['clean_text'])

    print(f'{n_docs} posts, of which {n_docs - deduped.shape[0]} were dropped for being duplicates.')
    print(f'{deduped.shape[0]} posts remain. \n')

    return deduped

In [8]:
# create train and test splits for each month-year

def split_export(df, filename, out_dir='../../0_data/clean/labelled_reddit/month_splits'):
    """Split cleaned posts into per-subreddit train/test sets and write them as CSVs.

    For every subreddit in ``df`` 8000 train and 2000 test posts are drawn
    (``random_state=123``). The combined sets are written shuffled, together
    with smaller label-stratified subsamples: train at 100% / 50% / 10% / 2.5%
    and test at 100% / 50%. The ``subreddit`` column is exported as ``label``.

    Args:
        df: DataFrame with ``clean_text`` and ``subreddit`` columns.
        filename: name of the raw input file; ``filename[9:-4]`` is used as
            the month tag in the output names.
            NOTE(review): this slice presumably assumes a 9-character prefix
            such as ``politics_`` and a ``.csv`` suffix -- confirm for other
            input names.
        out_dir: directory the CSV files are written to; must already exist.

    Raises:
        ValueError: if ``df`` contains no subreddits. ``train_test_split`` is
            asked for 8000 + 2000 rows per subreddit, so it is presumably
            also raised when a subreddit has fewer posts than that.
    """
    seed = 123
    month = filename[9:-4]

    def _export(frame, split, size_tag):
        # shuffle the rows so the file is not ordered by label, then write
        # NOTE(review): the size tags (40k, 10k, ...) are fixed strings; they
        # match the row counts only for five subreddits -- confirm
        frame.sample(frac=1, random_state=seed).to_csv(f'{out_dir}/{split}_{month}_{size_tag}.csv', index=False)

    def _subsample(frame, frac):
        # draw the same fraction from every label; groups are visited in
        # sorted label order and each group is sampled with the same seed
        return pd.concat([group.sample(frac=frac, random_state=seed)
                          for _, group in frame.groupby('label')])

    train_parts, test_parts = [], []

    for subreddit in pd.unique(df.subreddit):

        add_train, add_test = train_test_split(df.loc[df.subreddit == subreddit, ['clean_text', 'subreddit']],
                                               train_size=8000, test_size=2000,
                                               random_state=seed)

        train_parts.append(add_train)
        test_parts.append(add_test)

    if not train_parts:
        raise ValueError('df contains no subreddits to split')

    # DataFrame.append is no longer available in current pandas; concatenate
    # once instead, then rename the subreddit column
    export_train = pd.concat(train_parts).rename(columns={'subreddit': 'label'})
    export_test = pd.concat(test_parts).rename(columns={'subreddit': 'label'})

    # export train sets of different sizes
    _export(export_train, 'train', '40k')
    print(f'train set - unique labels: {pd.unique(export_train.label)}')

    for frac, size_tag in ((0.5, '20k'), (0.1, '4k'), (0.025, '1k')):
        _export(_subsample(export_train, frac), 'train', size_tag)

    # export test sets of different sizes
    _export(export_test, 'test', '10k')
    print(f'test set - unique labels: {pd.unique(export_test.label)}')

    _export(_subsample(export_test, 0.5), 'test', '5k')

# Main function

In [9]:
def clean_split(directory, filename):
    """Run the full cleaning pipeline on one raw CSV and export the splits.

    Loads ``directory/filename``, writes the cleaned ``body`` to a
    ``clean_text`` column, applies the drop filters in order (empty text,
    lone ``[URL]``/``[EMOJI]``, non-English, duplicates) and hands the result
    to ``split_export``.

    Args:
        directory: folder containing the raw CSV.
        filename: name of the raw CSV inside ``directory``.

    Returns:
        A short status message naming the processed file.
    """
    # load df from csv
    posts = load_csv_to_df(os.path.join(directory, filename))

    # clean text ("body") and write to column
    posts['clean_text'] = posts.body.apply(clean_text)

    # filters are applied one after another; each returns the surviving posts
    filter_steps = (
        drop_empty,        # posts with empty string
        drop_url_emoji,    # posts that are just [URL] or [EMOJI]
        drop_non_english,  # posts that are likely not English
        drop_dupl,         # duplicates
    )
    for step in filter_steps:
        posts = step(posts)

    # export to train and test file
    split_export(posts, filename)

    return f'wrote cleaned train and test file from {filename}'

In [None]:
%%time

# load raw data from csvs, clean it and split it into train and test sets

directory = '../../0_data/raw/labelled_reddit'

# NOTE(review): the substring test below also selects files such as
# 'sub_politics_2020_01.csv' -- confirm that this is intended
for year in ["politics_2020_01"]:
    for filename in sorted(os.listdir(directory)):
        # skip files that do not belong to the requested subreddit-month
        if year not in filename:
            continue

        print(filename.upper(), '\n')
        clean_split(directory, filename)

POLITICS_2020_01.CSV 

3676764 posts, of which 9 were dropped for csv formatting errors.
3676755 posts remain. 

3676755 posts, of which 220945 were dropped for empty string content
3455810 posts remain. 

3455810 posts, of which 12048 were dropped for being just [URL] or [EMOJI]
3443762 posts remain. 





3443762 posts, of which 36022 were dropped for (most likely) not being in English.
3407740 posts remain. 

3407740 posts, of which 203866 were dropped for being duplicates.
3203874 posts remain. 

number of unique labels: ['politics' 'The_Donald' 'Libertarian' 'ChapoTrapHouse' 'Conservative']
number of unique labels: ['politics' 'The_Donald' 'Libertarian' 'ChapoTrapHouse' 'Conservative']
SUB_POLITICS_2020_01.CSV 



ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/Paul/opt/miniconda3/envs/language_change/lib/python3.8/site-packages/IPython/core/magics/execution.py", line 1321, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 10, in <module>
  File "<ipython-input-9-4a0c74985fca>", line 6, in clean_split
    df = load_csv_to_df(filepath)
  File "<ipython-input-11-62451ca7f133>", line 3, in load_csv_to_df
    df = pd.read_csv(filepath)
  File "/Users/Paul/opt/miniconda3/envs/language_change/lib/python3.8/site-packages/pandas/io/parsers.py", line 610, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/Users/Paul/opt/miniconda3/envs/language_change/lib/python3.8/site-packages/pandas/io/parsers.py", line 468, in _read
    return parser.read(nrows)
  File "/Users/Paul/opt/miniconda3/envs/language_change/lib/python3.8/site-packages/pandas/io/parsers.py", line 1057, in read
    index, columns, col_dict = self._engine.read(nrows)
  File "/Users/Paul/opt/miniconda3/envs/langu

TypeError: object of type 'NoneType' has no len()