In [1]:
import datetime as dt
import csv
import re
import fasttext
import emoji
import pandas as pd
import unicodedata

from psaw import PushshiftAPI
from html import unescape
from sklearn.model_selection import train_test_split

In [2]:
# psaw client; the scraping loop below calls api.search_comments() on it
api = PushshiftAPI()

In [3]:
%%time

# Collect one frame per (subreddit, month, day) query and concatenate once at
# the end. DataFrame.append re-copied the whole accumulated frame on every call
# and was removed in pandas 2.0.
sample_columns = ['created_utc', 'body', 'subreddit']
sample_frames = []

for subreddit in ['the_donald', 'libertarian', 'conservative', 'politics', 'chapotraphouse']:
    print(subreddit.upper())
    for year, month in [(2017, 8), (2018, 2), (2018, 8), (2019, 2), (2019, 8), (2020, 2)]:
        print(year, month)
        # days 1-28 exist in every sampled month, including February
        for day in range(1, 29):
            # noon on the given day; the datetime is naive, so .timestamp()
            # interprets it in the local timezone of the machine running this cell
            sample_time_epoch = int(dt.datetime(year = year, month = month, day = day, hour = 12, minute = 0 ).timestamp())

            # up to 300 comments posted before the sample time
            comments = pd.DataFrame(api.search_comments(before = sample_time_epoch,
                                                        subreddit = subreddit,
                                                        filter = ['body', 'subreddit'],
                                                        limit = 300))

            # an empty result has no columns to select from; report it instead of raising KeyError
            if comments.empty:
                print(f'no comments returned for {subreddit} before {year}-{month:02d}-{day:02d} 12:00')
                continue

            sample_frames.append(comments[sample_columns])
    print()

# fall back to an empty frame with the expected columns if nothing was fetched
sample_df = pd.concat(sample_frames) if sample_frames else pd.DataFrame(columns = sample_columns)

THE_DONALD
2017 8
2018 2
2018 8
2019 2
2019 8
2020 2

LIBERTARIAN
2017 8
2018 2
2018 8
2019 2
2019 8
2020 2

CONSERVATIVE
2017 8
2018 2




2018 8
2019 2
2019 8
2020 2

POLITICS
2017 8
2018 2
2018 8
2019 2
2019 8
2020 2

CHAPOTRAPHOUSE
2017 8
2018 2
2018 8
2019 2
2019 8
2020 2

CPU times: user 1min 46s, sys: 8.34 s, total: 1min 54s
Wall time: 51min 54s


In [4]:
# work on a copy so the scraped sample_df is not modified by the processing below
reddit_df = sample_df.copy()

In [5]:
# turn the epoch seconds stored in created_utc into datetime objects
# NOTE(review): fromtimestamp() yields local time of the machine running this cell, not UTC
reddit_df['created_utc'] = reddit_df['created_utc'].apply(lambda epoch: dt.datetime.fromtimestamp(epoch))

# add a monthly period column (monthyear) derived from the converted timestamps
reddit_df['monthyear'] = reddit_df['created_utc'].apply(lambda stamp: stamp.to_period('M'))

In [6]:
# sanity checks:
def sanity_check(df):
    """Print row counts of `df` per subreddit, per month, per month and subreddit, and per day.

    Expects the columns 'subreddit', 'monthyear' and 'created_utc'; the values
    of 'created_utc' must offer a `to_period` method. The four count tables are
    printed one after another, separated by blank lines. Returns None.
    """
    # rows per subreddit
    print(df.groupby('subreddit')['subreddit'].count(), end='\n\n')

    # rows per month
    print(df.groupby('monthyear')['monthyear'].count(), end='\n\n')

    # rows per month and subreddit
    print(df.groupby(['monthyear', 'subreddit'])['monthyear'].count(), end='\n\n')

    # rows per calendar day of the creation timestamp
    creation_day = df['created_utc'].apply(lambda stamp: stamp.to_period('D'))
    print(df.groupby(creation_day)['created_utc'].count())
    
# print the per-subreddit, per-month and per-day counts of the raw sample
sanity_check(reddit_df)

subreddit
ChapoTrapHouse    50400
Conservative      50400
Libertarian       50400
The_Donald        50400
politics          50400
Name: subreddit, dtype: int64

monthyear
2017-08    42000
2018-02    42000
2018-08    42000
2019-02    42000
2019-08    42000
2020-02    42000
Freq: M, Name: monthyear, dtype: int64

monthyear  subreddit     
2017-08    ChapoTrapHouse    8400
           Conservative      8400
           Libertarian       8400
           The_Donald        8400
           politics          8400
2018-02    ChapoTrapHouse    8400
           Conservative      8400
           Libertarian       8400
           The_Donald        8400
           politics          8400
2018-08    ChapoTrapHouse    8400
           Conservative      8400
           Libertarian       8400
           The_Donald        8400
           politics          8400
2019-02    ChapoTrapHouse    8400
           Conservative      8400
           Libertarian       8400
           The_Donald        8400
           poli

## Clean text
Each comment is truncated to `max_length` characters to avoid super-long / spam comments.

In [7]:
%%time

# Define function to clean text
def clean(text, max_length = 1024):
    """Normalise a raw comment body into a single-line string.

    Steps, in order: unescape HTML entities, apply NFKD unicode normalisation
    and drop zero-width joiners, truncate to `max_length` characters, turn
    newlines and tabs into spaces, replace URLs with '[URL]' and emoji
    characters with '[EMOJI]', delete '[removed]' markers, collapse runs of
    whitespace, truncate to `max_length` again and strip surrounding whitespace.

    Args:
        text: raw comment text (str).
        max_length: maximum number of characters kept; applied once before and
            once after the token replacements.

    Returns:
        The cleaned string, which may be empty.
    """
    # emoji 1.x exposes UNICODE_EMOJI keyed by language; that attribute was
    # removed in emoji 2.0, where EMOJI_DATA is the lookup table keyed by emoji.
    # Resolving the table once here also avoids a lookup per character below.
    if hasattr(emoji, 'UNICODE_EMOJI'):
        emoji_chars = emoji.UNICODE_EMOJI['en']
    else:
        emoji_chars = emoji.EMOJI_DATA

    # convert html
    text = unescape(text)

    # clean unicode formatting errors
    text = unicodedata.normalize("NFKD", text)
    text = text.replace('\u200d', '')

    # truncate text to max_length
    text = text[:max_length]

    # remove newline and tab characters
    text = text.replace('\n',' ')
    text = text.replace('\t',' ')

    # replace URLs and emojis with special tokens
    text = re.sub(r"http\S+",'[URL]',text)
    text = ''.join('[EMOJI]' if (char in emoji_chars) else char for char in text).strip()

    # remove deleted posts
    text = text.replace('[removed]','')

    # collapse whitespace into single whitespace
    text = re.sub(r'\s+', ' ', text)

    # truncate text to max_length again (token replacement can lengthen the text)
    text = text[:max_length]

    # remove leading and trailing whitespaces
    text = text.strip()

    return text

# run clean() over every comment body and keep the result in a new clean_text column
reddit_df['clean_text'] = reddit_df['body'].apply(clean)

CPU times: user 10.5 s, sys: 60.7 ms, total: 10.5 s
Wall time: 10.6 s


In [8]:
%%time 

# remember how many posts there are before the empty ones are removed
n_docs = len(reddit_df)

# keep only the rows whose cleaned text is not the empty string
reddit_df = reddit_df.loc[reddit_df['clean_text'].values != ""]

print(f'{n_docs} posts, of which {n_docs - len(reddit_df)} were dropped for empty string content')
print(f'{len(reddit_df)} posts remain. \n')

252000 posts, of which 12362 were dropped for empty string content
239638 posts remain. 

CPU times: user 87.6 ms, sys: 10.1 ms, total: 97.7 ms
Wall time: 98.2 ms


In [9]:
%%time

# remember how many posts there are before removing posts that are just [URL] or [EMOJI]
n_docs = len(reddit_df)

# keep only the rows whose cleaned text is neither exactly "[URL]" nor exactly "[EMOJI]"
reddit_df = reddit_df[~reddit_df['clean_text'].isin(["[URL]", "[EMOJI]"])]

print(f'{n_docs} posts, of which {n_docs - len(reddit_df)} were dropped for being just [URL] or [EMOJI]')
print(f'{len(reddit_df)} posts remain. \n')

239638 posts, of which 989 were dropped for being just [URL] or [EMOJI]
238649 posts remain. 

CPU times: user 92.5 ms, sys: 4.22 ms, total: 96.7 ms
Wall time: 95.9 ms


In [10]:
%%time

# check language
# load the fastText model from the local lid.176.bin file; check_language() below calls fmodel.predict()
fmodel = fasttext.load_model('../../0_models/lang_detect/lid.176.bin')

def check_language(text):
    """Label `text` as 'English' or 'non-English' using the global fastText model `fmodel`.

    The three most likely labels are requested from the model. The text is
    'non-English' when the top label is not '__label__en' and its score is
    above 0.50, or when '__label__en' is not among the three labels.
    Otherwise it is 'English'.
    """
    labels, scores = fmodel.predict(text, k=3)

    # a confident non-English top prediction settles it
    if labels[0] != '__label__en' and scores[0] > 0.50:
        return 'non-English'

    # otherwise English anywhere in the top three is enough
    return 'English' if '__label__en' in labels else 'non-English'

# remember how many posts there are before the non-English ones are removed
n_docs = len(reddit_df)

# keep only the rows that check_language() labels as English
reddit_df = reddit_df[reddit_df['clean_text'].apply(lambda post: check_language(post) == 'English')]

print(f'{n_docs} posts, of which {n_docs - len(reddit_df)} were dropped for (most likely) not being in English.')
print(f'{len(reddit_df)} posts remain. \n')



238649 posts, of which 2719 were dropped for (most likely) not being in English.
235930 posts remain. 

CPU times: user 9.69 s, sys: 116 ms, total: 9.8 s
Wall time: 9.9 s


In [11]:
# remember how many posts there are before duplicates are removed
n_docs = len(reddit_df)

# keep the first occurrence of every distinct clean_text value
reddit_df = reddit_df.drop_duplicates(subset = ['clean_text'])

print(f'{n_docs} posts, of which {n_docs - len(reddit_df)} were dropped for being duplicates.')
print(f'{len(reddit_df)} posts remain. \n')

235930 posts, of which 14221 were dropped for being duplicates.
221709 posts remain. 



## Export to CSVs

In [18]:
# create train and test splits for each month-year
for monthyear in pd.unique(reddit_df['monthyear']):

    # Per-subreddit splits are collected and concatenated once per month;
    # DataFrame.append was removed in pandas 2.0.
    train_parts = []
    test_parts = []

    for subreddit in pd.unique(reddit_df['subreddit']):

        # 4000 train / 1000 test posts per (month, subreddit) group, drawn with a fixed seed
        in_group = (reddit_df['monthyear']==monthyear)&(reddit_df['subreddit']==subreddit)
        add_train, add_test = train_test_split(reddit_df.loc[in_group, ['clean_text', 'subreddit']],
                                               train_size = 4000, test_size = 1000,
                                               random_state = 123)

        train_parts.append(add_train)
        test_parts.append(add_test)

    export_train = pd.concat(train_parts)
    export_test = pd.concat(test_parts)

    # last five characters of the period string, e.g. '17-08' for 2017-08
    file_suffix = str(monthyear)[-5:]

    # shuffle so subreddits are interleaved; the shuffle is seeded like the split so
    # re-running the cell writes identical files
    export_train.sample(frac=1, random_state=123).rename(columns={'subreddit': 'label'}).to_csv(f'../../0_data/clean/labelled_reddit/train-{file_suffix}.csv', index=False)
    export_test.sample(frac=1, random_state=123).rename(columns={'subreddit': 'label'}).to_csv(f'../../0_data/clean/labelled_reddit/test-{file_suffix}.csv', index=False)

In [13]:
# number of remaining posts for every combination of month and subreddit
print(reddit_df.groupby(['monthyear', 'subreddit'])['monthyear'].count())

monthyear  subreddit     
2017-08    ChapoTrapHouse    7559
           Conservative      7097
           Libertarian       7764
           The_Donald        7256
           politics          7308
2018-02    ChapoTrapHouse    7497
           Conservative      6867
           Libertarian       7797
           The_Donald        7434
           politics          7423
2018-08    ChapoTrapHouse    7015
           Conservative      6406
           Libertarian       7647
           The_Donald        7187
           politics          7062
2019-02    ChapoTrapHouse    7584
           Conservative      6664
           Libertarian       8152
           The_Donald        7995
           politics          7579
2019-08    ChapoTrapHouse    7287
           Conservative      6773
           Libertarian       8035
           The_Donald        7046
           politics          7496
2020-02    ChapoTrapHouse    7912
           Conservative      6994
           Libertarian       8137
           The_Donald 