# Create Random Train and Test Samples from Monthly Labelled Reddit (News) Corpora

In [1]:
import csv
import random
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Helper functions

In [2]:
def load_from_csv(filepath, month_sample_size):
    """Read one CSV file and return a label-stratified random subsample of it.

    Args:
        filepath: Path of the CSV file to read. The file must provide a
            ``label`` column, which is used for stratification.
        month_sample_size: Passed to ``train_test_split`` as ``train_size``.

    Returns:
        DataFrame holding the sampled rows; the remainder of the split is
        discarded.
    """
    month_df = pd.read_csv(filepath)

    # Only the first ("train") part of the split is the sample we want.
    split_parts = train_test_split(
        month_df,
        train_size=month_sample_size,
        stratify=month_df.label,
        random_state=123,  # fixed seed so the subsample is reproducible
    )
    return split_parts[0]

In [3]:
def create_sample(source_dir, mode, size, label, n_months=36):
    """Build one random, label-stratified sample across the monthly CSV files.

    Every file in ``source_dir`` whose name contains ``mode`` and ends with
    the mode-specific suffix ("40k.csv" for 'train', "10k.csv" otherwise) is
    subsampled via ``load_from_csv``. The monthly subsamples are combined and
    a stratified sample of exactly ``size`` rows is written to
    ``../../0_data/clean/labelled_reddit/total/{mode}_rand_{label}.csv``.

    Args:
        source_dir: Directory containing the monthly CSV files.
        mode: Substring selecting the files; 'train' selects the "40k.csv"
            files, any other value the "10k.csv" files.
        size: Number of rows in the exported sample.
        label: Tag used in the output file name (e.g. '1k').
        n_months: Number of months the sample is spread over; determines how
            many rows are drawn from each file. Defaults to 36.

    Raises:
        ValueError: If no file in ``source_dir`` matches ``mode``.
    """
    # Loop invariants: file suffix and per-month draw size.
    suffix = "40k.csv" if mode == 'train' else "10k.csv"
    # Divide by the month count to spread the sample across months; the +1
    # over-draws slightly so the final split can cut down to exactly `size`.
    month_sample_size = int(round(size / n_months, 0) + 1)

    monthly_samples = []
    for filename in sorted(os.listdir(source_dir)):
        if mode in filename and filename.endswith(suffix):
            print(f"  loading subsample from {filename}")
            monthly_samples.append(
                load_from_csv(
                    os.path.join(source_dir, filename),
                    month_sample_size=month_sample_size,
                )
            )

    if not monthly_samples:
        raise ValueError(
            f"no '{mode}' files ending in '{suffix}' found in {source_dir}"
        )

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing a frame inside the loop.
    comment_df = pd.concat(monthly_samples)

    # Keep 'clean_text' and 'label' as the leading columns, as they were when
    # the frames were appended onto an empty frame with those two columns.
    leading = ['clean_text', 'label']
    trailing = [col for col in comment_df.columns if col not in leading]
    comment_df = comment_df.reindex(columns=leading + trailing)

    export_df, _ = train_test_split(
        comment_df,
        train_size=size,
        stratify=comment_df.label,
        random_state=123,  # fixed seed so the export is reproducible
    )

    export_df.to_csv(f'../../0_data/clean/labelled_reddit/total/{mode}_rand_{label}.csv')

# Creating random training and test sets

In [4]:
%%time

# create training sets
mode = 'train'

for size, label in [(1000, '1k'), (4000, '4k'), (20000, '20k'), (40000, '40k'), (80000, '80k'), (160000, '160k'), (320000, '320k'), (640000, '640k')]:
    print(f"creating random {mode} set, size {label}")
    create_sample(source_dir = '../../0_data/clean/labelled_reddit/month_splits', mode = mode, size = size, label = label)

creating random train set, size 1k
  loading subsample from train_2017_03_40k.csv
  loading subsample from train_2017_04_40k.csv
  loading subsample from train_2017_05_40k.csv
  loading subsample from train_2017_06_40k.csv
  loading subsample from train_2017_07_40k.csv
  loading subsample from train_2017_08_40k.csv
  loading subsample from train_2017_09_40k.csv
  loading subsample from train_2017_10_40k.csv
  loading subsample from train_2017_11_40k.csv
  loading subsample from train_2017_12_40k.csv
  loading subsample from train_2018_01_40k.csv
  loading subsample from train_2018_02_40k.csv
  loading subsample from train_2018_03_40k.csv
  loading subsample from train_2018_04_40k.csv
  loading subsample from train_2018_05_40k.csv
  loading subsample from train_2018_06_40k.csv
  loading subsample from train_2018_07_40k.csv
  loading subsample from train_2018_08_40k.csv
  loading subsample from train_2018_09_40k.csv
  loading subsample from train_2018_10_40k.csv
  loading subsample from 

In [6]:
%%time

# create test sets
mode = 'test'

for size, label in [(1000, '1k'), (5000, '5k'), (10000, '10k'), (20000, '20k'), (40000, '40k')]:
    print(f"creating random {mode} set, size {label}")
    create_sample(source_dir = '../../0_data/clean/labelled_reddit/month_splits', mode = mode, size = size, label = label)

creating random test set, size 1k
  loading subsample from test_2017_03_10k.csv
  loading subsample from test_2017_04_10k.csv
  loading subsample from test_2017_05_10k.csv
  loading subsample from test_2017_06_10k.csv
  loading subsample from test_2017_07_10k.csv
  loading subsample from test_2017_08_10k.csv
  loading subsample from test_2017_09_10k.csv
  loading subsample from test_2017_10_10k.csv
  loading subsample from test_2017_11_10k.csv
  loading subsample from test_2017_12_10k.csv
  loading subsample from test_2018_01_10k.csv
  loading subsample from test_2018_02_10k.csv
  loading subsample from test_2018_03_10k.csv
  loading subsample from test_2018_04_10k.csv
  loading subsample from test_2018_05_10k.csv
  loading subsample from test_2018_06_10k.csv
  loading subsample from test_2018_07_10k.csv
  loading subsample from test_2018_08_10k.csv
  loading subsample from test_2018_09_10k.csv
  loading subsample from test_2018_10_10k.csv
  loading subsample from test_2018_11_10k.csv
