# Clean and Split Monthly Unlabelled Reddit (News) Corpora

In [1]:
import csv
import random
import os

# Helper functions

In [2]:
def load_from_txt(filepath, month_sample_size):
    """Return a reproducible random sample of the non-blank lines of a text file.

    The whole file is read, split into lines, and lines that are empty or
    whitespace-only are discarded before sampling. Sampling is done with a
    private ``random.Random(123)`` generator, so the same file always yields
    the same sample and the module-level ``random`` state is not modified.

    Args:
        filepath: Path of the text file to read.
        month_sample_size: Number of lines to draw, without replacement.

    Returns:
        A list of ``month_sample_size`` lines with line terminators removed.

    Raises:
        ValueError: If ``month_sample_size`` is negative or larger than the
            number of non-blank lines in the file.
    """
    # newline='' turns off newline translation; splitlines() does the
    # splitting itself. Trailing newlines only produce empty entries, which
    # the blank-line filter removes.
    with open(filepath, 'r', newline='') as f:
        lines = [x for x in f.read().splitlines() if x.strip()]

    if not 0 <= month_sample_size <= len(lines):
        raise ValueError(
            f"cannot sample {month_sample_size} lines from {filepath}: "
            f"file has {len(lines)} non-blank lines"
        )

    # A dedicated generator seeded with 123 draws the same sample as seeding
    # the global generator would, without disturbing other users of `random`.
    rng = random.Random(123)
    return rng.sample(lines, month_sample_size)

In [3]:
def create_sample(source_dir, mode, size, label, n_months=36,
                  output_dir='../../0_data/clean/unlabelled_reddit/total'):
    """Build a random sample of comments stratified across monthly files.

    Every ``.txt`` file in ``source_dir`` whose name contains ``mode`` (a
    substring test) contributes an equal-sized subsample obtained through
    ``load_from_txt``. From the pooled subsamples, ``size`` comments are drawn
    with a generator seeded with 123 and written to
    ``<output_dir>/<mode>_rand_<label>.txt``, each followed by ``"\\n \\n"``.

    Args:
        source_dir: Directory holding the monthly text files.
        mode: Substring that selects files, also used in the output file name.
        size: Number of comments to write.
        label: Tag used in the output file name.
        n_months: Number of monthly files the sample is spread over; the
            per-file subsample size is derived from it. Defaults to 36.
        output_dir: Directory the output file is written to. It is not
            created by this function.

    Raises:
        ValueError: If the pooled subsamples contain fewer than ``size``
            comments (for example when fewer than ``n_months`` files match).
    """
    # One extra comment per month on top of the rounded even share —
    # presumably so rounding down never leaves the pool smaller than `size`.
    month_sample_size = int(round(size / n_months, 0) + 1)

    comment_list = []
    for filename in sorted(os.listdir(source_dir)):
        if mode in filename and filename.endswith(".txt"):
            print(f"  loading subsample from {filename}")
            comment_list += load_from_txt(os.path.join(source_dir, filename), month_sample_size=month_sample_size)

    if size > len(comment_list):
        raise ValueError(
            f"cannot draw {size} comments for mode {mode!r}: only "
            f"{len(comment_list)} were loaded from {source_dir}"
        )

    # Draw the sample before opening the output so a sampling failure cannot
    # leave behind an empty or truncated file.
    sample = random.Random(123).sample(comment_list, size)

    with open(os.path.join(output_dir, f'{mode}_rand_{label}.txt'), 'w') as write_obj:
        print("  writing to text file")
        for text in sample:
            write_obj.write(text + "\n \n")

# Creating random training and test sets

In [4]:
%%time

# Build the random training set(s) from the monthly split files.
mode = 'train'
month_splits_dir = '../../0_data/clean/unlabelled_reddit/month_splits'

# Only the 10m variant is enabled; the pairs left disabled in the original
# cell were (1000000, '1m'), (2000000, '2m'), (5000000, '5m').
train_sizes = [(10000000, '10m')]

for size, label in train_sizes:
    print(f"creating random {mode} set, size {label}")
    create_sample(
        source_dir=month_splits_dir,
        mode=mode,
        size=size,
        label=label,
    )

creating random train set, size 10m
  loading subsample from train_2017_03_1m.txt
  loading subsample from train_2017_04_1m.txt
  loading subsample from train_2017_05_1m.txt
  loading subsample from train_2017_06_1m.txt
  loading subsample from train_2017_07_1m.txt
  loading subsample from train_2017_08_1m.txt
  loading subsample from train_2017_09_1m.txt
  loading subsample from train_2017_10_1m.txt
  loading subsample from train_2017_11_1m.txt
  loading subsample from train_2017_12_1m.txt
  loading subsample from train_2018_01_1m.txt
  loading subsample from train_2018_02_1m.txt
  loading subsample from train_2018_03_1m.txt
  loading subsample from train_2018_04_1m.txt
  loading subsample from train_2018_05_1m.txt
  loading subsample from train_2018_06_1m.txt
  loading subsample from train_2018_07_1m.txt
  loading subsample from train_2018_08_1m.txt
  loading subsample from train_2018_09_1m.txt
  loading subsample from train_2018_10_1m.txt
  loading subsample from train_2018_11_1m.tx

In [4]:
%%time

# Build the random test sets (one output file per size) from the monthly split files.
mode = 'test'
month_splits_dir = '../../0_data/clean/unlabelled_reddit/month_splits'

test_sizes = [
    (10000, '10k'),
    (20000, '20k'),
    (50000, '50k'),
]

for size, label in test_sizes:
    print(f"creating random {mode} set, size {label}")
    create_sample(
        source_dir=month_splits_dir,
        mode=mode,
        size=size,
        label=label,
    )

random test set, size 10k
random test set, size 20k
random test set, size 50k
