In [129]:
import pandas as pd
import re
import datetime as dt
import csv
from os import listdir
from os.path import isfile, join


# Start date = date that r/trp was created
start = dt.datetime.strptime('2012-10-25', '%Y-%m-%d').date()

# End date = date that r/trp was quarantined
end = dt.datetime.strptime('2018-09-01', '%Y-%m-%d').date()


def _select_columns(data, preferred, fallback):
    """Return ``(subset, columns_used)``: `preferred` columns if all exist, else `fallback`."""
    try:
        return data[preferred], preferred
    except KeyError:
        return data[fallback], fallback


def _derive_dates_and_filter(frame):
    """Add the derived date columns and keep rows dated within [start, end] (inclusive)."""
    datetime_dv = pd.to_datetime(frame['created_utc'], unit = 's')  # dv = derived
    frame = frame.assign(datetime_dv = datetime_dv, date_dv = datetime_dv.dt.date)
    in_window = (frame['date_dv'] >= start) & (frame['date_dv'] <= end)
    return frame.loc[in_window]


def clean_comments(subreddit_folder):
    """Clean every JSON file in `subreddit_folder` and write two CSVs.

    Each file is read, reduced to the columns of interest, given derived
    ``datetime_dv`` / ``date_dv`` columns (from ``created_utc``, in seconds),
    filtered to the module-level ``start``..``end`` window (inclusive), and
    written to:

    * ``data/processed/comments/<name>-metadata.csv``
    * ``data/processed/comments/<name>-text.csv``

    where ``<name>`` is the part of the folder name that precedes ``-all``.
    Files are processed one at a time so each row is written exactly once,
    and the output files are overwritten (not extended) at the start of a
    run, so re-running the function does not duplicate rows or headers.

    Parameters
    ----------
    subreddit_folder : str
        Path of a folder of JSON files; its name must contain ``-all``.

    Returns
    -------
    list of str
        The metadata columns that were last selected (the comment columns,
        or the submission-style fallback if a file lacked them).

    Raises
    ------
    ValueError
        If `subreddit_folder` does not contain ``-all``.
    KeyError
        If a file has neither the preferred nor the fallback columns.
    """
    # TODO: consider splitting into separate comment / submission cleaners,
    # since comments are so much bigger.

    regex = r"([^\/]+)(?=\-all)"
    matches = re.search(regex, subreddit_folder)
    if matches is None:
        raise ValueError(
            "Cannot derive an output name from %r: expected '-all' in the folder name"
            % subreddit_folder)
    new_file = matches.group()

    # Comment files: regular, non-hidden files only (hidden files such as
    # '.DS_Store' are not data), sorted so the output order is reproducible.
    comment_files = sorted(
        f for f in listdir(subreddit_folder)
        if isfile(join(subreddit_folder, f)) and not f.startswith(".")
    )

    # Columns to keep, plus the fallbacks used when a file lacks them
    keep_cols = ['id', 'created_utc', 'author',
                 'author_flair_text', 'score', 'parent_id',
                 'subreddit']
    fallback_cols = ['id', 'created_utc', 'author', 'title',
                     'score', 'num_comments', 'subreddit']
    keep_cols_text = ['id', 'created_utc', 'parent_id', 'body']
    fallback_cols_text = ['id', 'created_utc', 'author']

    # Create file names
    processedfile_csv = "data/processed/comments/" + new_file + \
        "-metadata" + ".csv"
    processed_textfile_csv = "data/processed/comments/" + new_file + \
        "-text" + ".csv"

    # True until the first chunk is written: that write truncates the file
    # and emits the header; every later write appends without a header.
    first_write = True

    for i in comment_files:
        file_path = join(subreddit_folder, i)

        try:
            data = pd.read_json(file_path)
        # ValueError: Trailing data thrown if file is pretty indented
        except ValueError:
            data = pd.read_json(file_path, lines = True)

        # Make sure there's at least 1 observation in *this* file
        if len(data) == 0:
            print("No comments found in " + i)
            continue

        # Once a fallback is needed it stays in effect for the remaining files
        df_keep, keep_cols = _select_columns(data, keep_cols, fallback_cols)
        df_keep_text, keep_cols_text = _select_columns(
            data, keep_cols_text, fallback_cols_text)

        # Change date format and only keep data within the date frame
        df_keep = _derive_dates_and_filter(df_keep)
        df_keep_text = _derive_dates_and_filter(df_keep_text)

        # Save to csv
        write_mode = "w" if first_write else "a"
        df_keep.to_csv(processedfile_csv, mode = write_mode, header = first_write)
        df_keep_text.to_csv(processed_textfile_csv, mode = write_mode, header = first_write)
        first_write = False

    return keep_cols

## TODO: update so that i'm adding to comment file in the same way i scrape it for the big r/TRP scrape
# with open(processedfile_csv, 'a', encoding = 'utf-8') as fp:
#         json.dump(obj.d_, fp, ensure_ascii = False) # write file
#         fp.write('\n')


In [5]:
# Create subdirectory for each subreddit

## Step 1: Get names for new folders
from glob import glob
comment_files = glob("data/raw/comments/*.json")

# Captures the last path component without its ".json" extension
regex = r"([^\/]+)(?=\.json$)"

# One folder name per raw comment file, in the order glob returned them
filenames = [re.search(regex, raw_path).group() for raw_path in comment_files]

In [6]:
# Inspect the folder names extracted from the raw comment file paths
filenames

['EthnicRedPill-allcomments-2020-08-20',
 'RedPillParenting-allcomments-2020-08-20',
 'GEOTRP-allcomments-2020-08-20',
 'FeMRADebates-allcomments-2020-08-20',
 'NOMAAM-allcomments-2020-08-21',
 'MensRights-allcomments-2020-08-20',
 'Egalitarianism-allcomments-2020-08-20',
 'AskFeminists-allcomments-2020-08-20',
 'MGTOW-allcomments-2020-08-20',
 'TRPOffTopic-allcomments-2020-08-20',
 'RedPillNonMonogamy-allcomments-2020-08-20',
 'askseddit-allcomments-2020-08-20',
 'badwomensanatomy-allcomments-2020-08-20',
 'TRPmemes-allcomments-2020-08-20',
 'IncelsWithoutHate-allcomments-2020-08-20',
 'RedPillLit-allcomments-2020-08-20',
 'MRActivism-allcomments-2020-08-21',
 'altTRP-allcomments-2020-08-20',
 'marriedredpill-allcomments-2020-08-20',
 'PurplePillDebate-allcomments-2020-08-21',
 'askTRP-allcomments-2020-08-19',
 'seduction-allcomments-2020-08-21',
 'Trufemcels-allcomments-2020-08-22',
 'mgtowbooks-allcomments-2020-08-21',
 'redpillfatherhood-allcomments-2020-08-20',
 'theRedPillLeft-al

In [11]:
## Step 2: Create directories with file names
import os

# Make one folder per raw comment file and report the outcome of each attempt
for folder_name in filenames:
    path = "data/raw/comments/" + folder_name

    try:
        os.mkdir(path)
    except OSError:
        # Raised, for example, when the directory already exists
        print ("Creation of the directory %s failed" % path)
        continue

    print ("Successfully created the directory %s" % path)

Successfully created the directory data/raw/comments/EthnicRedPill-allcomments-2020-08-20
Successfully created the directory data/raw/comments/RedPillParenting-allcomments-2020-08-20
Successfully created the directory data/raw/comments/GEOTRP-allcomments-2020-08-20
Successfully created the directory data/raw/comments/FeMRADebates-allcomments-2020-08-20
Successfully created the directory data/raw/comments/NOMAAM-allcomments-2020-08-21
Successfully created the directory data/raw/comments/MensRights-allcomments-2020-08-20
Successfully created the directory data/raw/comments/Egalitarianism-allcomments-2020-08-20
Successfully created the directory data/raw/comments/AskFeminists-allcomments-2020-08-20
Successfully created the directory data/raw/comments/MGTOW-allcomments-2020-08-20
Successfully created the directory data/raw/comments/TRPOffTopic-allcomments-2020-08-20
Successfully created the directory data/raw/comments/RedPillNonMonogamy-allcomments-2020-08-20
Successfully created the direc

In [10]:
# Show the value most recently assigned to `path`
path

'/data/raw/comments/mensrightslaw-allcomments-2020-08-20'

In [121]:
# Change working directory to external drive
# NOTE(review): the "data/..." paths used in this notebook are relative, so they
# resolve against whatever the working directory is when each cell runs — this
# looks like it must run before those cells; confirm the intended cell order.
os.chdir("/Volumes/SAMSUNG/trpred")

from fsplit.filesplit import FileSplit

# Callback that reports a file's name, size in bytes, and line count
def func(f, s, c):
    """Print one summary line built from `f` (file), `s` (size) and `c` (count)."""
    summary = "file: {0}, size: {1}, count: {2}".format(f, s, c)
    print(summary)

In [27]:
## Step 3: Split files into corresponding folder

# Value passed to FileSplit as `splitsize`
# (presumably a size in bytes, i.e. ~15 MB per piece — confirm against the fsplit docs)
SPLIT_SIZE = 15000000

for i in filenames:
    file_path = "data/raw/comments/" + i + ".json"
    folder_path = "data/raw/comments/" + i + "/"

    # Named `existing_parts` rather than `dir`, so the builtin dir() is not
    # shadowed for every cell that runs afterwards in this kernel.
    existing_parts = os.listdir(folder_path)

    # If folder is empty (i.e. file hasn't been split yet)...
    if not existing_parts:

        # ...then split file, reporting each piece through the `func` callback
        fs = FileSplit(file = file_path, splitsize = SPLIT_SIZE, output_dir = folder_path)

        fs.split(callback = func)


In [53]:
# Now clean every small file for all subreddit folders

def list_subreddit_folders(root = "data/raw/comments"):
    """Return the immediate subdirectories of `root` as sorted paths.

    Only the first level below `root` is listed, so a nested directory is
    never mistaken for a subreddit folder, and the result is sorted so that
    positional slices of it are reproducible between runs. Returns an empty
    list when `root` does not exist.
    """
    # The first tuple yielded by os.walk describes `root` itself; the default
    # covers a missing directory, for which os.walk yields nothing.
    top, dirnames, _ = next(os.walk(root), (root, [], []))
    return sorted(os.path.join(top, name) for name in dirnames)


# Get folder names
subreddit_folders = list_subreddit_folders()


In [122]:
# Inspect the list of subreddit folder paths
subreddit_folders

['data/raw/comments/AskFeminists-allcomments-2020-08-20',
 'data/raw/comments/Egalitarianism-allcomments-2020-08-20',
 'data/raw/comments/EthnicRedPill-allcomments-2020-08-20',
 'data/raw/comments/FeMRADebates-allcomments-2020-08-20',
 'data/raw/comments/GEOTRP-allcomments-2020-08-20',
 'data/raw/comments/IncelsWithoutHate-allcomments-2020-08-20',
 'data/raw/comments/MGTOW-allcomments-2020-08-20',
 'data/raw/comments/MRActivism-allcomments-2020-08-21',
 'data/raw/comments/MensRights-allcomments-2020-08-20',
 'data/raw/comments/NOMAAM-allcomments-2020-08-21',
 'data/raw/comments/PurplePillDebate-allcomments-2020-08-21',
 'data/raw/comments/RedPillLit-allcomments-2020-08-20',
 'data/raw/comments/RedPillNonMonogamy-allcomments-2020-08-20',
 'data/raw/comments/RedPillParenting-allcomments-2020-08-20',
 'data/raw/comments/RedPillWorkplace-allcomments-2020-08-20',
 'data/raw/comments/TRPOffTopic-allcomments-2020-08-20',
 'data/raw/comments/TRPmemes-allcomments-2020-08-20',
 'data/raw/comment

In [142]:
# Clean every folder from position 29 of the list onward
for folder in subreddit_folders[29:]:
    # Pass in folder name to method
    clean_comments(folder)
    print(folder + " complete")

## TODO: go back to the r/TRP raw comments and find out why there was a UnicodeDecodeError
## (same for r/seduction)

data/raw/comments/thankTRP-allcomments-2020-08-20 complete
data/raw/comments/theRedPillLeft-allcomments-2020-08-20 complete


In [141]:
# Show the folders from position 29 of the list onward
subreddit_folders[29:]

['data/raw/comments/thankTRP-allcomments-2020-08-20',
 'data/raw/comments/theRedPillLeft-allcomments-2020-08-20']