In [1]:
import pandas as pd
import json
from random import randrange
import random

In [2]:
def ahole_binary(flag):
    if flag == 'Not the A-hole':
        return 0
    else:
        return 1

In [3]:
#fn: Randomly split dataset into training and test:
def split(sample_data):
    #Shuffle samples
    randomized = sample_data.sample(frac=1, random_state=44074)
    num_samples = randomized.shape[0]
    #percent training data
    percent_training = .80
    percent_testing = .20
    num_training = int(num_samples * percent_training)
    train = randomized.iloc[:num_training]
    test = randomized.iloc[num_training:, :]
    trainingX = train.drop("link_flair_text", axis=1)
    trainingy = train.drop("self_text", axis=1)
    testingX = test.drop("link_flair_text", axis=1)
    testingy = test.drop("self_text", axis=1)
    print(sample_data.size)
    print(len(trainingX))
    print(len(trainingy))
    print(len(testingX))
    print(len(testingy))

    return trainingX, trainingy, testingX, testingy

In [4]:
import os

import shutil
def new_txts(testx, testy, path):
    #fn: Reformat data into folders
    for file in range(len(testx)):
        filename = 'sample' + '_'+ str(file) + ".txt"

        if testy.iloc[file].loc['link_flair_text'] == 'Asshole':
            destination = f'../{path}/pos/{filename}'
        else:
            destination = f'../{path}/neg/{filename}'

        with open(destination, "w+") as out_file:
            out_file.write(testx.iloc[file].loc['self_text'])     

In [5]:
training_df = pd.read_csv('../data_raw/standardized/binary_train.csv')
testing_df = pd.read_csv('../data_raw/standardized/binary_test.csv')

In [6]:
# remerge these two since the rest of the script handles train test splitting and already and train test splitting on an already split training set will only reduce the training size
df = pd.concat([training_df, testing_df])

In [7]:
# verify that the remerge does not duplicate posts
df.value_counts('post_id')

post_id
10002pk    1
p7o5lw     1
p7ox8r     1
p7oqel     1
p7omz0     1
          ..
f0lfl3     1
f0ldnr     1
f0lc29     1
f0lb6a     1
zzzyio     1
Name: count, Length: 382046, dtype: int64

In [8]:
sample_data = df[df['link_flair_text'].apply(lambda x: x in ['Not the A-hole', 'Asshole'])][['link_flair_text', 'self_text']]

In [9]:
sample_data.head()

Unnamed: 0,link_flair_text,self_text
0,Not the A-hole,So in accordance with the increasing awareness...
1,Not the A-hole,"My aunt is going for surgery today, so yesterd..."
2,Asshole,My daughter takes commissions from people some...
3,Not the A-hole,I was in a local cafe waiting for a smoothie I...
4,Not the A-hole,"We do not get along. Days before the surgery, ..."


In [10]:
# verify that, as we thought, ahole and not the ahole are out of balance
sample_data.value_counts('link_flair_text')

link_flair_text
Not the A-hole    303555
Asshole            78491
Name: count, dtype: int64

In [11]:
ahole_df = sample_data[sample_data['link_flair_text'] == 'Asshole'].reset_index(drop=True)
nta_df = sample_data[sample_data['link_flair_text'] == 'Not the A-hole'].reset_index(drop=True)

In [12]:
unbalanced_sample_data = pd.concat([ahole_df, nta_df])

In [13]:
# select a random subset of NTA rows equal to the total number of ahole rows, in order to balance the dataset 50/50
sample_rows = random.sample(list(nta_df.index), len(ahole_df))

balanced_nta_df = nta_df.iloc[sample_rows].reset_index(drop=True)

# verify that the dataset is now balanced
if len(balanced_nta_df) != len(ahole_df):
    raise ValueError('Dataset imbalance')

balanced_sample_data = pd.concat([ahole_df, balanced_nta_df])

In [14]:
#Split sample_data into testing and training sets
trainingX, trainingy, testingX, testingy = split(unbalanced_sample_data)
#Ensure proper format
trainingX.head()
#trainingy.head()

764092
305636
305636
76410
76410


Unnamed: 0,self_text
83574,"I (24F) have nine siblings, 14, 15, 17, 20, 23..."
62158,I live in a house that is owned by my gf and o...
43079,I (15f) have recently started therapy again. T...
75327,"Firstly, I want to apologize for formatting si..."
75458,"Ok so I'm not sure the title is the best, so l..."


In [15]:
new_txts(pd.concat([testingX, trainingX]), pd.concat([testingy, trainingy]), 'data_transformer/balanced')

In [16]:
trainingX, trainingy, testingX, testingy = split(unbalanced_sample_data)

764092
305636
305636
76410
76410


In [18]:
new_txts(pd.concat([testingX, trainingX]), pd.concat([testingy, trainingy]), 'data_transformer/unbalanced')

In [None]:
#Create 'test' and 'train' folders in 'data' that each contain 'pos' and 'neg' txt files 
new_txts(testingX, testingy, 'test', 'balanced')
new_txts(trainingX, trainingy, 'train', 'balanced')

313964
125585
125585
31397
31397


In [31]:
#Split sample_data into testing and training sets
trainingX, trainingy, testingX, testingy = split(unbalanced_sample_data)
#Ensure proper format
trainingX.head()
#trainingy.head()

#Create 'test' and 'train' folders in 'data' that each contain 'pos' and 'neg' txt files 
new_txts(testingX, testingy, 'test', 'unbalanced')
new_txts(trainingX, trainingy, 'train', 'unbalanced')

764092
305636
305636
76410
76410


In [32]:
#Check that the number of files in each directory is correct
#If there is 80% training data, there should be 76410 instances in test and 305636 instances in train
import os
for dirpath, dirnames, filenames in os.walk('../data_formatted/unbalanced/test/neg'):
        num_files = len(filenames)
        print(f"Folder: {dirpath}, Number of files: {num_files}")
for dirpath, dirnames, filenames in os.walk('../data_formatted/unbalanced/test/pos'):
        num_files = len(filenames)
        print(f"Folder: {dirpath}, Number of files: {num_files}")
for dirpath, dirnames, filenames in os.walk('../data_formatted/unbalanced/train/neg'):
        num_files = len(filenames)
        print(f"Folder: {dirpath}, Number of files: {num_files}")
for dirpath, dirnames, filenames in os.walk('../data_formatted/unbalanced/train/pos'):
        num_files = len(filenames)
        print(f"Folder: {dirpath}, Number of files: {num_files}")

Folder: ../data_formatted/unbalanced/test/neg, Number of files: 60691
Folder: ../data_formatted/unbalanced/test/pos, Number of files: 15720
Folder: ../data_formatted/unbalanced/train/neg, Number of files: 242864
Folder: ../data_formatted/unbalanced/train/pos, Number of files: 62773
