#### Split main CSV into 5 folds (for all confidence values). Each fold has a train, validation (for early stopping), and test set.

In [1]:
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import numpy as np


def split_csv(train_ids, val_ids, test_ids, fold_number, out_path):
    """
    Splits the DataFrame (which corresponds to a certain confidence value) on the input train, val, and test IDs.
    Creates three CSVs (in out_path) that correspond to the given fold_number.
    """
    train_df = DF.loc[DF['reply_id'].isin(train_ids)]
    print("Train DF: Shape = {} | Unpalatable Ratio = {}".format(train_df.shape, check_ratio(train_df)))
    val_df = DF.loc[DF['reply_id'].isin(val_ids)]
    print("Validation DF: Shape = {} | Unpalatable Ratio = {}".format(val_df.shape, check_ratio(val_df)))
    test_df = DF.loc[DF['reply_id'].isin(test_ids)]
    print("Test DF: Shape = {} | Unpalatable Ratio = {}".format(test_df.shape, check_ratio(test_df)))
    
    train_df.to_csv(out_path+'train_data_fold_'+str(fold_number)+'.csv', line_terminator='\n', index=False)
    val_df.to_csv(out_path+'val_data_fold_'+str(fold_number)+'.csv', line_terminator='\n', index=False)
    test_df.to_csv(out_path+'test_data_fold_'+str(fold_number)+'.csv', line_terminator='\n', index=False)
    
    
def check_ratio(df):
    """
    Returns % unpalatable questions in the input DataFrame
    """
    return df['label'].value_counts().to_dict()['yes_unpalatable'] / df.shape[0]

In [2]:
N_SPLITS = 5
DATA_PATH = '/path/unpalatable-questions/crowdsourcing/annotations/annotations_UQ.csv'

SEED = 42 # random seed for splits (same across all experiments)

In [3]:
for conf in [0.6, 0.8, 1.0]:
    DF = pd.read_csv(DATA_PATH, lineterminator='\n')
    DF = DF.loc[DF['confidence']>=conf] # filter by confidence
    print("\n\nFor {} confidence, shape is {}".format(conf, DF.shape))
    
    X = DF['reply_id'].tolist()
    y = []
    for i in DF['label'].tolist():
        if i == 'yes_unpalatable': y.append(1)
        elif i == 'not_unpalatable': y.append(0)
        else: print("CodingError: ", i)
    X = np.array(X)
    y = np.array(y)
    print(len(X), len(y))
    
    
    out_path = '/path/unpalatable-questions/data-folds/confidence-'+str(int(conf*100))+'/'
    skf = StratifiedKFold(N_SPLITS, random_state=SEED)

    fold_number = 1
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index] # Don't need y_test because test-set is fixed
        print("\nBIG X_train: ", len(X_train))
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train,
                                                         random_state=SEED)
        print("X_train: ", len(X_train), "X_val: ", len(X_val), "X_test: ", len(X_test))
        
        split_csv(X_train, X_val, X_test, fold_number, out_path)
        fold_number += 1



For 0.6 confidence, shape is (10909, 8)
10909 10909

BIG X_train:  8726
X_train:  7853 X_val:  873 X_test:  2183
Train DF: Shape = (7853, 8) | Unpalatable Ratio = 0.17572902075639882
Validation DF: Shape = (873, 8) | Unpalatable Ratio = 0.17525773195876287
Test DF: Shape = (2183, 8) | Unpalatable Ratio = 0.17590471827759963

BIG X_train:  8726
X_train:  7853 X_val:  873 X_test:  2183
Train DF: Shape = (7853, 8) | Unpalatable Ratio = 0.17572902075639882
Validation DF: Shape = (873, 8) | Unpalatable Ratio = 0.17525773195876287
Test DF: Shape = (2183, 8) | Unpalatable Ratio = 0.17590471827759963

BIG X_train:  8728
X_train:  7855 X_val:  873 X_test:  2181
Train DF: Shape = (7855, 8) | Unpalatable Ratio = 0.1758115849777212
Validation DF: Shape = (873, 8) | Unpalatable Ratio = 0.17525773195876287
Test DF: Shape = (2181, 8) | Unpalatable Ratio = 0.1756075194864741

BIG X_train:  8728
X_train:  7855 X_val:  873 X_test:  2181
Train DF: Shape = (7855, 8) | Unpalatable Ratio = 0.1758115849777

- Ratio is preserved across the folds/CSVs for all confidence values:

In [1]:
import os
import pandas as pd

N_SPLITS = 5
for conf in [0.6, 0.8, 1.0]:
    print("\nConfidence = {}".format(conf))
    path = '/path/unpalatable-questions/data-folds/confidence-'+str(int(conf*100))+'/'
    for fold_number in range(1,N_SPLITS+1):
        df1 = pd.read_csv(path+'train_data_fold_'+str(fold_number)+'.csv', lineterminator='\n')
        df2 = pd.read_csv(path+'val_data_fold_'+str(fold_number)+'.csv', lineterminator='\n')
        df3 = pd.read_csv(path+'test_data_fold_'+str(fold_number)+'.csv', lineterminator='\n')
        df = pd.concat([df1, df2, df3])
        print("Fold #{}, the combined shape of train+val+test is {}".format(fold_number, df.shape))


Confidence = 0.6
Fold #1, the combined shape of train+val+test is (10909, 8)
Fold #2, the combined shape of train+val+test is (10909, 8)
Fold #3, the combined shape of train+val+test is (10909, 8)
Fold #4, the combined shape of train+val+test is (10909, 8)
Fold #5, the combined shape of train+val+test is (10909, 8)

Confidence = 0.8
Fold #1, the combined shape of train+val+test is (8706, 8)
Fold #2, the combined shape of train+val+test is (8706, 8)
Fold #3, the combined shape of train+val+test is (8706, 8)
Fold #4, the combined shape of train+val+test is (8706, 8)
Fold #5, the combined shape of train+val+test is (8706, 8)

Confidence = 1.0
Fold #1, the combined shape of train+val+test is (5735, 8)
Fold #2, the combined shape of train+val+test is (5735, 8)
Fold #3, the combined shape of train+val+test is (5735, 8)
Fold #4, the combined shape of train+val+test is (5735, 8)
Fold #5, the combined shape of train+val+test is (5735, 8)


#### Convert label to one-hot vector

In [1]:
import pandas as pd
import os

def convert_to_onehot(fname, dir_path):
    """
    Converts the given CSV's DataFrame to have one-hot labels, and writes it to a new CSV in the same directory.
    """
    df = pd.read_csv(dir_path+fname, lineterminator='\n')

    new_df = pd.concat([df, pd.get_dummies(df['label'])],axis=1)
    new_df.drop('label', axis=1, inplace=True)
    
    new_fname = fname.split('.')[0]+'_OneHot.csv'
    new_df.to_csv(dir_path+new_fname, line_terminator='\n', index=False)

In [2]:
main_dir = '/path/unpalatable-questions/data-folds/'

for directory in os.listdir(main_dir):
    inner_dir = main_dir+directory+'/'
    for fname in os.listdir(inner_dir):
        convert_to_onehot(fname, inner_dir)

In [3]:
# Random check:
blah = pd.read_csv('/path/unpalatable-questions/data-folds/confidence-60/\
test_data_fold_2_OneHot.csv', lineterminator='\n')
print(blah.shape)
blah.head()

(2183, 9)


Unnamed: 0,question,reply_text,comment_text,comment_id,reply_id,subreddit,confidence,not_unpalatable,yes_unpalatable
0,Why do you think the establishment is fighting...,Why do you think the establishment is fighting...,Having a President with balls is going to be a...,d18sfl1,d192jvs,The_Donald,1.0,1,0
1,Isn't that what we all mean?,Isn't that what we all mean?,That's what he meant with 'lovers'.,czq7ogg,czq9ion,ImGoingToHellForThis,1.0,1,0
2,thats not a thing?,What..... thats not a thing?,doesnt go to anything.,czxdbo5,czxe2kq,ImGoingToHellForThis,1.0,1,0
3,"Ugh, right?","Ugh, right? That's why I'm gonna cheer for Tea...",Having to root for Patrick Kane makes me feel ...,d29m4ll,d29m98c,hockey,1.0,1,0
4,Could have just been a sweet dad joke?,Could have just been a sweet dad joke?,Best personal take away here is that I and my ...,czv9g94,czvcwl8,politics,1.0,1,0


In [4]:
# Random check:
blah = pd.read_csv('/path/unpalatable-questions/data-folds/confidence-100/\
train_data_fold_5_OneHot.csv', lineterminator='\n')
print(blah.shape)
blah.head()

(4130, 9)


Unnamed: 0,question,reply_text,comment_text,comment_id,reply_id,subreddit,confidence,not_unpalatable,yes_unpalatable
0,Is high score bad?,Is high score bad? haha,He scored 14 goals!? He broke the 3 goal limit...,czy0u0n,czykv63,sports,1.0,1,0
1,The Cavs or all of us?,The Cavs or all of us?,He's gotta be trolling at this point,d18cwr9,d18ebdl,nba,1.0,1,0
2,"I drive a cobalt, am I going to get blue hair?","I drive a cobalt, am I going to get blue hair?",I've never associated blue hair with cobalt ex...,d1ox7zz,d1q5sa4,ImGoingToHellForThis,1.0,1,0
3,is that what we are calling the Batum signing?,is that what we are calling the Batum signing?,I'm scared of the team who hasn't won a playof...,d22tr7y,d23w88k,nba,1.0,1,0
4,Isn't that only the median income because of t...,Isn't that only the median income because of t...,"Well to be fair, it isn't that uncommon. Media...",d127yb1,d12cikg,cringepics,1.0,1,0


# fin.