In [46]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import math
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from itertools import islice

In [None]:
# Load the gzipped JSON-lines file and discard every row that has a missing value
dataset = pd.read_json('data/renttherunway_final_data.json.gz', lines=True).dropna()

In [None]:
# Convert the DataFrame into a list of per-row dictionaries.
# 'records' is the full orient name; the abbreviated 'record' was deprecated and
# is rejected by pandas >= 2.0.
data = dataset.to_dict('records')

In [None]:
# Parse the free-text 'weight' and 'height' fields into integers, in place.
# The parsing implies weight ends in 'lbs' and height is '<feet digit>... <inches>"'
# with a single space between the two parts.
for d in data:
    # Values that are already numeric are skipped so re-running this cell is harmless
    if isinstance(d['weight'], str):
        d['weight'] = int(d['weight'].split('lbs')[0])
    if isinstance(d['height'], str):
        feet = int(d['height'][0])
        inches = int(d['height'].split(' ')[1].split('"')[0])
        # Store total inches. The previous feet*100 + padded-inches encoding mapped
        # both 5' 1" and 5' 10" to 510 and was not monotonic in actual height.
        d['height'] = feet * 12 + inches

In [None]:
# Replace each categorical field with a one-hot list whose last indicator is dropped
catogeries = ['rented for', 'body type']
for cat in catogeries:
    # Tally the values; only the keys and their first-seen order are used below
    categories_list = defaultdict(int)
    for record in data:
        categories_list[record[cat]] += 1

    # Number the distinct values 0..k-1 in first-seen order
    categories_id = defaultdict(int)
    for position, value in enumerate(categories_list):
        categories_id[value] = position

    n_values = len(categories_list)
    for record in data:
        hot = categories_id[record[cat]]
        # k-1 indicators: the value numbered last is encoded as all zeros
        record[cat] = [1 if column == hot else 0 for column in range(n_values - 1)]

In [None]:
# train_data, validation_data, test_data = np.split(data(frac=1, random_state=42), 
#                                                   [int(.7*len(data)), int(.85*len(data))])

In [None]:
# Converting the data into list of dictionaries
# train_data = pd.DataFrame(train_data).to_dict('record')
# validation_data = pd.DataFrame(validation_data).to_dict('record')
# test_data = pd.DataFrame(test_data).to_dict('record')

In [None]:
# Split the records 70% / 15% / 15% into train / validation / test, in their current order.
# Cumulative boundaries are used so the three slices always partition `data`;
# truncating each share separately could leave a record unassigned
# (e.g. 5 records -> 3 + 0 + 1).
# NOTE(review): the records are not shuffled before splitting — confirm their order
# is not correlated with the label before relying on these splits.
total_count = len(data)
train_end = int(0.7 * total_count)
validation_end = int(0.85 * total_count)

train_data = data[:train_end]
validation_data = data[train_end:validation_end]
test_data = data[validation_end:]

In [None]:
# Build the feature vector for a single record
def feature(d):
    """Return the model input for record ``d`` as one flat list.

    Layout: a constant 1, the rating, the lengths of the review text and of
    the review summary, the height, the weight, followed by the
    'rented for' list and then the 'body type' list.
    """
    leading = [
        1,
        d['rating'],
        len(d['review_text']),
        len(d['review_summary']),
        d['height'],
        d['weight'],
    ]
    return leading + d['rented for'] + d['body type']

In [None]:
# Build the training feature rows and the matching 'fit' labels
X_train = list(map(feature, train_data))
y_train = [record['fit'] for record in train_data]

In [None]:
# Randomly oversample every class except the majority one.
# random_state is pinned so the resampled training set is identical on every run.
sampling_strategy = "not majority"
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=0)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [None]:
# pipe = make_pipeline(StandardScaler(), MinMaxScaler(), LogisticRegression(random_state=0, multi_class='multinomial', class_weight='balanced'))

# pipe = make_pipeline(svm.SVC(random_state=0, decision_function_shape='ovo', class_weight='balanced'))

# Scale the features, then fit a class-weighted random forest.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it raises an error.
pipe = make_pipeline(StandardScaler(), MinMaxScaler(),
                     RandomForestClassifier(random_state=0, n_estimators=10,
                                            criterion='entropy', class_weight='balanced',
                                            bootstrap=True, max_depth=65, max_features='sqrt'))

# Fit on the oversampled training set; X_res / y_res were previously computed
# but never used, so the oversampling step had no effect on the model.
clf = pipe.fit(X_res, y_res)

In [None]:
y_pred_train = clf.predict(X_train)

# Measuring performance on the train data
train_accuracy = accuracy_score(y_train, y_pred_train)
train_f1_score = f1_score(y_train, y_pred_train, average='weighted')
train_precision_score = precision_score(y_train, y_pred_train, average='weighted')
train_recall_score = recall_score(y_train, y_pred_train, average='weighted')

# G-mean = geometric mean of the per-class recalls (diagonal / row sum; rows are true labels).
# With two classes this equals the former sqrt(TPR * TNR); unlike the former 2x2
# indexing it uses every class when there are more than two.
cm = confusion_matrix(y_train, y_pred_train)
support = cm.sum(axis=1)
has_support = support > 0  # classes that only appear in the predictions have no recall
per_class_recall = cm.diagonal()[has_support] / support[has_support]
train_g_mean = float(np.prod(per_class_recall) ** (1.0 / len(per_class_recall)))

print('Train Accuracy:', train_accuracy)
print('Train F1 Score:', train_f1_score)
print('Train Precision Score:', train_precision_score)
print('Train Recall Score:', train_recall_score)
print('Train G-Mean Score:', train_g_mean)

In [None]:
# Generating test features
X_test = [feature(d) for d in test_data]
y_test = [d['fit'] for d in test_data]
y_pred_test = clf.predict(X_test)

# Measuring performance on the test data
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')
test_precision_score = precision_score(y_test, y_pred_test, average='weighted')
test_recall_score = recall_score(y_test, y_pred_test, average='weighted')

# G-mean = geometric mean of the per-class recalls (diagonal / row sum; rows are true labels).
# With two classes this equals the former sqrt(TPR * TNR); unlike the former 2x2
# indexing it uses every class when there are more than two.
cm = confusion_matrix(y_test, y_pred_test)
support = cm.sum(axis=1)
has_support = support > 0  # classes that only appear in the predictions have no recall
per_class_recall = cm.diagonal()[has_support] / support[has_support]
test_g_mean = float(np.prod(per_class_recall) ** (1.0 / len(per_class_recall)))
print('Test Accuracy:', test_accuracy)
print('Test F1 Score:', test_f1_score)
print('Test Precision Score:', test_precision_score)
print('Test Recall Score:', test_recall_score)
print('Test G-Mean Score:', test_g_mean)

In [None]:
# Generating validation features
X_val = [feature(d) for d in validation_data]
y_val = [d['fit'] for d in validation_data]
y_pred_val = clf.predict(X_val)

# Measuring performance on the validation data.
# Precision and recall are computed from the validation labels/predictions; they
# previously reused y_test / y_pred_test and so reported the test-set values.
val_accuracy = accuracy_score(y_val, y_pred_val)
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')
val_precision_score = precision_score(y_val, y_pred_val, average='weighted')
val_recall_score = recall_score(y_val, y_pred_val, average='weighted')

# G-mean = geometric mean of the per-class recalls (diagonal / row sum; rows are true labels).
# With two classes this equals the former sqrt(TPR * TNR); unlike the former 2x2
# indexing it uses every class when there are more than two.
cm = confusion_matrix(y_val, y_pred_val)
support = cm.sum(axis=1)
has_support = support > 0  # classes that only appear in the predictions have no recall
per_class_recall = cm.diagonal()[has_support] / support[has_support]
val_g_mean = float(np.prod(per_class_recall) ** (1.0 / len(per_class_recall)))
print('Validation Accuracy:', val_accuracy)
print('Validation F1 Score:', val_f1_score)
print('Validation Precision Score:', val_precision_score)
print('Validation Recall Score:', val_recall_score)
print('Validation G-Mean Score:', val_g_mean)