In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import math
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict



In [2]:
# Reading the data
dataset = pd.read_json('data/renttherunway_final_data.json.gz', lines=True)
dataset = dataset.dropna()
dataset = dataset.drop(dataset[dataset['rented for']== "party: cocktail"].index)

In [3]:
# Converting the data into list of dictionaries
data = pd.DataFrame(dataset).to_dict('record')

for d in data:
    d['weight'] = int(d['weight'].split('lbs')[0])
    if(int(d['height'].split(' ')[1].split('"')[0])<10):
        height2 = int(d['height'].split(' ')[1].split('"')[0])*10
    else:
        height2 = int(d['height'].split(' ')[1].split('"')[0])
    d['height'] = int(d['height'][0][0])*100+height2
    
# Converting categroies type to one hot
catogeries = ['rented for','body type']
for cat in catogeries:
    categories_list = defaultdict(int)
    for d in data:
        categories_list[d[cat]] += 1
        
    categories_id = defaultdict(int)

    i = 0
    for cID in  categories_list:
        categories_id[cID] = i
        i+=1
    for d in data:
        f = [0]*len(categories_list)
        f[categories_id[d[cat]]] = 1
        d[cat] = f[:len(categories_list)-1]

  


In [4]:
# Converting the data into list of dictionaries
train_data = data[0:int(0.8*len(data))]
test_data = data[int(0.8*len(data)):]

In [5]:
train_data[0]

{'fit': 'fit',
 'user_id': 420272,
 'bust size': '34d',
 'item_id': 2260466,
 'weight': 137,
 'rating': 10.0,
 'rented for': [1, 0, 0, 0, 0, 0, 0],
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': [1, 0, 0, 0, 0, 0],
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': 580,
 'size': 14,
 'age': 28.0,
 'review_date': 'April 20, 2016'}

In [6]:
# Extracting features from the data
def feature(d):
    f = [1, d['rating'], len(d['review_text']), len(d['review_summary']), d['weight'], d['height'], d['bust size'][0:2]] + d['rented for'] + d['body type']
    return f

In [7]:
# Generating train features
X_train = [feature(d) for d in train_data]
y_train = [d['fit'] for d in train_data]

In [8]:
sampling_strategy = "not majority"
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [9]:
# pipe = make_pipeline(StandardScaler(), MinMaxScaler(), LogisticRegression(random_state=0, multi_class='multinomial', class_weight='balanced'))

pipe = make_pipeline(StandardScaler(), MinMaxScaler(), RandomForestClassifier(criterion='entropy', random_state=42, class_weight='balanced'))

clf = pipe.fit(X_train, y_train)

In [10]:
y_pred_train = clf.predict(X_train)

# Measuring performace of train data
train_accuracy = accuracy_score(y_train, y_pred_train)
train_f1_score = f1_score(y_train, y_pred_train, average='weighted')
train_precision_score = precision_score(y_train, y_pred_train, average='weighted')
train_recall_score = recall_score(y_train, y_pred_train, average='weighted')

cm = confusion_matrix(y_train, y_pred_train)
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]
train_g_mean = math.sqrt(TP*TN*1.0/((TP+FN)*(TN+FP)))

print('Train Accuracy:', train_accuracy)
print('Train F1 Score:', train_f1_score)
print('Train Precision Score:', train_precision_score)
print('Train Recall Score:', train_recall_score)
print('Train G-Mean Score:', train_g_mean)

Train Accuracy: 0.9997352780434485
Train F1 Score: 0.9997353075027428
Train Precision Score: 0.9997353948668175
Train Recall Score: 0.9997352780434485
Train G-Mean Score: 0.9998706640490908


In [11]:
# Generating test features
X_test = [feature(d) for d in test_data]
y_test = [d['fit'] for d in test_data]
y_pred_test = clf.predict(X_test)

# Measuring performace of test data
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')
test_precision_score = precision_score(y_test, y_pred_test, average='weighted')
test_recall_score = recall_score(y_test, y_pred_test, average='weighted')

cm = confusion_matrix(y_test, y_pred_test)
TN = cm[0][0]
FN = cm[1][0]
TP = cm[1][1]
FP = cm[0][1]
test_g_mean = math.sqrt(TP*TN*1.0/((TP+FN)*(TN+FP)))

print('Test Accuracy:', test_accuracy)
print('Test F1 Score:', test_f1_score)
print('Test Precision Score:', test_precision_score)
print('Test Recall Score:', test_recall_score)
print('Test G-Mean Score:', test_g_mean)

Test Accuracy: 0.72854898210138
Test F1 Score: 0.6468524837691504
Test Precision Score: 0.6336589504671043
Test Recall Score: 0.72854898210138
Test G-Mean Score: 0.2355088173678689
