In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import math
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm



In [2]:
# Load the line-delimited, gzip-compressed JSON file and drop every row
# that has at least one missing value.
dataset = pd.read_json('data/renttherunway_final_data.json.gz', lines=True).dropna()

In [3]:
# Shuffle once with a fixed seed, then cut the shuffled frame by position into
# 70% train / 15% validation / 15% test.
shuffled_dataset = dataset.sample(frac=1, random_state=42)
train_end = int(.7 * len(dataset))
validation_end = int(.85 * len(dataset))

# Positional slicing replaces np.split(DataFrame, ...): np.split routes through
# DataFrame.swapaxes, which recent pandas releases deprecate. The cut points are
# the same, so the three partitions contain the same rows.
train_data = shuffled_dataset.iloc[:train_end]
validation_data = shuffled_dataset.iloc[train_end:validation_end]
test_data = shuffled_dataset.iloc[validation_end:]

In [4]:
# Converting each split into a list of dictionaries (one dict per row).
# The orient must be spelled 'records': the abbreviated form 'record' only
# produced a FutureWarning on older pandas and is rejected by pandas 2.x.
train_data = pd.DataFrame(train_data).to_dict(orient='records')
validation_data = pd.DataFrame(validation_data).to_dict(orient='records')
test_data = pd.DataFrame(test_data).to_dict(orient='records')

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [5]:
# Extracting features from the data
def feature(d):
    """Build the feature vector for a single record.

    Args:
        d: mapping with the keys 'rating', 'review_text' and 'review_summary'.

    Returns:
        A four-element list: a constant 1, the rating as stored in the
        record, the length of the review text, and the length of the
        review summary.
    """
    rating = d['rating']
    text_length = len(d['review_text'])
    summary_length = len(d['review_summary'])
    return [1, rating, text_length, summary_length]

In [6]:
# Build the training labels and feature rows from the training records
y_train = [record['fit'] for record in train_data]
X_train = [feature(record) for record in train_data]

# Print how many training labels fall in each class, in the order
# large, fit, small (one number per line).
for fit_label in ('large', 'fit', 'small'):
    c1 = sum(1 for label in y_train if label == fit_label)
    print(c1)

13303
75464
13699


In [7]:
# Randomly oversample every class except the majority one.
# random_state is pinned so the resampled set is identical on every run;
# without it the duplicated rows change from run to run.
sampling_strategy = "not majority"
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

# NOTE(review): X_res / y_res are not used by the model-fitting cell in this
# notebook, which trains on X_train / y_train with class_weight='balanced'.
# Confirm whether the oversampled data was meant to be used for training.

# Print the per-class label counts after resampling, in the order
# large, fit, small (one number per line).
for fit_label in ('large', 'fit', 'small'):
    c1 = sum(1 for label in y_res if label == fit_label)
    print(c1)

75464
75464
75464


In [None]:
# Estimator used for this run: an SVM with one-vs-one decision function and
# balanced class weights.
# Alternative kept for reference (same scalers, different final step):
#   LogisticRegression(random_state=0, multi_class='multinomial', class_weight='balanced')
classifier = svm.SVC(
    random_state=0,
    decision_function_shape='ovo',
    class_weight='balanced',
)

# Both scalers run ahead of the classifier, in this order.
pipe = make_pipeline(StandardScaler(), MinMaxScaler(), classifier)

# Fit scalers and classifier on the training data; fit() returns the pipeline.
clf = pipe.fit(X_train, y_train)

In [None]:
y_pred_train = clf.predict(X_train)

# Measuring performance of train data
train_accuracy = accuracy_score(y_train, y_pred_train)
train_f1_score = f1_score(y_train, y_pred_train, average='weighted')
train_precision_score = precision_score(y_train, y_pred_train, average='weighted')
train_recall_score = recall_score(y_train, y_pred_train, average='weighted')

# G-mean over all classes: the geometric mean of the per-class recalls.
# Row i of the confusion matrix holds the samples whose true label is class i,
# so recall_i = cm[i][i] / sum(cm[i]). Reading only cm[0..1][0..1] as
# TN/FN/TP/FP is valid for two classes only; with more classes it silently
# ignores every class after the second. For exactly two classes this formula
# reduces to sqrt(TP*TN / ((TP+FN)*(TN+FP))).
cm = confusion_matrix(y_train, y_pred_train)
class_support = cm.sum(axis=1)
per_class_recall = np.divide(
    cm.diagonal(),
    class_support,
    out=np.zeros(len(class_support), dtype=float),
    where=class_support > 0,  # a class with no true samples gets recall 0
)
train_g_mean = float(np.prod(per_class_recall) ** (1.0 / len(per_class_recall)))

print('Train Accuracy:', train_accuracy)
print('Train F1 Score:', train_f1_score)
print('Train Precision Score:', train_precision_score)
print('Train Recall Score:', train_recall_score)
print('Train G-Mean Score:', train_g_mean)

In [None]:
# Build test labels and feature rows, then predict with the fitted pipeline
y_test = [record['fit'] for record in test_data]
X_test = [feature(record) for record in test_data]
y_pred_test = clf.predict(X_test)

# Data Analysis / Data Statistics - To be used for question-1
# Distinct labels present in the ground truth and in the predictions
print(set(y_test))
print(set(y_pred_test))

# Number of predictions, then how many predictions are 'fit',
# then how many true labels are 'fit'
print(len(y_pred_test))
count = sum(1 for predicted in y_pred_test if predicted == 'fit')
print(count)

count = sum(1 for actual in y_test if actual == 'fit')
print(count)

# Measuring performance of test data
test_accuracy = accuracy_score(y_test, y_pred_test)
test_f1_score = f1_score(y_test, y_pred_test, average='weighted')
print('Test Accuracy:', test_accuracy)
print('Test F1 Score:', test_f1_score)

In [None]:
# Generating validation features
X_val = [feature(d) for d in validation_data]
y_val = [d['fit'] for d in validation_data]
y_pred_val = clf.predict(X_val)

# Measuring performance of validation data
val_accuracy = accuracy_score(y_val, y_pred_val)
val_f1_score = f1_score(y_val, y_pred_val, average='weighted')
print('Validation Accuracy:', val_accuracy)
# Report the validation F1 computed above (not the test-set score).
print('Validation F1 Score:', val_f1_score)