In [3]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
import numpy
import random
import gzip
import math
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from collections import Counter

In [4]:
# Load the cleaned dataset from the gzip archive.
# Every line of the file is parsed as JSON; only the first parsed line is
# kept below (presumably the file holds one JSON array on a single line --
# TODO confirm against the file that produced it).
# The `with` block guarantees the file handle is closed even if parsing fails.
with gzip.open("cleaned_dataset.json.gz") as f:
    dataset = [json.loads(line) for line in f]
dataset = dataset[0]

In [5]:
# Display the first record to inspect which fields each entry carries.
dataset[0]

{'fit': 'fit',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': 137,
 'rating': 10,
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': 68,
 'size': 14,
 'age': 28,
 'fit_numeric': 0}

In [6]:
import sklearn
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for testing; the fixed seed keeps the
# partition identical from one run to the next.
dataTrain, dataTest = train_test_split(dataset, random_state=42, test_size=0.2)

# Report the size of each partition, one per line.
print(len(dataTrain), len(dataTest), sep="\n")

117104
29277


In [7]:
# one hot encoding body type

# Collect the distinct body types seen in the training split, in first-seen
# order. dict.fromkeys de-duplicates like set() but keeps insertion order, so
# the one-hot column layout no longer depends on per-process string hash
# randomization and stays the same across kernel restarts.
unique_body_types = list(dict.fromkeys(d['body type'] for d in dataTrain))

# Map each body type to a 0/1 vector with a single 1 at that type's index.
body_type_to_onehot = {
    bt: [1 if i == idx else 0 for i in range(len(unique_body_types))]
    for idx, bt in enumerate(unique_body_types)
}

In [8]:
# using the frequency of each item id as input

# Tally how often each item id occurs in the training split.
item_id_counts = Counter(map(lambda rec: rec['item_id'], dataTrain))
total_items = sum(item_id_counts.values())

# Relative frequency of every item id: its tally divided by the total tally.
item_id_to_frequency = {
    item_id: item_id_counts[item_id] / total_items
    for item_id in item_id_counts
}


In [9]:
def feat1(datum):
    """Build the feature vector for a single record.

    Layout of the returned list:
    [1, height, weight, size, <body-type one-hot...>, item-id frequency]

    A body type missing from `body_type_to_onehot` yields an all-zero
    one-hot segment, and an item id missing from `item_id_to_frequency`
    yields a frequency of 0.
    """
    body_vec = body_type_to_onehot.get(datum['body type'], [0] * len(unique_body_types))
    item_freq = item_id_to_frequency.get(datum['item_id'], 0)

    # Constant term first, then the raw numeric fields in a fixed order.
    features = [1]
    for field in ('height', 'weight', 'size'):
        features.append(datum[field])

    features.extend(body_vec)
    features.append(item_freq)
    return features

In [10]:
# Feature matrices: one feat1 vector per record in each split.
X_train = list(map(feat1, dataTrain))
X_test = list(map(feat1, dataTest))

# Targets: the 'fit_numeric' field of each record.
y_train = [record['fit_numeric'] for record in dataTrain]
y_test = [record['fit_numeric'] for record in dataTest]


In [11]:
# Fit a logistic-regression classifier on the training features.
# The saga solver shuffles the data, so random_state is pinned (same seed as
# the train/test split) to make the fitted coefficients reproducible.
# NOTE(review): scikit-learn documents that saga only converges quickly when
# features share a similar scale; height/weight/size are passed in unscaled
# -- consider standardizing them.
# NOTE(review): feat1 already prepends a constant 1 while the estimator also
# fits its own intercept by default -- confirm that the duplication is intended.
model = LogisticRegression(max_iter=500, solver='saga', verbose=1, random_state=42)
model.fit(X_train, y_train)

Epoch 1, change: 1.00000000
Epoch 2, change: 0.08188559
Epoch 3, change: 0.05915669
Epoch 4, change: 0.04740498
Epoch 5, change: 0.04704948
Epoch 6, change: 0.04411341
Epoch 7, change: 0.04120418
Epoch 8, change: 0.03875913
Epoch 9, change: 0.03634364
Epoch 10, change: 0.03384640
Epoch 11, change: 0.03206968
Epoch 12, change: 0.03122196
Epoch 13, change: 0.03027993
Epoch 14, change: 0.02941278
Epoch 15, change: 0.02858526
Epoch 16, change: 0.02773625
Epoch 17, change: 0.02695197
Epoch 18, change: 0.02617101
Epoch 19, change: 0.02539001
Epoch 20, change: 0.02465991
Epoch 21, change: 0.02392798
Epoch 22, change: 0.02324242
Epoch 23, change: 0.02257550
Epoch 24, change: 0.02190583
Epoch 25, change: 0.02128039
Epoch 26, change: 0.02065488
Epoch 27, change: 0.02004769
Epoch 28, change: 0.01946661
Epoch 29, change: 0.01888221
Epoch 30, change: 0.01834202
Epoch 31, change: 0.01780334
Epoch 32, change: 0.01729904
Epoch 33, change: 0.01677432
Epoch 34, change: 0.01626876
Epoch 35, change: 0.015



In [12]:
# Predict on both splits with the fitted model.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Fraction of exactly matching labels on each split.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Training Accuracy", train_accuracy)
print("Test Accuracy", test_accuracy)

Training Accuracy 0.7349535455663342
Test Accuracy 0.7348430508590361
