In [57]:
import opendatasets as od
import kaggle
from os.path import exists
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split


In [42]:
# Directory holding the raw dataset files (checked by is_downloaded, read by load_original_data).
ORIGINAL_DATA_DIRECTORY = 'clothing-fit-dataset-for-size-recommendation'
# Directory where the processed CSV files are written and read back.
DATA_DIRECTORY = 'data'
# File names of the raw JSON-lines files inside ORIGINAL_DATA_DIRECTORY.
ORIGINAL_MODCLOTH_FILE = 'modcloth_final_data.json'
ORIGINAL_RENTTHERUNWAY_FILE = 'renttherunway_final_data.json'
# File names of the processed CSV files inside DATA_DIRECTORY.
PROCESSED_MODCLOTH_FILE = 'modcloth.csv'
PROCESSED_RENTTHERUNWAY_FILE = 'renttherunway.csv'


In [43]:
def download_dataset():
    """Fetch the clothing-fit dataset from Kaggle using opendatasets.

    The download may prompt for Kaggle credentials. Nothing is returned.
    """
    dataset_url = 'https://www.kaggle.com/datasets/rmisra/clothing-fit-dataset-for-size-recommendation?select=modcloth_final_data.json'
    od.download(dataset_url)

In [44]:
def is_downloaded(dataset = 'both'):
    """Report whether the raw file(s) for a dataset are present on disk.

    Args:
        dataset: 'modcloth', 'renttherunway', or 'both' (the default).

    Returns:
        True when the requested raw file exists under ORIGINAL_DATA_DIRECTORY;
        for 'both', True only when both raw files exist.

    Raises:
        ValueError: if ``dataset`` is not one of the accepted names.
    """
    def _raw_file_present(filename):
        # Raw files live directly inside the original download directory.
        return exists(f"{ORIGINAL_DATA_DIRECTORY}/{filename}")

    if dataset == 'both':
        return (_raw_file_present(ORIGINAL_MODCLOTH_FILE)
                and _raw_file_present(ORIGINAL_RENTTHERUNWAY_FILE))
    if dataset == 'modcloth':
        return _raw_file_present(ORIGINAL_MODCLOTH_FILE)
    if dataset == 'renttherunway':
        return _raw_file_present(ORIGINAL_RENTTHERUNWAY_FILE)
    raise ValueError(f"Unknown dataset {dataset}")

In [45]:
def load_original_data(dataset = 'renttherunway'):
    """Load one of the raw JSON-lines dataset files into a DataFrame.

    Args:
        dataset: 'renttherunway' (the default) or 'modcloth'.

    Returns:
        A pandas DataFrame read from the corresponding file in
        ORIGINAL_DATA_DIRECTORY.

    Raises:
        ValueError: if ``dataset`` is not one of the accepted names.
    """
    if dataset == 'renttherunway':
        return pd.read_json(f"{ORIGINAL_DATA_DIRECTORY}/{ORIGINAL_RENTTHERUNWAY_FILE}", lines=True)
    if dataset == 'modcloth':
        return pd.read_json(f"{ORIGINAL_DATA_DIRECTORY}/{ORIGINAL_MODCLOTH_FILE}", lines=True)
    # f-string so the offending value actually appears in the message.
    raise ValueError(f"unknown dataset {dataset}")


In [47]:
def reindex_column(df, column_name):
    """Replace the values of a column with consecutive integer ids.

    The frame is sorted by ``column_name``; equal adjacent values share an id
    and ids are handed out as 0, 1, 2, ... in sorted order. The original values
    are kept under ``<column_name>_original`` and the new ids are stored under
    ``column_name``, appended as the last column.

    Args:
        df: DataFrame containing ``column_name``. It is not modified.
        column_name: Name of the column to re-index.

    Returns:
        A new DataFrame sorted by the original column values, keeping the
        original index labels. An empty input yields an empty result.
    """
    result_df = df.sort_values(column_name)
    result_column = []
    current_new = 0
    previous = None
    for position, value in enumerate(result_df[column_name]):
        # Compare with the preceding sorted value by position. Looking up index
        # label 0 instead would fail on frames whose index has no such label
        # (or on empty frames). The first row always receives id 0.
        if position > 0 and value != previous:
            current_new += 1
        previous = value
        result_column.append(current_new)
    result_df = result_df.rename(columns={column_name: column_name + "_original"})
    result_df[column_name] = result_column
    return result_df

In [48]:
def prepare_renttherunway_df(original_df):
    """Select the modelling columns and re-index the id and label columns.

    The raw ``fit`` column is renamed to ``result``; then ``user_id``,
    ``item_id`` and ``result`` are each passed through ``reindex_column``
    in that order.

    Args:
        original_df: Raw Rent the Runway DataFrame.

    Returns:
        The prepared DataFrame, ordered by its index.
    """
    chosen_columns = ["result", "user_id", "item_id", "size", "review_date", "category"]
    renamed = original_df.rename(columns={"fit": "result"})
    prepared = renamed[chosen_columns]
    for column in ("user_id", "item_id", "result"):
        prepared = reindex_column(prepared, column)
    # reindex_column leaves rows sorted by the last re-indexed column; restore index order.
    return prepared.sort_index()

In [50]:
def preprocess_renttherunway_data():
    """Build the processed Rent the Runway CSV from the raw dataset.

    Downloads the raw dataset if its file is missing, prepares the frame with
    ``prepare_renttherunway_df`` and writes it (without the index) to
    ``DATA_DIRECTORY/PROCESSED_RENTTHERUNWAY_FILE``.
    """
    if not is_downloaded('renttherunway'):
        download_dataset()
    df = load_original_data('renttherunway')
    df = prepare_renttherunway_df(df)
    # to_csv does not create missing parent directories, so make sure the
    # output directory exists before writing.
    Path(DATA_DIRECTORY).mkdir(parents=True, exist_ok=True)
    df.to_csv(f"{DATA_DIRECTORY}/{PROCESSED_RENTTHERUNWAY_FILE}", index=False)
    

In [53]:
def get_processed_renttherunway_data():
    """Return the processed Rent the Runway data as a DataFrame.

    The processed CSV is generated first when it is not already on disk.
    """
    processed_csv = f"{DATA_DIRECTORY}/{PROCESSED_RENTTHERUNWAY_FILE}"
    already_built = exists(processed_csv)
    if not already_built:
        preprocess_renttherunway_data()
    return pd.read_csv(processed_csv)
        

In [56]:
# Load the processed Rent the Runway frame (built on first use if its CSV is missing) and display it.
df = get_processed_renttherunway_data()
df

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: Your Kaggle Key: Downloading clothing-fit-dataset-for-size-recommendation.zip to ./clothing-fit-dataset-for-size-recommendation


100%|██████████| 39.7M/39.7M [00:10<00:00, 3.88MB/s]





Unnamed: 0,result_old,user_id_old,item_id_old,size,review_date,category,user_id,item_id,result
0,fit,420272,2260466,14,"April 20, 2016",romper,44334,4396,0
1,fit,273551,153475,12,"June 18, 2013",gown,28835,65,0
2,fit,360448,1063761,4,"December 14, 2015",sheath,37976,1945,0
3,fit,909926,126335,8,"February 12, 2014",dress,96080,7,0
4,fit,151944,616682,12,"September 26, 2016",gown,15959,1032,0
...,...,...,...,...,...,...,...,...,...
192539,fit,66386,2252812,8,"May 18, 2016",jumpsuit,7026,4382,0
192540,fit,118398,682043,4,"September 30, 2016",dress,12494,1164,0
192541,fit,47002,683251,8,"March 4, 2016",dress,5019,1166,0
192542,fit,961120,126335,16,"November 25, 2015",dress,101534,7,0


In [58]:
def split_renttherunway_data(test_size=0.10, random_state=2022):
    """Split the processed Rent the Runway data into train and test CSV files.

    Args:
        test_size: Passed to ``train_test_split``; defaults to 0.10, the value
            that was previously hard-coded.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible; defaults to 2022, the previously hard-coded value.

    The two parts are written (without the index) to
    ``DATA_DIRECTORY/train_<file>`` and ``DATA_DIRECTORY/test_<file>``, where
    ``<file>`` is PROCESSED_RENTTHERUNWAY_FILE.
    """
    df = get_processed_renttherunway_data()
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    train.to_csv(f"{DATA_DIRECTORY}/train_{PROCESSED_RENTTHERUNWAY_FILE}", index=False)
    test.to_csv(f"{DATA_DIRECTORY}/test_{PROCESSED_RENTTHERUNWAY_FILE}", index=False)

In [60]:
def get_test_runttherunway_data():
    """Return the test split of the Rent the Runway data as a DataFrame.

    The split files are generated first when the test CSV is not on disk.
    NOTE: the function name spells "runttherunway"; it is kept as-is so
    existing callers keep working.
    """
    test_csv = f"{DATA_DIRECTORY}/test_{PROCESSED_RENTTHERUNWAY_FILE}"
    split_available = exists(test_csv)
    if not split_available:
        split_renttherunway_data()
    return pd.read_csv(test_csv)

def get_train_runttherunway_data():
    """Return the training split of the Rent the Runway data as a DataFrame.

    The split files are generated first when the train CSV is not on disk.
    NOTE: the function name spells "runttherunway"; it is kept as-is so
    existing callers keep working.
    """
    train_csv = f"{DATA_DIRECTORY}/train_{PROCESSED_RENTTHERUNWAY_FILE}"
    split_available = exists(train_csv)
    if not split_available:
        split_renttherunway_data()
    return pd.read_csv(train_csv)


In [61]:
# Display the training split; the split CSVs are generated on first use if the train file is missing.
get_train_runttherunway_data()

Unnamed: 0,result_old,user_id_old,item_id_old,size,review_date,category,user_id,item_id,result
0,fit,282796,914136,8,"December 10, 2014",dress,29816,1627,0
1,fit,297296,576000,15,"July 7, 2016",gown,31337,942,0
2,fit,679803,131117,3,"May 12, 2016",gown,71581,16,0
3,fit,771751,1261393,14,"September 23, 2015",dress,81370,2353,0
4,fit,984200,987536,7,"March 6, 2017",gown,103908,1775,0
...,...,...,...,...,...,...,...,...,...
173284,fit,262124,784810,14,"October 6, 2016",dress,27547,1368,0
173285,small,779536,1595305,16,"August 2, 2016",dress,82194,3051,2
173286,fit,617032,1780063,4,"September 16, 2016",dress,64950,3431,0
173287,fit,64058,2484886,8,"December 27, 2017",sweater,6778,4863,0


In [62]:
# Display the test split; the split CSVs are generated on first use if the test file is missing.
get_test_runttherunway_data()

Unnamed: 0,result_old,user_id_old,item_id_old,size,review_date,category,user_id,item_id,result
0,large,713854,265806,4,"March 5, 2017",maxi,75142,299,1
1,small,643892,1461623,12,"May 23, 2016",gown,67814,2777,2
2,large,874800,132738,9,"January 23, 2013",gown,92369,20,1
3,fit,999308,2586703,4,"May 17, 2016",romper,105490,5075,0
4,fit,304020,2249807,4,"September 17, 2017",top,32077,4376,0
...,...,...,...,...,...,...,...,...,...
19250,fit,161858,1229740,24,"April 7, 2015",gown,17012,2291,0
19251,fit,159693,704314,12,"March 28, 2017",dress,16788,1208,0
19252,fit,468727,135750,1,"November 10, 2013",shift,49374,26,0
19253,fit,147361,544038,16,"November 13, 2017",dress,15486,876,0
