In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pickle
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb


In [2]:
def make_train_df(rating,clothes,user):
    """Build the training feature matrix, the target column and a fitted encoder.

    The ratings are inner-joined with the user table on 'R_id' and with the
    clothes table on 'image'.  A fixed list of user columns is dropped, the
    '선호여부' column is split off as the target, the columns from 'r_gender'
    through '분위기' are one-hot encoded, and every column from '멋있다' onwards
    is appended unchanged.

    Parameters
    ----------
    rating, clothes, user : pandas.DataFrame
        Tables to join; 'R_id' and 'image' must be usable as merge keys.

    Returns
    -------
    tuple
        ``(features, target, encoder)`` where ``encoder`` is the
        ``OneHotEncoder`` fitted on the categorical slice.
    """
    dropped_columns = ['스타일선호','mar','job','income','r_style1','r_style2','r_style3','r_style4','r_style5']

    merged = pd.merge(
        pd.merge(rating, user, how='inner', on='R_id'),
        clothes, how='inner', on='image',
    )
    merged = merged.drop(columns=dropped_columns)

    target = merged['선호여부']
    features = merged.drop(columns=['선호여부'])

    # NOTE(review): the encoder uses the default unknown-category handling, so
    # transforming data with unseen categories later will presumably raise.
    encoder = OneHotEncoder(sparse_output=False)
    onehot_values = encoder.fit_transform(features.loc[:, 'r_gender':'분위기'])

    # Column names are "col<position>_<category>", one per learned category.
    onehot_names = []
    for position, categories in enumerate(encoder.categories_):
        onehot_names.extend(f"col{position}_{category}" for category in categories)
    onehot = pd.DataFrame(onehot_values, columns=onehot_names)

    # The concat aligns on the index; `onehot` has a default 0..n-1 index.
    combined = pd.concat([onehot, features.loc[:, '멋있다':]], axis=1)
    return combined, target, encoder

In [3]:
def make_test_df(rating,clothes,user,encoder):
    df = pd.merge(rating,user,how='inner',on='R_id')
    df = pd.merge(df,clothes,how='inner',on='image')
    
    df=df.drop(columns=['스타일선호','mar','job','income','r_style1','r_style2','r_style3','r_style4','r_style5'])
    df_rating = df['선호여부']
    df = df.drop(columns=['선호여부'])
    df_encoded = encoder.transform(df.loc[:,'r_gender':'분위기'])
    df_encoded = pd.DataFrame(df_encoded,columns= [f"col{i}_{elem}" for i,sublist in enumerate(encoder.categories_) for elem in sublist])
    #df = pd.concat([df[['R_id','image']],df_encoded,df.loc[:,'멋있다':].astype(np.int8)],axis=1)
    df = pd.concat([df_encoded,df.loc[:,'멋있다':]],axis=1)
    #df = df_encoded
    return df,df_rating

In [4]:
# Load the TL_* (training) tables; clothes and user are indexed by their join
# keys so they can be merged onto the ratings by 'image' and 'R_id'.
rating, clothes, user = (
    pd.read_csv('../preprocessed/TL_woman_rating_2019.csv'),
    pd.read_csv('../preprocessed/TL_woman_clothes_2019.csv', index_col='image'),
    pd.read_csv('../preprocessed/TL_woman_user_2019.csv', index_col='R_id'),
)

# Fit the one-hot encoder on the training data and build features / target.
train_x, train_y, encoder = make_train_df(rating=rating, clothes=clothes, user=user)

In [5]:
# Persist the encoded training features and the target without the index column.
train_x.to_csv(path_or_buf='../train/train_x_woman.csv', index=False)
train_y.to_csv(path_or_buf='../train/train_y_woman.csv', index=False)

In [6]:
# Serialize the fitted one-hot encoder so the same category mapping can be reused.
with open('../encoder/onehot_encoder_woman.pkl', mode='wb') as f:
    f.write(pickle.dumps(encoder))

In [7]:
# Load the VL_* tables.  This rebinds `rating`, `clothes` and `user`, so cells
# that run afterwards see these tables rather than the TL_* ones.
rating, clothes, user = (
    pd.read_csv('../preprocessed/VL_woman_rating_2019.csv'),
    pd.read_csv('../preprocessed/VL_woman_clothes_2019.csv', index_col='image'),
    pd.read_csv('../preprocessed/VL_woman_user_2019.csv', index_col='R_id'),
)

In [8]:
# Encode the currently loaded tables with the already fitted encoder.
test_x, test_y = make_test_df(rating=rating, clothes=clothes, user=user, encoder=encoder)

In [9]:
def map_to_binary(value):
    """Return 0 when ``value`` is below 3.0 and 1 otherwise.

    Any value for which ``value < 3.0`` is false (including NaN) maps to 1.
    """
    return 0 if value < 3.0 else 1

In [10]:
def precision_at_k(rating_,predict_value,k):
    """Average precision@k over users that have at least two ratings.

    For each such user the rows are ranked by predicted score (descending) and
    the user's precision is ``(# top-k rows with '선호여부' >= 3.0) / k``.  The
    denominator is always ``k``, so a user with at most ``k`` rows contributes
    a value that does not depend on the predicted order.

    Parameters
    ----------
    rating_ : str or file-like
        Anything ``pd.read_csv`` accepts.  The table must contain the columns
        'R_id' and '선호여부'.
    predict_value : array-like
        One predicted score per row of the rating table, in the same row order.
    k : int
        Cut-off of the ranked list.

    Returns
    -------
    float
        Mean of the per-user precisions, or 0.0 when no user has two or more
        ratings.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rating rows.
    """
    rating_df = pd.read_csv(rating_)

    # Attach predictions by position.  A length mismatch would otherwise be
    # padded with NaN and silently distort the metric.
    scores = np.asarray(predict_value).ravel()
    if len(scores) != len(rating_df):
        raise ValueError(
            f"got {len(scores)} predictions for {len(rating_df)} rating rows"
        )
    rating_df['예측'] = scores

    precisions = []
    # One pass over the per-user groups instead of one boolean mask per user.
    for _, user_rows in rating_df.groupby('R_id', sort=False):
        if len(user_rows) < 2:
            continue
        top_k = user_rows.sort_values(by='예측', ascending=False).head(k)
        hits = int((top_k['선호여부'] >= 3.0).sum())
        precisions.append(hits / float(k))

    if not precisions:
        return 0.0
    return float(sum(precisions) / len(precisions))


def recall_at_k(rating_,predict_value,k):
    """Average recall@k over users that have at least two ratings.

    For each such user the rows are ranked by predicted score (descending) and
    the user's recall is the number of top-k rows with '선호여부' >= 3.0 divided
    by the number of all the user's rows with '선호여부' >= 3.0.  A user without
    any such row contributes 0 to the average.  A user with at most ``k`` rows
    contributes a value that does not depend on the predicted order.

    Parameters
    ----------
    rating_ : str or file-like
        Anything ``pd.read_csv`` accepts.  The table must contain the columns
        'R_id' and '선호여부'.
    predict_value : array-like
        One predicted score per row of the rating table, in the same row order.
    k : int
        Cut-off of the ranked list.

    Returns
    -------
    float
        Mean of the per-user recalls, or 0.0 when no user has two or more
        ratings.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rating rows.
    """
    rating_df = pd.read_csv(rating_)

    # Attach predictions by position.  A length mismatch would otherwise be
    # padded with NaN and silently distort the metric.
    scores = np.asarray(predict_value).ravel()
    if len(scores) != len(rating_df):
        raise ValueError(
            f"got {len(scores)} predictions for {len(rating_df)} rating rows"
        )
    rating_df['예측'] = scores

    recalls = []
    # One pass over the per-user groups instead of one boolean mask per user.
    for _, user_rows in rating_df.groupby('R_id', sort=False):
        if len(user_rows) < 2:
            continue
        ranked = user_rows.sort_values(by='예측', ascending=False)
        relevant_total = int((ranked['선호여부'] >= 3.0).sum())
        relevant_in_top_k = int((ranked.head(k)['선호여부'] >= 3.0).sum())
        recalls.append(relevant_in_top_k / relevant_total if relevant_total > 0 else 0)

    if not recalls:
        return 0.0
    return float(sum(recalls) / len(recalls))

In [11]:
def precision_at_k_by_personal(rating_,user_,predict_value,k):
    """Average precision@k over (personal_color, faceshape, bodyshape) groups.

    The rating table is inner-joined with the user table on 'R_id'.  Rows are
    grouped by the combination of 'personal_color', 'faceshape' and
    'bodyshape'; inside each group rows are ranked by predicted score
    (descending) and the group's precision is
    ``(# top-k rows with '선호여부' >= 3.0) / k``.  Rows with a missing value in
    any of the three grouping columns belong to no group.

    Parameters
    ----------
    rating_ : str or file-like
        Rating table for ``pd.read_csv``; needs 'R_id' and '선호여부'.
    user_ : str or file-like
        User table for ``pd.read_csv``; needs 'R_id', 'personal_color',
        'faceshape' and 'bodyshape'.
    predict_value : array-like
        One predicted score per row of the joined table, in row order.
    k : int
        Cut-off of the ranked list.

    Returns
    -------
    float
        Mean of the per-group precisions, or 0.0 when there is no group.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of joined rows
        (for example because the inner join dropped ratings of unknown users).
    """
    rating_df = pd.read_csv(rating_)
    user_df = pd.read_csv(user_)
    rating_df = rating_df.merge(user_df,how='inner',on='R_id')

    # Attach predictions by position.  A length mismatch would otherwise be
    # padded with NaN and silently distort the metric.
    scores = np.asarray(predict_value).ravel()
    if len(scores) != len(rating_df):
        raise ValueError(
            f"got {len(scores)} predictions for {len(rating_df)} joined rating rows"
        )
    rating_df['예측'] = scores

    precisions = []
    # groupby visits only the combinations that occur, replacing the triple
    # loop over every color x face x body product.
    group_keys = ['personal_color', 'faceshape', 'bodyshape']
    for _, group_rows in rating_df.groupby(group_keys, sort=False):
        top_k = group_rows.sort_values(by='예측', ascending=False).head(k)
        if len(top_k) > 0:
            hits = int((top_k['선호여부'] >= 3.0).sum())
            precisions.append(hits / float(k))

    if not precisions:
        return 0.0
    return float(sum(precisions) / len(precisions))


def recall_at_k_by_personal(rating_,user_,predict_value,k):
    """Average recall@k over (personal_color, faceshape, bodyshape) groups.

    The rating table is inner-joined with the user table on 'R_id'.  Rows are
    grouped by the combination of 'personal_color', 'faceshape' and
    'bodyshape'; inside each group rows are ranked by predicted score
    (descending) and the group's recall is the number of top-k rows with
    '선호여부' >= 3.0 divided by the number of all group rows with
    '선호여부' >= 3.0.  A group without any such row contributes 0.  Rows with a
    missing value in any of the three grouping columns belong to no group.

    Parameters
    ----------
    rating_ : str or file-like
        Rating table for ``pd.read_csv``; needs 'R_id' and '선호여부'.
    user_ : str or file-like
        User table for ``pd.read_csv``; needs 'R_id', 'personal_color',
        'faceshape' and 'bodyshape'.
    predict_value : array-like
        One predicted score per row of the joined table, in row order.
    k : int
        Cut-off of the ranked list.

    Returns
    -------
    float
        Mean of the per-group recalls, or 0.0 when there is no group.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of joined rows
        (for example because the inner join dropped ratings of unknown users).
    """
    rating_df = pd.read_csv(rating_)
    user_df = pd.read_csv(user_)
    rating_df = rating_df.merge(user_df,how='inner',on='R_id')

    # Attach predictions by position.  A length mismatch would otherwise be
    # padded with NaN and silently distort the metric.
    scores = np.asarray(predict_value).ravel()
    if len(scores) != len(rating_df):
        raise ValueError(
            f"got {len(scores)} predictions for {len(rating_df)} joined rating rows"
        )
    rating_df['예측'] = scores

    recalls = []
    # groupby visits only the combinations that occur, replacing the triple
    # loop over every color x face x body product.
    group_keys = ['personal_color', 'faceshape', 'bodyshape']
    for _, group_rows in rating_df.groupby(group_keys, sort=False):
        ranked = group_rows.sort_values(by='예측', ascending=False)
        top_k = ranked.head(k)
        if len(top_k) > 0:
            relevant_total = int((ranked['선호여부'] >= 3.0).sum())
            relevant_in_top_k = int((top_k['선호여부'] >= 3.0).sum())
            recalls.append(relevant_in_top_k / relevant_total if relevant_total > 0 else 0)

    if not recalls:
        return 0.0
    return float(sum(recalls) / len(recalls))

In [12]:
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
def test_model(reg, k=10):
    """Print regression, classification and ranking diagnostics for ``reg``.

    Reads the module-level ``train_x``, ``train_y``, ``test_x`` and ``test_y``
    and prints, for both splits: RMSE, a classification report after mapping
    targets and predictions through ``map_to_binary``, per-user
    precision/recall@k, and precision/recall@k per
    (personal_color, faceshape, bodyshape) group.

    Parameters
    ----------
    reg : object
        Fitted model exposing ``predict``.
    k : int, default 10
        Cut-off used for every precision/recall@k line.

    Returns
    -------
    None
    """
    train_rating_csv = '../preprocessed/TL_woman_rating_2019.csv'
    train_user_csv = '../preprocessed/TL_woman_user_2019.csv'
    test_rating_csv = '../preprocessed/VL_woman_rating_2019.csv'
    test_user_csv = '../preprocessed/VL_woman_user_2019.csv'

    train_predict = reg.predict(train_x)
    test_predict = reg.predict(test_x)

    # scikit-learn's signature is (y_true, y_pred); the value is symmetric for
    # MSE, the order is kept conventional for readers.
    print("Train RMSE:{}".format(math.sqrt(mean_squared_error(train_y, train_predict))))
    print("Test RMSE:{}".format(math.sqrt(mean_squared_error(test_y, test_predict))))

    to_binary = np.vectorize(map_to_binary)
    print("train classification report")
    print(classification_report(to_binary(train_y), to_binary(train_predict)))
    print("test classification report")
    print(classification_report(to_binary(test_y), to_binary(test_predict)))

    print(f"precision@{k} of train: ", precision_at_k(train_rating_csv, train_predict, k))
    print(f"recall@{k} of train: ", recall_at_k(train_rating_csv, train_predict, k))
    print(f"precision@{k} of test: ", precision_at_k(test_rating_csv, test_predict, k))
    print(f"recall@{k} of test: ", recall_at_k(test_rating_csv, test_predict, k))

    print("PERSONAL PRECISION RECALL")
    print(f"precision@{k} of train: ", precision_at_k_by_personal(train_rating_csv, train_user_csv, train_predict, k))
    print(f"recall@{k} of train: ", recall_at_k_by_personal(train_rating_csv, train_user_csv, train_predict, k))
    print(f"precision@{k} of test: ", precision_at_k_by_personal(test_rating_csv, test_user_csv, test_predict, k))
    print(f"recall@{k} of test: ", recall_at_k_by_personal(test_rating_csv, test_user_csv, test_predict, k))

In [13]:
# First hyper-parameter set: 500 boosting rounds (300 trees for the two
# scikit-learn ensembles), learning rate 0.05, fixed seeds.
models = {
    "XGBoost": xgb.XGBRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, gamma=0,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=500, learning_rate=0.05, num_leaves=31,
        max_depth=6, subsample=0.8, colsample_bytree=0.8,
        # LightGBM only applies `subsample` when bagging is switched on with a
        # non-zero `subsample_freq` (default 0); without it the 0.8 is ignored.
        subsample_freq=1,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=5,
        min_samples_split=5, min_samples_leaf=2, subsample=0.8,
        random_state=0
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=300, max_depth=10, min_samples_split=5,
        min_samples_leaf=2, random_state=0
    ),
    "CatBoost": CatBoostRegressor(
        iterations=500, learning_rate=0.05, depth=6,
        l2_leaf_reg=1, verbose=0,subsample=0.8,random_state=0
    ),
}


In [14]:
# Fit each configured regressor on the training split, then print its report.
for model_name in models:
    model = models[model_name]
    print(f"Training {model_name} model...")
    model.fit(train_x, train_y)
    print(f"Evaluating {model_name} model...")
    test_model(model)
    

Training XGBoost model...
Evaluating XGBoost model...
Train RMSE:0.635439859114642
RMSE':0.7734359416458834
train classification report
              precision    recall  f1-score   support

           0       0.54      0.98      0.70      7516
           1       0.96      0.37      0.54     10027

    accuracy                           0.63     17543
   macro avg       0.75      0.68      0.62     17543
weighted avg       0.78      0.63      0.61     17543

test classification report
              precision    recall  f1-score   support

           0       0.49      0.92      0.64      1033
           1       0.84      0.29      0.43      1405

    accuracy                           0.56      2438
   macro avg       0.66      0.61      0.54      2438
weighted avg       0.69      0.56      0.52      2438

precision@10 of train:  0.3504745166959577
recall@10 of train:  0.9483883411216687
precision@10 of test:  0.13938356164383595
recall@10 of test:  0.839041095890411
PERSONAL PRECISION 

In [15]:
# Second hyper-parameter set: every ensemble uses 1000 rounds / trees, plus a
# single depth-limited decision tree for comparison.
models = {
    "XGBoost": xgb.XGBRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, gamma=0,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=1000, learning_rate=0.05, num_leaves=31,
        max_depth=6, subsample=0.8, colsample_bytree=0.8,
        # LightGBM only applies `subsample` when bagging is switched on with a
        # non-zero `subsample_freq` (default 0); without it the 0.8 is ignored.
        subsample_freq=1,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=5,
        min_samples_split=5, min_samples_leaf=2, subsample=0.8,
        random_state=0
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=1000, max_depth=10, min_samples_split=5,
        min_samples_leaf=2, random_state=0
    ),
    "CatBoost": CatBoostRegressor(
        iterations=1000, learning_rate=0.05, depth=6,
        l2_leaf_reg=1, subsample=0.8, verbose=0,random_state=0
    ),
    "DecisionTree": DecisionTreeRegressor(
        max_depth=10, min_samples_split=5, min_samples_leaf=2,
        random_state=0
    )
}

In [16]:
# Fit each configured regressor on the training split, then print its report.
for model_name in models:
    model = models[model_name]
    print(f"Training {model_name} model...")
    model.fit(train_x, train_y)
    print(f"Evaluating {model_name} model...")
    test_model(model)
    

Training XGBoost model...
Evaluating XGBoost model...
Train RMSE:0.5589698172254809
RMSE':0.7848611027435858
train classification report
              precision    recall  f1-score   support

           0       0.56      0.99      0.72      7516
           1       0.98      0.42      0.59     10027

    accuracy                           0.66     17543
   macro avg       0.77      0.70      0.65     17543
weighted avg       0.80      0.66      0.64     17543

test classification report
              precision    recall  f1-score   support

           0       0.49      0.91      0.64      1033
           1       0.83      0.31      0.45      1405

    accuracy                           0.57      2438
   macro avg       0.66      0.61      0.55      2438
weighted avg       0.69      0.57      0.53      2438

precision@10 of train:  0.3506151142355007
recall@10 of train:  0.9484215377629497
precision@10 of test:  0.13938356164383595
recall@10 of test:  0.839041095890411
PERSONAL PRECISION

In [17]:
# Baseline set: every estimator with library defaults and a fixed seed.
models = {
    "XGBoost": xgb.XGBRegressor(random_state=0),
    "LightGBM": lgb.LGBMRegressor(random_state=0),
    "GradientBoosting": GradientBoostingRegressor(random_state=0),
    "RandomForest": RandomForestRegressor(random_state=0),
    # verbose=0 only silences CatBoost's per-iteration training log, matching
    # the earlier configurations; it does not change the fitted model.
    "CatBoost": CatBoostRegressor(random_state=0, verbose=0),
    "DecisionTree": DecisionTreeRegressor(random_state=0),
}

In [18]:
# Fit each configured regressor on the training split, then print its report.
for model_name in models:
    model = models[model_name]
    print(f"Training {model_name} model...")
    model.fit(train_x, train_y)
    print(f"Evaluating {model_name} model...")
    test_model(model)

Training XGBoost model...
Evaluating XGBoost model...
Train RMSE:0.6212227577693992
RMSE':0.7958271000811171
train classification report
              precision    recall  f1-score   support

           0       0.55      0.98      0.70      7516
           1       0.96      0.40      0.56     10027

    accuracy                           0.65     17543
   macro avg       0.75      0.69      0.63     17543
weighted avg       0.78      0.65      0.62     17543

test classification report
              precision    recall  f1-score   support

           0       0.50      0.91      0.64      1033
           1       0.84      0.32      0.47      1405

    accuracy                           0.57      2438
   macro avg       0.67      0.62      0.55      2438
weighted avg       0.69      0.57      0.54      2438

precision@10 of train:  0.35057996485061493
recall@10 of train:  0.9485754374506572
precision@10 of test:  0.13938356164383595
recall@10 of test:  0.839041095890411
PERSONAL PRECISIO