In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import pickle

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

- [출처](https://yaeyang0629.tistory.com/entry/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D%ED%8C%8C%EC%9D%B4%EC%8D%AC-Random-Forest-Regressor%ED%9A%8C%EA%B7%80)

In [2]:
def make_train_df(rating,clothes,user):
    """Join the rating, user and clothes tables and one-hot encode the features.

    The three frames are inner-joined (rating with user on 'R_id', then with
    clothes on 'image'). A fixed set of user columns is dropped, the target
    column '선호여부' is split off, and every column from 'r_gender' through
    '분위기' is one-hot encoded with a newly fitted OneHotEncoder. Columns from
    '멋있다' to the end are appended unchanged.

    Returns
    -------
    (features, target, encoder)
        features : DataFrame of one-hot columns named "col{i}_{category}"
                   followed by the untouched trailing columns.
        target   : the '선호여부' Series.
        encoder  : the fitted OneHotEncoder, for reuse on other data.
    """
    unused_columns = [
        '스타일선호', 'mar', 'job', 'income',
        'r_style1', 'r_style2', 'r_style3', 'r_style4', 'r_style5',
    ]
    merged = (
        rating
        .merge(user, how='inner', on='R_id')
        .merge(clothes, how='inner', on='image')
        .drop(columns=unused_columns)
    )

    df_rating = merged['선호여부']
    features = merged.drop(columns=['선호여부'])

    encoder = OneHotEncoder(sparse_output=False)
    onehot_values = encoder.fit_transform(features.loc[:, 'r_gender':'분위기'])

    # One output column per (encoded input column, category) pair, in encoder order.
    onehot_names = []
    for position, categories in enumerate(encoder.categories_):
        for category in categories:
            onehot_names.append(f"col{position}_{category}")
    onehot_df = pd.DataFrame(onehot_values, columns=onehot_names)

    # Both frames carry a fresh 0..n-1 index, so the column-wise concat lines up row by row.
    df = pd.concat([onehot_df, features.loc[:, '멋있다':]], axis=1)
    return df, df_rating, encoder

In [3]:
def make_test_df(rating,clothes,user,encoder):
    """Build a feature frame with an already-fitted one-hot encoder.

    Performs the same joins and column drops as the training pipeline
    (rating with user on 'R_id', then with clothes on 'image'), but only calls
    ``encoder.transform`` -- the encoder is never refitted here.

    Returns
    -------
    (features, target)
        features : DataFrame of one-hot columns named "col{i}_{category}"
                   followed by the columns from '멋있다' onward.
        target   : the '선호여부' Series.
    """
    unused_columns = [
        '스타일선호', 'mar', 'job', 'income',
        'r_style1', 'r_style2', 'r_style3', 'r_style4', 'r_style5',
    ]
    merged = (
        rating
        .merge(user, how='inner', on='R_id')
        .merge(clothes, how='inner', on='image')
        .drop(columns=unused_columns)
    )

    df_rating = merged['선호여부']
    features = merged.drop(columns=['선호여부'])

    onehot_values = encoder.transform(features.loc[:, 'r_gender':'분위기'])

    # Column names follow the category order stored on the fitted encoder.
    onehot_names = []
    for position, categories in enumerate(encoder.categories_):
        for category in categories:
            onehot_names.append(f"col{position}_{category}")
    onehot_df = pd.DataFrame(onehot_values, columns=onehot_names)

    df = pd.concat([onehot_df, features.loc[:, '멋있다':]], axis=1)
    return df, df_rating

In [4]:
# Load the three TL_* tables (presumably the training split -- confirm against
# the preprocessing step). clothes and user are indexed by 'image' and 'R_id',
# the same names make_train_df uses as merge keys.
rating = pd.read_csv('../preprocessed/TL_man_rating_2019.csv')
clothes = pd.read_csv('../preprocessed/TL_man_clothes_2019.csv',index_col='image')
user = pd.read_csv('../preprocessed/TL_man_user_2019.csv',index_col='R_id')

# Build the training features/target and keep the fitted encoder for later reuse.
train_x,train_y,encoder = make_train_df(rating,clothes,user)

In [5]:
# Write the training features and target to CSV; index=False leaves the row index out.
train_x.to_csv('../train/train_x_man.csv',index=False)
train_y.to_csv('../train/train_y_man.csv',index=False)

In [6]:
# Serialize the fitted one-hot encoder to disk with pickle.
# NOTE(review): only unpickle this file from a trusted location -- pickle.load
# can execute arbitrary code.
with open('../encoder/onehot_encoder_man.pkl','wb') as f:
    pickle.dump(encoder,f)

In [7]:
# Load the VL_* tables (presumably the validation split -- confirm).
# NOTE: this rebinds rating/clothes/user, so the TL_* frames loaded earlier are
# no longer reachable under these names after this cell runs.
rating = pd.read_csv('../preprocessed/VL_man_rating_2019.csv')
clothes = pd.read_csv('../preprocessed/VL_man_clothes_2019.csv',index_col='image')
user = pd.read_csv('../preprocessed/VL_man_user_2019.csv',index_col='R_id')


In [8]:
# Encode the VL_* tables with the encoder fitted on the training data (no refit).
test_x,test_y = make_test_df(rating,clothes,user,encoder)

In [9]:
def precision_at_k(rating_,predict_value,k):
    """Mean precision@k over users that have at least two rated rows.

    Parameters
    ----------
    rating_ : path or file-like object accepted by ``pd.read_csv``.
        Must provide the columns 'R_id' and '선호여부'.
    predict_value : sequence of predicted scores.
        Attached to the rating rows by position (row i receives
        predict_value[i]), so it must be in the same row order as the CSV.
    k : int
        Number of top-scored rows inspected per user.

    Returns
    -------
    float
        Average over qualifying users of
        (number of top-k rows with '선호여부' >= 3.0) / k.
        The denominator is always k, even for users with fewer than k rows.
        Returns 0.0 when no user has at least two rows (previously this
        raised ZeroDivisionError).
    """
    rating_df = pd.read_csv(rating_)
    predict_df = pd.DataFrame({'예측': predict_value})
    rating_df = pd.concat([rating_df,predict_df],axis=1)

    precisions = []

    # One groupby pass instead of re-masking the whole frame for every user.
    # sort=False visits users in order of first appearance.
    for _, user_rows in rating_df.groupby('R_id', sort=False):
        if len(user_rows) < 2:
            continue
        top_k = user_rows.sort_values(by='예측',ascending=False).head(k)
        hits = int((top_k['선호여부'] >= 3.0).sum())
        precisions.append(hits / float(k))

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)


def recall_at_k(rating_,predict_value,k):
    """Mean recall@k over users that have at least two rated rows.

    Parameters
    ----------
    rating_ : path or file-like object accepted by ``pd.read_csv``.
        Must provide the columns 'R_id' and '선호여부'.
    predict_value : sequence of predicted scores.
        Attached to the rating rows by position (row i receives
        predict_value[i]), so it must be in the same row order as the CSV.
    k : int
        Number of top-scored rows inspected per user.

    Returns
    -------
    float
        Average over qualifying users of
        (top-k rows with '선호여부' >= 3.0) / (all of the user's rows with
        '선호여부' >= 3.0). A user with no such rows contributes 0.
        Returns 0.0 when no user has at least two rows (previously this
        raised ZeroDivisionError).
    """
    rating_df = pd.read_csv(rating_)
    predict_df = pd.DataFrame({'예측': predict_value})
    rating_df = pd.concat([rating_df,predict_df],axis=1)

    recalls = []

    # One groupby pass instead of re-masking the whole frame for every user.
    # sort=False visits users in order of first appearance.
    for _, user_rows in rating_df.groupby('R_id', sort=False):
        if len(user_rows) < 2:
            continue
        ranked = user_rows.sort_values(by='예측',ascending=False)
        relevant_total = int((ranked['선호여부'] >= 3.0).sum())
        if relevant_total > 0:
            relevant_in_top_k = int((ranked.head(k)['선호여부'] >= 3.0).sum())
            recalls.append(relevant_in_top_k / relevant_total)
        else:
            # Nothing relevant to retrieve for this user.
            recalls.append(0)

    if not recalls:
        return 0.0
    return sum(recalls) / len(recalls)

In [10]:
from sklearn.metrics import classification_report
def map_to_binary(value):
    """Binarize a rating: 0 when it is strictly below 3.0, otherwise 1."""
    return 0 if value < 3.0 else 1

In [11]:
def precision_at_k_by_personal(rating_,user_,predict_value,k):
    """Mean precision@k over (personal_color, faceshape, bodyshape) groups.

    Parameters
    ----------
    rating_ : path or file-like object accepted by ``pd.read_csv``.
        Must provide 'R_id' and '선호여부'.
    user_ : path or file-like object accepted by ``pd.read_csv``.
        Must provide 'R_id', 'personal_color', 'faceshape' and 'bodyshape'.
    predict_value : sequence of predicted scores.
        Attached by position to the rating/user inner join, so it must follow
        the row order of that merged frame.
    k : int
        Number of top-scored rows inspected per group.

    Returns
    -------
    float
        Average over observed attribute combinations of
        (number of top-k rows with '선호여부' >= 3.0) / k.
        Returns 0.0 when there is no non-empty group (previously this raised
        ZeroDivisionError).
    """
    rating_df = pd.read_csv(rating_)
    user_df = pd.read_csv(user_)
    predict_df = pd.DataFrame({'예측': predict_value})
    rating_df = rating_df.merge(user_df,how='inner',on='R_id')
    rating_df = pd.concat([rating_df,predict_df],axis=1)

    precisions = []

    # groupby visits only combinations that actually occur, instead of testing
    # every color x face x body combination against the full frame.
    group_keys = ['personal_color','faceshape','bodyshape']
    for _, group_rows in rating_df.groupby(group_keys, sort=False):
        top_k = group_rows.sort_values(by='예측',ascending=False).head(k)
        if len(top_k) > 0:
            hits = int((top_k['선호여부'] >= 3.0).sum())
            precisions.append(hits / float(k))

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)


def recall_at_k_by_personal(rating_,user_,predict_value,k):
    """Mean recall@k over (personal_color, faceshape, bodyshape) groups.

    Parameters
    ----------
    rating_ : path or file-like object accepted by ``pd.read_csv``.
        Must provide 'R_id' and '선호여부'.
    user_ : path or file-like object accepted by ``pd.read_csv``.
        Must provide 'R_id', 'personal_color', 'faceshape' and 'bodyshape'.
    predict_value : sequence of predicted scores.
        Attached by position to the rating/user inner join, so it must follow
        the row order of that merged frame.
    k : int
        Number of top-scored rows inspected per group.

    Returns
    -------
    float
        Average over observed attribute combinations of
        (top-k rows with '선호여부' >= 3.0) / (all rows of the group with
        '선호여부' >= 3.0). A group with no such rows contributes 0.
        Returns 0.0 when there is no non-empty group (previously this raised
        ZeroDivisionError).
    """
    rating_df = pd.read_csv(rating_)
    user_df = pd.read_csv(user_)
    predict_df = pd.DataFrame({'예측': predict_value})
    rating_df = rating_df.merge(user_df,how='inner',on='R_id')
    rating_df = pd.concat([rating_df,predict_df],axis=1)

    recalls = []

    # groupby visits only combinations that actually occur, instead of testing
    # every color x face x body combination against the full frame.
    group_keys = ['personal_color','faceshape','bodyshape']
    for _, group_rows in rating_df.groupby(group_keys, sort=False):
        ranked = group_rows.sort_values(by='예측',ascending=False)
        top_k = ranked.head(k)
        if len(top_k) > 0:
            relevant_total = int((ranked['선호여부'] >= 3.0).sum())
            if relevant_total > 0:
                relevant_in_top_k = int((top_k['선호여부'] >= 3.0).sum())
                recalls.append(relevant_in_top_k / relevant_total)
            else:
                # Nothing relevant to retrieve in this group.
                recalls.append(0)

    if not recalls:
        return 0.0
    return sum(recalls) / len(recalls)

In [12]:
import math
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
def test_model(reg, k=10):
    """Print regression, classification and ranking metrics for a fitted model.

    Reads the notebook-level globals train_x, train_y, test_x and test_y, and
    re-reads the TL_*/VL_* rating and user CSVs through the precision/recall
    helpers. Nothing is returned; every result is printed.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``.
    k : int, default 10
        Cut-off used for all precision@k / recall@k figures.
    """
    train_rating_csv = '../preprocessed/TL_man_rating_2019.csv'
    train_user_csv = '../preprocessed/TL_man_user_2019.csv'
    test_rating_csv = '../preprocessed/VL_man_rating_2019.csv'
    test_user_csv = '../preprocessed/VL_man_user_2019.csv'

    # mean_squared_error is documented as (y_true, y_pred); the value is the
    # same either way, but the call now matches the documented order.
    train_predict = reg.predict(train_x)
    print("Train RMSE:{}".format(math.sqrt(mean_squared_error(train_y, train_predict))) )
    test_predict = reg.predict(test_x)
    print("Test RMSE:{}".format(math.sqrt(mean_squared_error(test_y, test_predict))) )

    # Binarize targets and predictions with map_to_binary for the classification view.
    to_binary = np.vectorize(map_to_binary)
    train_predict_binary = to_binary(train_predict)
    train_y_binary = to_binary(train_y)
    test_predict_binary = to_binary(test_predict)
    test_y_binary = to_binary(test_y)
    print('train classification report')
    print(classification_report(train_y_binary,train_predict_binary))
    print('test classification report')
    print(classification_report(test_y_binary,test_predict_binary))

    # Per-user ranking metrics.
    print(f"precision@{k} of train" , precision_at_k(train_rating_csv,train_predict,k))
    print(f"recall@{k} of train" , recall_at_k(train_rating_csv,train_predict,k))
    # In the validation data each user ID rated only 1-2 clothes, so the
    # precision@k figure below is not reliable.
    print(f"precision@{k} of test: ",precision_at_k(test_rating_csv,test_predict,k))
    print(f"recall@{k} of test: ",recall_at_k(test_rating_csv,test_predict,k))

    # Ranking metrics grouped by personal attributes instead of by user.
    print('PERSONAL PRECISION AND RECALL')
    print(f"precision@{k} of train" , precision_at_k_by_personal(train_rating_csv,train_user_csv,train_predict,k))
    print(f"recall@{k} of train" , recall_at_k_by_personal(train_rating_csv,train_user_csv,train_predict,k))
    print(f"precision@{k} of test: ",precision_at_k_by_personal(test_rating_csv,test_user_csv,test_predict,k))
    print(f"recall@{k} of test: ",recall_at_k_by_personal(test_rating_csv,test_user_csv,test_predict,k))
    

In [13]:
# Tuned configurations, 300-500 trees/iterations per ensemble, all seeded with random_state=0.
models = {
    "XGBoost": xgb.XGBRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, gamma=0,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is non-zero; with the default of 0 the
    # 0.8 row subsampling was silently ignored.
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=500, learning_rate=0.05, num_leaves=31,
        max_depth=6, subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=5,
        min_samples_split=5, min_samples_leaf=2, subsample=0.8,
        random_state=0
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=300, max_depth=10, min_samples_split=5,
        min_samples_leaf=2, random_state=0
    ),
    "CatBoost": CatBoostRegressor(
        iterations=500, learning_rate=0.05, depth=6,
        l2_leaf_reg=1, verbose=0,subsample=0.8,random_state=0
    ),
    "DecisionTree": DecisionTreeRegressor(
        max_depth=10, min_samples_split=5, min_samples_leaf=2,
        random_state=0
    )
}


In [14]:
# Fit every regressor in `models` on the training split, then print its metrics.
for model_name, model in models.items():
    print(f"Training {model_name} model...")
    model.fit(train_x, train_y)
    print(f"Evaluating {model_name} model...")
    # test_model only prints; no scores are collected for later comparison.
    test_model(model)
    

Training XGBoost model...
Evaluating XGBoost model...
Train RMSE:0.5815538785639204
Test RMSE':0.7458876879472612
train classification report
              precision    recall  f1-score   support

           0       0.63      0.99      0.77      6883
           1       0.97      0.36      0.52      6224

    accuracy                           0.69     13107
   macro avg       0.80      0.67      0.65     13107
weighted avg       0.79      0.69      0.65     13107

test classification report
              precision    recall  f1-score   support

           0       0.59      0.95      0.73       939
           1       0.85      0.29      0.43       862

    accuracy                           0.63      1801
   macro avg       0.72      0.62      0.58      1801
weighted avg       0.71      0.63      0.59      1801

precision@10 of train 0.3301809690601292
recall@10 of train 0.9248725655301728
precision@10 of test:  0.13402298850574776
recall@10 of test:  0.7724137931034483
PERSONAL PRECISI

In [15]:
# Same configurations with 1000 trees/iterations per ensemble.
# NOTE: this rebinds `models`, replacing the previously fitted estimators.
models = {
    "XGBoost": xgb.XGBRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, gamma=0,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    # LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is non-zero; with the default of 0 the
    # 0.8 row subsampling was silently ignored.
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=1000, learning_rate=0.05, num_leaves=31,
        max_depth=6, subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        reg_alpha=0.1, reg_lambda=1, random_state=0
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=5,
        min_samples_split=5, min_samples_leaf=2, subsample=0.8,
        random_state=0
    ),
    "RandomForest": RandomForestRegressor(
        n_estimators=1000, max_depth=10, min_samples_split=5,
        min_samples_leaf=2, random_state=0
    ),
    "CatBoost": CatBoostRegressor(
        iterations=1000, learning_rate=0.05, depth=6,
        l2_leaf_reg=1, subsample=0.8, verbose=0,random_state=0
    ),
    "DecisionTree": DecisionTreeRegressor(
        max_depth=10, min_samples_split=5, min_samples_leaf=2,
        random_state=0
    )
}

In [16]:
# Refit and re-evaluate using whatever `models` currently holds.
for model_name, model in models.items():
    print(f"Training {model_name} model...")
    model.fit(train_x, train_y)
    print(f"Evaluating {model_name} model...")
    # Metrics are printed by test_model rather than returned.
    test_model(model)

Training XGBoost model...
Evaluating XGBoost model...
Train RMSE:0.500304935756086
Test RMSE':0.7623015301790759
train classification report
              precision    recall  f1-score   support

           0       0.65      1.00      0.78      6883
           1       0.99      0.40      0.57      6224

    accuracy                           0.71     13107
   macro avg       0.82      0.70      0.68     13107
weighted avg       0.81      0.71      0.68     13107

test classification report
              precision    recall  f1-score   support

           0       0.60      0.94      0.73       939
           1       0.83      0.30      0.45       862

    accuracy                           0.64      1801
   macro avg       0.71      0.62      0.59      1801
weighted avg       0.71      0.64      0.59      1801

precision@10 of train 0.33199065966141367
recall@10 of train 0.9266643727049735
precision@10 of test:  0.13402298850574776
recall@10 of test:  0.7724137931034483
PERSONAL PRECISI

In [17]:
# Baseline: every estimator with library-default hyperparameters, seeded only.
# NOTE: this rebinds `models`, replacing the previously fitted estimators.
models = {
    "XGBoost": xgb.XGBRegressor(
         random_state=0
    ),
    "LightGBM": lgb.LGBMRegressor(
        random_state=0
    ),
    "GradientBoosting": GradientBoostingRegressor(
       
        random_state=0
    ),
    "RandomForest": RandomForestRegressor(
         random_state=0
    ),
    # verbose=0 only silences CatBoost's per-iteration training log, matching
    # the other configurations; it is not a modelling hyperparameter.
    "CatBoost": CatBoostRegressor(
        verbose=0, random_state=0
    ),
    "DecisionTree": DecisionTreeRegressor(
        random_state=0
    )
}

In [18]:
# Same fit/evaluate loop, applied to the current contents of `models`.
for model_name, model in models.items():
    print(f"Training {model_name} model...")
    model.fit(train_x, train_y)
    print(f"Evaluating {model_name} model...")
    # Output is print-only; compare runs by reading the printed reports.
    test_model(model)

Training XGBoost model...
Evaluating XGBoost model...
Train RMSE:0.5647244256382448
Test RMSE':0.7670884839585537
train classification report
              precision    recall  f1-score   support

           0       0.64      0.99      0.78      6883
           1       0.97      0.38      0.54      6224

    accuracy                           0.70     13107
   macro avg       0.81      0.68      0.66     13107
weighted avg       0.80      0.70      0.67     13107

test classification report
              precision    recall  f1-score   support

           0       0.60      0.95      0.74       939
           1       0.85      0.31      0.46       862

    accuracy                           0.64      1801
   macro avg       0.72      0.63      0.60      1801
weighted avg       0.72      0.64      0.60      1801

precision@10 of train 0.33076474022183394
recall@10 of train 0.9255555582640049
precision@10 of test:  0.13402298850574776
recall@10 of test:  0.7724137931034483
PERSONAL PRECIS