# 초기 가설 설정

In [1]:
import pandas as pd
import os
import random
import numpy as np
import tensorflow as tf
import re

aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')
train = pd.read_csv('order_products__train.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

product_features = products

products_reordered_mean = train.groupby(['product_id'])['reordered'].mean().to_frame('product_reorder_rate').reset_index()

#각 제품의 재구매율 피처에 추가
product_features = product_features.merge(products_reordered_mean, on = 'product_id')

health_keywords = ['low fat', 'gluten free', 'sugar free', 'organic', 'diet', 'zero sugar']

pattern_list = [
r'low[\s\-]?fat',
r'gluten[\s\-]?free',
r'sugar[\s\-]?free',
r'zero[\s\-]?sugar',
r'\bdiet\b',
r'\borganic\b'
]

combined_pattern = re.compile('|'.join(pattern_list), re.IGNORECASE)

product_features['healthcare'] = product_features['product_name'].apply(
lambda x: int(bool(combined_pattern.search(str(x))))
)

# 제품별 재구매율 계산
prod_reorder = train.groupby('product_id')['reordered'].mean().reset_index()
prod_reorder.columns = ['product_id', 'product_reorder_rate']

# 제품 정보와 결합하여 카테고리 부여
prod_cat = products.merge(prod_reorder, on='product_id', how='left')

# department 기준
dept_reorder = prod_cat.groupby('department_id')['product_reorder_rate'].mean().reset_index()
dept_reorder.columns = ['department_id', 'department_reorder_rate']

# aisle 기준
aisle_reorder = prod_cat.groupby('aisle_id')['product_reorder_rate'].mean().reset_index()
aisle_reorder.columns = ['aisle_id', 'aisle_reorder_rate']

# product_id 단위로 피처 부여
product_features = product_features.merge(dept_reorder, on='department_id', how='left')
product_features = product_features.merge(aisle_reorder, on='aisle_id', how='left')

train_data = train.merge(product_features, on='product_id', how='left')
train_data = train_data[['order_id', 'product_id', 'healthcare', 'add_to_cart_order', 'product_reorder_rate', 'department_reorder_rate', 'aisle_reorder_rate', 'reordered']]

## 데이터 정규화, 데이터셋 분리

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score, classification_report

# 데이터 정규화
sc = StandardScaler()
train_data = train_data.dropna()

# X, y 분리
X = train_data.drop(['order_id', 'product_id', 'reordered'], axis=1)
y = train_data['reordered']

X_scaled = sc.fit_transform(X)

# 데이터 분할 훈련, 테스트 6:4
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.4, random_state=42)

## 로지스틱 회귀

In [3]:
from sklearn.linear_model import LogisticRegression

# 모델(로지스틱 회귀) 호출
lr = LogisticRegression(random_state=42, C=0.1)

# 모델 학습
lr.fit(X_train, y_train)

## 랜덤 포레스트

In [4]:
from sklearn.ensemble import RandomForestClassifier

# 모델(랜덤 포레스트) 호출
rf = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=10, min_samples_split=2)

# 모델 학습
rf.fit(X_train, y_train)

## XGBoost

In [5]:
from xgboost import XGBClassifier

# 모델(XGBoost) 호출
xgb = XGBClassifier(random_state=42, learning_rate=0.01, n_estimators=500, max_depth=5, subsample=0.8)

# 모델 학습
xgb.fit(X_train, y_train)

## 모델 예측 및 평가

In [6]:
# 모델 예측
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
xgb_pred = xgb.predict(X_test)

report_lr = classification_report(y_test, lr_pred, output_dict=True)
report_rf = classification_report(y_test, rf_pred, output_dict=True)
report_xgb = classification_report(y_test, xgb_pred, output_dict=True)

# ROC-AUC
roc_auc_lr = roc_auc_score(y_test, lr_pred)
roc_auc_rf = roc_auc_score(y_test, rf_pred)
roc_auc_xgb = roc_auc_score(y_test, xgb_pred)

print(f'로지스틱 회귀 ROC-AUC: {roc_auc_lr:.2f}')
print(f'랜덤 포레스트 ROC-AUC: {roc_auc_rf:.2f}')
print(f'XGBoost ROC-AUC: {roc_auc_xgb:.2f}')

print()
print('로지스틱 회귀')
print(pd.DataFrame(report_lr).T)
print()
print('랜덤 포레스트')
print(pd.DataFrame(report_rf).T)
print()
print('XGBoost')
print(pd.DataFrame(report_xgb).T)

로지스틱 회귀 ROC-AUC: 0.63
랜덤 포레스트 ROC-AUC: 0.63
XGBoost ROC-AUC: 0.63

로지스틱 회귀
              precision    recall  f1-score        support
0              0.640085  0.409686  0.499602  222097.000000
1              0.681543  0.845778  0.754830  331750.000000
accuracy       0.670902  0.670902  0.670902       0.670902
macro avg      0.660814  0.627732  0.627216  553847.000000
weighted avg   0.664918  0.670902  0.652482  553847.000000

랜덤 포레스트
              precision    recall  f1-score        support
0              0.643748  0.399474  0.493012  222097.000000
1              0.679407  0.852000  0.755978  331750.000000
accuracy       0.670534  0.670534  0.670534       0.670534
macro avg      0.661578  0.625737  0.624495  553847.000000
weighted avg   0.665108  0.670534  0.650527  553847.000000

XGBoost
              precision    recall  f1-score        support
0              0.645292  0.399398  0.493406  222097.000000
1              0.679640  0.853022  0.756524  331750.000000
accuracy       0.67111

In [7]:
from sklearn.metrics import f1_score

f1_lr = f1_score(y_test, lr_pred)
f1_rf = f1_score(y_test, rf_pred)
f1_xgb = f1_score(y_test, xgb_pred)


print(f1_lr, f1_rf, f1_xgb)

0.7548302225856958 0.7559777259500171 0.7565242497313308


In [8]:
import plotly.graph_objects as go

models = ['Logristic Regression', 'DNN', 'Random Forest', 'XGBoost']

roc_aucs = [0.63, 0.71, 0.63, 0.63]
f1_scores = [0.75, 0.87, 0.76, 0.76]

fig = go.Figure(data=[
    go.Bar(name='ROC-AUC', x=models, y=roc_aucs, text=roc_aucs),
    go.Bar(name='F1 Score', x=models, y=f1_scores, text=f1_scores)
])

fig.update_layout(
    barmode='group',
    title=dict(text='모델별 성능 지표 비교',
               x=0.5),
    xaxis_title='Model',
    yaxis_title='Score',
    template='plotly_white'
)

fig.show()

# 가설 변경한 피처 반영

In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from joblib import parallel_backend

# 데이터 불러오기
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')
train = pd.read_csv('order_products__train.csv')
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')

# 유저 정보 병합
train = train.merge(orders[['order_id', 'user_id']], on='order_id', how='left')

# 제품별 재구매율
product_reordered_rate = train.groupby('product_id')['reordered'].mean().reset_index()
product_reordered_rate.columns = ['product_id', 'product_reorder_rate']

# 유저별 총 주문 수 + 전체 reorder 비율
user_orders = orders.groupby('user_id')['order_number'].max().reset_index()
user_orders.columns = ['user_id', 'user_total_orders']

user_reorders = train.groupby('user_id')['reordered'].mean().reset_index()
user_reorders.columns = ['user_id', 'user_reorder_ratio']

# 유저 피처 결합
user_features = pd.merge(user_orders, user_reorders, on='user_id', how='left')

# 제품 + 유저 피처를 train에 merge
train_data = train.merge(product_reordered_rate, on='product_id', how='left')
train_data = train_data.merge(user_features, on='user_id', how='left')

# 제품별 재구매율 변화량 (rolling)
train_data['product_reorder_rate_change'] = train_data.groupby('product_id')['reordered'].transform(
    lambda x: x.rolling(window=10, min_periods=1).mean().diff().fillna(0)
)

# 훈련 셋 구성하기
train_data = train_data[['order_id', 'product_id', 'add_to_cart_order',
                         'product_reorder_rate', 'product_reorder_rate_change',
                         'user_total_orders', 'user_reorder_ratio', 'reordered']]

In [10]:
# 결측 제거 및 불필요 컬럼 제거
original_data = train_data.copy()
original_data.dropna(inplace=True)
original_data.drop(columns=['order_id', 'product_id'], inplace=True)

# 스케일링
scaler = MinMaxScaler()
scale_cols = ['add_to_cart_order', 'user_total_orders']
original_data[scale_cols] = scaler.fit_transform(original_data[scale_cols])

In [11]:
# X, y 분리
X = original_data.drop('reordered', axis=1)
y = original_data['reordered']

# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [12]:
from sklearn.linear_model import LogisticRegression

# 모델(로지스틱 회귀) 호출
lr = LogisticRegression(random_state=42, C=0.1)

# 모델 학습
lr.fit(X_train, y_train)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# 모델(랜덤 포레스트) 호출
rf = RandomForestClassifier(random_state=42, n_estimators=300, max_depth=10, min_samples_split=2)

# 모델 학습
rf.fit(X_train, y_train)

In [14]:
from xgboost import XGBClassifier

# 모델(XGBoost) 호출
xgb = XGBClassifier(random_state=42, learning_rate=0.01, n_estimators=500, max_depth=5, subsample=0.8)

# 모델 학습
xgb.fit(X_train, y_train)

In [15]:
# 모델 예측
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
xgb_pred = xgb.predict(X_test)

report_lr = classification_report(y_test, lr_pred, output_dict=True)
report_rf = classification_report(y_test, rf_pred, output_dict=True)
report_xgb = classification_report(y_test, xgb_pred, output_dict=True)

# ROC-AUC
roc_auc_lr = roc_auc_score(y_test, lr_pred)
roc_auc_rf = roc_auc_score(y_test, rf_pred)
roc_auc_xgb = roc_auc_score(y_test, xgb_pred)

print(f'로지스틱 회귀 ROC-AUC: {roc_auc_lr:.2f}')
print(f'랜덤 포레스트 ROC-AUC: {roc_auc_rf:.2f}')
print(f'XGBoost ROC-AUC: {roc_auc_xgb:.2f}')

print()
print('로지스틱 회귀')
print(pd.DataFrame(report_lr).T)
print()
print('랜덤 포레스트')
print(pd.DataFrame(report_rf).T)
print()
print('XGBoost')
print(pd.DataFrame(report_xgb).T)

로지스틱 회귀 ROC-AUC: 0.91
랜덤 포레스트 ROC-AUC: 0.91
XGBoost ROC-AUC: 0.91

로지스틱 회귀
              precision    recall  f1-score        support
0              0.922459  0.876186  0.898728  111159.000000
1              0.919675  0.950611  0.934887  165765.000000
accuracy       0.920736  0.920736  0.920736       0.920736
macro avg      0.921067  0.913399  0.916807  276924.000000
weighted avg   0.920792  0.920736  0.920372  276924.000000

랜덤 포레스트
              precision    recall  f1-score        support
0              0.929450  0.870393  0.898952  111159.000000
1              0.916640  0.955696  0.935761  165765.000000
accuracy       0.921455  0.921455  0.921455       0.921455
macro avg      0.923045  0.913045  0.917357  276924.000000
weighted avg   0.921782  0.921455  0.920986  276924.000000

XGBoost
              precision    recall  f1-score        support
0              0.927919  0.871355  0.898748  111159.000000
1              0.917121  0.954610  0.935490  165765.000000
accuracy       0.92119

In [16]:
from sklearn.metrics import f1_score

f1_lr = f1_score(y_test, lr_pred)
f1_rf = f1_score(y_test, rf_pred)
f1_xgb = f1_score(y_test, xgb_pred)


print(f1_lr, f1_rf, f1_xgb)

0.9348869495054968 0.9357606329723296 0.9354903548858134


In [17]:
import plotly.graph_objects as go

models = ['Logristic Regression', 'DNN', 'Random Forest', 'XGBoost']

roc_aucs = [0.91, 0.98, 0.91, 0.91]
f1_scores = [0.93, 0.93, 0.94, 0.94]

fig = go.Figure(data=[
    go.Bar(name='ROC-AUC', x=models, y=roc_aucs, text=roc_aucs),
    go.Bar(name='F1 Score', x=models, y=f1_scores, text=f1_scores)
])

fig.update_layout(
    barmode='group',
    title=dict(text='모델별 성능 지표 비교',
               x=0.5),
    xaxis_title='Model',
    yaxis_title='Score',
    template='plotly_white'
)

fig.show()