In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(train.shape)
train.head(5)

(1108, 22)


Unnamed: 0,id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,NumDealsPurchases,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,target
0,0,1974,Master,Together,46014.0,1,1,21-01-2013,21,10,...,8,7,0,0,0,0,0,0,0,541
1,1,1962,Graduation,Single,76624.0,0,1,24-05-2014,68,1,...,7,1,1,0,0,0,0,0,0,899
2,2,1951,Graduation,Married,75903.0,0,1,08-04-2013,50,2,...,9,3,0,0,0,0,0,0,0,901
3,3,1974,Basic,Married,18393.0,1,0,29-03-2014,2,2,...,3,8,0,0,0,0,0,0,0,50
4,4,1946,PhD,Together,64014.0,2,1,10-06-2014,56,7,...,5,7,0,0,0,1,0,0,0,444


In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108 entries, 0 to 1107
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1108 non-null   int64  
 1   Year_Birth           1108 non-null   int64  
 2   Education            1108 non-null   object 
 3   Marital_Status       1108 non-null   object 
 4   Income               1108 non-null   float64
 5   Kidhome              1108 non-null   int64  
 6   Teenhome             1108 non-null   int64  
 7   Dt_Customer          1108 non-null   object 
 8   Recency              1108 non-null   int64  
 9   NumDealsPurchases    1108 non-null   int64  
 10  NumWebPurchases      1108 non-null   int64  
 11  NumCatalogPurchases  1108 non-null   int64  
 12  NumStorePurchases    1108 non-null   int64  
 13  NumWebVisitsMonth    1108 non-null   int64  
 14  AcceptedCmp3         1108 non-null   int64  
 15  AcceptedCmp4         1108 non-null   i

In [3]:
train_T=train['target']
target=[]
for i in range(len(train_T)):
    if train_T.iloc[i] < 250:
        target.append(1)
    elif train_T.iloc[i] < 1000:
        target.append(2)
    elif train_T.iloc[i] < 1800:
        target.append(3)
    else:
        target.append(4)
        
train["Range"] = target

In [4]:
train['Age'] = 2022 - train['Year_Birth']
test['Age'] = 2022 - test['Year_Birth']

In [5]:
#train set 및 test set에 적용
for data in [train, test]:
    for i in range(data.shape[0]):
        education = data['Education'].iloc[i]
        if education in ["Basic", "Graduation"]:
            data['Education'].iloc[i] = 0
        elif education in ["2n Cycle", "Master"]:
            data['Education'].iloc[i] = 1
        else:
            data['Education'].iloc[i] = 2

In [6]:
#train set 및 test set에 적용
for data in [train, test]:
    for i in range(data.shape[0]):
        marital_status = data['Marital_Status'].iloc[i]
        if marital_status in ["Married", "Together"]:
            data['Marital_Status'].iloc[i] = 0
        else:
            data['Marital_Status'].iloc[i] = 1

In [7]:
corr_matrix = train.corr()
corr_matrix["target"].sort_values(ascending=False)

target                 1.000000
Range                  0.954404
NumCatalogPurchases    0.798065
Income                 0.784084
NumStorePurchases      0.677785
NumWebPurchases        0.546082
AcceptedCmp5           0.458208
AcceptedCmp1           0.361102
AcceptedCmp4           0.256784
Response               0.242760
Age                    0.136035
AcceptedCmp2           0.129995
Recency                0.050873
AcceptedCmp3           0.040736
id                     0.034192
Complain              -0.058704
NumDealsPurchases     -0.072802
Teenhome              -0.109214
Year_Birth            -0.136035
NumWebVisitsMonth     -0.488252
Kidhome               -0.538365
Name: target, dtype: float64

#### 구매횟수 통합

In [8]:
train['TotalPurchases'] = train['NumCatalogPurchases']+train['NumStorePurchases']+train['NumWebPurchases']
# train['RCatalogPurchases'] = train['NumCatalogPurchases'] / train['TotalPurchases']
# train['RStorePurchases'] = train['NumStorePurchases'] / train['TotalPurchases']
# train['RWebPurchases'] = train['NumWebPurchases'] / train['TotalPurchases']

#### 필요없는 열 정리

In [9]:
train = train.drop(["Dt_Customer", "Year_Birth"], axis = 1)
test = test.drop(["Dt_Customer", "Year_Birth"], axis = 1)

### 훈련 데이터 분할 및 모델 학습

In [10]:
data_train_X = train.drop(['id', 'target'], axis = 1) #training 데이터에서 피쳐 추출
data_train_y = train.target #training 데이터에서 소비량 추출

In [11]:
def NMAE(true, pred):
    mae = np.mean(np.abs(true-pred))
    score = mae / np.mean(np.abs(true))
    return score

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPRegressor
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor



skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True) #총 10번의 fold 진행
n = 0 #x번째 fold인지 기록

fold_target_pred = []
fold_score = []

for train_index, valid_index in skf.split(train, train['Range']): #range 기준으로 stratified k fold 진행
    n += 1

    val_pred = []      #validation set pred 결과 저장
    target_pred = []   #test set pred 결과 저장
    
    train_X = np.array(data_train_X.drop("Range", axis = 1)) #분배된 학습을 위해 생성한 Range feature 제거
    train_Y = np.array(data_train_y)
    
    X_train, X_valid = train_X[train_index], train_X[valid_index]
    y_train, y_valid = train_Y[train_index], train_Y[valid_index]
    
    X_test = np.array(test)

    ### Create Model ###
    val_pred_name = ["RandomForestRegressor", "LGBMRegressor", "XGBRegressor"]
    
    ### LGBMRegressor ###
    model = LGBMRegressor(random_state = 42, verbose = 0) #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train)
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장
    
    ### RandomForestRegressor ###
    model = RandomForestRegressor(random_state = 42) #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train) # 모델 학습
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장
    
    ### XGBRegressor ###
    model = XGBRegressor(random_state = 42) #추가적으로 하이퍼파라미터 튜닝 필요
    model.fit(X_train, y_train)
    val_pred.append(model.predict(X_valid))   # validation set pred 결과 저장
    target_pred.append(model.predict(X_test)) # test set pred 결과 저장
    

    ### voting ###
    ### average validation pred ###
    preds = np.array(val_pred[0])
    for i in range(1, len(val_pred)):
        preds += val_pred[i]
    
    preds = preds/len(val_pred)
    
    ### average target pred ###
    target_preds = np.array(target_pred[0])
    for i in range(1, len(target_pred)):
        target_preds += target_pred[i]
    
    target_preds = target_preds/len(target_pred)
    
    fold_target_pred.append(target_preds) # append final target pred
    
    print(f"========== fold {n} ==========")
    for i in range(len(val_pred)):
        print(f"{val_pred_name[i]} model NMAE : { NMAE(y_valid, val_pred[i].astype(int))}")
    print("==============================")
    print(f"Average NMAE { NMAE(y_valid, preds.astype(int))}")
    print("")
    
    fold_score.append(NMAE(y_valid, preds.astype(int)))


total_score = 0
for i in range(len(fold_score)):
    total_score += fold_score[i]
    
total_score = total_score/(len(fold_score))    

print("==============================")
print("Total Average NMAE %0.4f" %(total_score)) #최종 average score 출력

You can set `force_col_wise=true` to remove the overhead.
RandomForestRegressor model NMAE : 0.1952458966657565
GradientBoostingRegressor model NMAE : 0.2133582112193198
XGBRegressor model NMAE : 0.1932131913473499
Average NMAE 0.19477564394284155

You can set `force_col_wise=true` to remove the overhead.
RandomForestRegressor model NMAE : 0.1894903723314196
GradientBoostingRegressor model NMAE : 0.19235327617786946
XGBRegressor model NMAE : 0.203373916791085
Average NMAE 0.18567316720281982

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
RandomForestRegressor model NMAE : 0.13409330973839803
GradientBoostingRegressor model NMAE : 0.15487605799964335
XGBRegressor model NMAE : 0.14969065942357027
Average NMAE 0.13719357449552108

You can set `force_col_wise=true` to remove the overhead.
RandomForestRegressor model NMAE : 0.19132308576353974
GradientBoostingRegressor model NMAE : 0.1701336014940382
XGBRegressor mo

In [13]:
### average target pred ###
final_pred = np.array(fold_target_pred[0])
for i in range(1, len(fold_target_pred)):
    final_pred += fold_target_pred[i]

final_pred = final_pred/len(fold_target_pred)
final_pred = final_pred.astype(int)

In [14]:
submission = pd.read_csv("data/sample_submission.csv")
submission.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [15]:
submission['target'] = final_pred
submission.head()

Unnamed: 0,id,target
0,0,868
1,1,714
2,2,916
3,3,930
4,4,936


In [16]:
submission.to_csv("submission_baseline3.csv",index=False)