# 데이터 전처리

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import joblib

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(train.columns.tolist())
train.head()

['id', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response', 'target']


Unnamed: 0,id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,NumDealsPurchases,...,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,target
0,0,1974,Master,Together,46014.0,1,1,21-01-2013,21,10,...,8,7,0,0,0,0,0,0,0,541
1,1,1962,Graduation,Single,76624.0,0,1,24-05-2014,68,1,...,7,1,1,0,0,0,0,0,0,899
2,2,1951,Graduation,Married,75903.0,0,1,08-04-2013,50,2,...,9,3,0,0,0,0,0,0,0,901
3,3,1974,Basic,Married,18393.0,1,0,29-03-2014,2,2,...,3,8,0,0,0,0,0,0,0,50
4,4,1946,PhD,Together,64014.0,2,1,10-06-2014,56,7,...,5,7,0,0,0,1,0,0,0,444


In [3]:
columns = ['Day_Customer','Month_Customer','Year_Customer']

train['Dt_Customer'] = train['Dt_Customer'].apply(lambda x: x.split('-'))
test['Dt_Customer'] = test['Dt_Customer'].apply(lambda x: x.split('-'))

for i, column in enumerate(columns):
    train[column] = train['Dt_Customer'].apply(lambda x: int(x[i]))
    test[column] = test['Dt_Customer'].apply(lambda x: int(x[i]))

train.drop(['Dt_Customer'], axis=1, inplace=True)
test.drop(['Dt_Customer'], axis=1, inplace=True)

### Ordinal Encoding
문자열 feature인 Education과 Martial_Status를 정수형으로 변환합니다.   
Martial_Status의 'Alone', 'YOLO', 'Absurd' 데이터는 앞선 EDA 과정에서 'Single' 범주로 통합하기로 판단했습니다.

In [4]:
single_list = ['Alone','YOLO','Absurd']
train['Marital_Status'] = train['Marital_Status'].apply(lambda x: 'Single' if x in single_list else x)
test['Marital_Status'] = test['Marital_Status'].apply(lambda x: 'Single' if x in single_list else x)

ordinal_features = ['Education','Marital_Status']
for feature in ordinal_features:
    rename_dict = {col: i for i, col in enumerate(train[feature].unique())}
    train[feature].replace(rename_dict, inplace=True)
    test[feature].replace(rename_dict, inplace=True)

In [5]:
train.head()

Unnamed: 0,id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,NumDealsPurchases,NumWebPurchases,...,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response,target,Day_Customer,Month_Customer,Year_Customer
0,0,1974,0,0,46014.0,1,1,21,10,7,...,0,0,0,0,0,0,541,21,1,2013
1,1,1962,1,1,76624.0,0,1,68,1,5,...,0,0,0,0,0,0,899,24,5,2014
2,2,1951,1,2,75903.0,0,1,50,2,6,...,0,0,0,0,0,0,901,8,4,2013
3,3,1974,2,2,18393.0,1,0,2,2,3,...,0,0,0,0,0,0,50,29,3,2014
4,4,1946,3,0,64014.0,2,1,56,7,8,...,0,0,1,0,0,0,444,10,6,2014


범주형 데이터의 종류가 늘어나지 않을거라는 가정 하에 unique 값에 대한 딕셔너리를 사용해 Ordinal Encoding을 진행했습니다.   
만약 확장성을 고려한다면 외부 라이브러리를 활용해서 OrdinalEncoder 객체를 생성할 필요가 있습니다.

### Remove Outliers
이상치로 여겨지는 행들을 제거합니다.

In [6]:
columns = ['Year_Birth','Income']
for column, outlier in zip(columns,[0,2]):
    cutted_data = pd.cut(train[column],bins=3,labels=[0,1,2])
    train = train[cutted_data != outlier]

In [7]:
columns = ['NumDealsPurchases','NumWebPurchases','NumWebVisitsMonth']

for column in columns:
    train = train[train[column] <= 13]

In [8]:
train.shape

(1096, 24)

이상치 제거 후 train 데이터는 기존 1108 행에서 12개 행이 제거되어 1096 행으로 감소했습니다.

### Derived Variables
최대한 다양한 특징을 보이기 위해 파생 변수를 생성했습니다.
- `Years_Before`: 고객이 회사에 등록되지 않은 기간 (연 단위)
- `Age_Range`: 고객 나이 구간 (30대부터 70대 사이, 20대와 80대는 가까운 범위와 병합)
- `Income_Level`: 고객 나이 구간 별 소득 수준
- `Income_Per`: 고객 연간 개인 소득
- `NumAcceptedCmp`: 고객이 캠페인에서 제안을 수락한 횟수
- `Perferred_Purchase`: 고객이 선호하는 구매 방식

In [9]:
for df in [train,test]:
    df['Years_Before'] = df['Year_Customer']-df['Year_Birth']
    df['Age_Range'] = ((2022-df['Year_Birth'])//10).replace({2:3,8:7})

    max_income = df[['Age_Range','Income']].groupby('Age_Range').max()
    df['Income_Level'] = [row['Income']/max_income.loc[row['Age_Range']][0] for _,row in df.iterrows()]
    df['Income_Per'] = df['Income']/(df['Kidhome']+1)

    campains = [f'AcceptedCmp{i}' for i in range(1,6)]+['Response']
    df['NumAcceptedCmp'] = sum([df[campain] for campain in campains])

    purchase_cat = [f'Num{t}Purchases' for t in ['Web','Catalog','Store']]
    purchase_dict = {cat: i for i, cat in enumerate(purchase_cat)}
    df['Perferred_Purchase'] = [purchase_dict[row.index[row.argmax()]] for _,row in df[purchase_cat].iterrows()]

In [10]:
columns = train.drop(['id','target'], axis=1).columns.tolist()
train = train.reindex(columns=['id']+sorted(columns)+['target'])
test = test.reindex(columns=['id']+sorted(columns))
train.to_csv('customer_data/train.csv', index=False)
test.to_csv('customer_data/test.csv', index=False)

In [11]:
train.head()

Unnamed: 0,id,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Age_Range,Complain,Day_Customer,Education,...,NumWebPurchases,NumWebVisitsMonth,Perferred_Purchase,Recency,Response,Teenhome,Year_Birth,Year_Customer,Years_Before,target
0,0,0,0,0,0,0,4,0,21,0,...,7,7,2,21,0,1,1974,2013,39,541
1,1,0,0,1,0,0,6,0,24,1,...,5,1,1,68,0,1,1962,2014,52,899
2,2,0,0,0,0,0,7,0,8,1,...,6,3,2,50,0,1,1951,2013,62,901
3,3,0,0,0,0,0,4,0,29,2,...,3,8,0,2,0,0,1974,2014,40,50
4,4,1,0,0,0,0,7,0,10,3,...,8,7,0,56,0,1,1946,2014,68,444


### Train/Test Split
모델 성능 평가를 위한 train 데이터 분리

In [12]:
y_train = np.array(train[['target']])
x_train = train.drop(['id','target'], axis=1)
x_train, x_test, y_train, y_test = \
model_selection.train_test_split(x_train, y_train, test_size=0.3, random_state=0)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((767, 28), (329, 28), (767, 1), (329, 1))

### Feature Normalization

In [12]:
numerical_transformer = StandardScaler()
numerical_features = ['Day_Customer','Income','Income_Level','Income_Per','Month_Customer',
                        'NumAcceptedCmp','NumCatalogPurchases','NumDealsPurchases','NumStorePurchases',
                        'NumWebPurchases','NumWebVisitsMonth','Recency','Year_Birth','Years_Before']

categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')
categorical_features = ['AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4','AcceptedCmp5',
                        'Age_Range','Complain','Education','Kidhome','Marital_Status',
                        'Perferred_Purchase','Response','Teenhome','Year_Customer']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

pipe = Pipeline(steps=[('preprocessor', preprocessor)])

y_train = np.array(train[['target']])
x_train = pipe.fit_transform(train.drop(['id','target'], axis=1))
test = pipe.transform(test)

# x_train = pipe.transform(x_train)
# x_test = pipe.transform(x_test)

joblib.dump(pipe, 'customer_data/pipe.pkl')

['customer_data/pipe.pkl']

In [14]:
x_train.shape, test.shape, y_train.shape, #y_test.shape

((1096, 55), (1108, 55), (1096, 1))

# Predict

In [16]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, ElasticNet
from xgboost import XGBRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [16]:
models = [
    ('Linear', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('BayesianRidge', BayesianRidge()),
    ('ElasticNet', ElasticNet()),
    ('XGBRegressor', XGBRegressor(seed = 0, n_estimators = 200, max_depth = 6, verbosity=0)),
    ('LGBMRegressor', LGBMRegressor(random_state=0, n_estimators = 400, max_depth = 10)),
    ('LDA', LinearDiscriminantAnalysis())]

In [17]:
def NMAE(true, pred):
    score = np.mean(np.abs(true-pred)/true)
    return score

In [21]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', NMAE(y_test,model.predict(x_test)))

Linear:  0.9540820137258922
Ridge:  0.9633794771648224
Lasso:  6.014354156518352
BayesianRidge:  6.0115354684232605
ElasticNet:  5.886123046157515
XGBRegressor:  5.744259606441433
LGBMRegressor:  5.796361375788739
LDA:  5.858644954440946


In [22]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', mean_squared_error(y_test,model.predict(x_test)))

Linear:  61194.12327659606
Ridge:  61435.75727589369
Lasso:  60013.82394860607
BayesianRidge:  60440.89853346865
ElasticNet:  66167.39399621332
XGBRegressor:  51636.612295067054
LGBMRegressor:  46494.44703582266
LDA:  108963.13373860183


In [25]:
for name, model in models:
    model.fit(x_train, y_train)
    print(f'{name}: ', r2_score(y_test,model.predict(x_test)))

Linear:  0.8318388868677341
Ridge:  0.8311748779708532
Lasso:  0.8350823429088794
BayesianRidge:  0.8339087443060139
ElasticNet:  0.8181723663696981
XGBRegressor:  0.858102874312464
LGBMRegressor:  0.8722335160735384
LDA:  0.7005699096783844


In [17]:
model = LGBMRegressor(random_state=0, n_estimators = 100, max_depth = 8)
model.fit(x_train,y_train)

LGBMRegressor(max_depth=8, random_state=0)

In [19]:
sample = pd.read_csv('data/sample_submission.csv')
sample.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [20]:
pred = model.predict(test)

In [22]:
sample['target'] = pred
sample.head()

Unnamed: 0,id,target
0,0,399.274426
1,1,786.866137
2,2,747.371339
3,3,1090.668171
4,4,1356.801403


In [23]:
sample.to_csv('sample.csv',index=False)

In [20]:
train['target'].describe()

count    1096.000000
mean      621.876825
std       604.363476
min         8.000000
25%        71.000000
50%       418.500000
75%      1074.250000
max      2525.000000
Name: target, dtype: float64