In [133]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Linear, Logistic
from sklearn.linear_model import LinearRegression, LogisticRegression

# knn
from knn import KNeighborsClassifier, KNeighborsRegressor

# decision
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# svm
from sklearn.svm import SVC, SVR

# bagging
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# boosting
from xgboost import XGBClassifier, XGBRegressor

# 회귀 모델링

In [134]:
data_path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/Carseats.csv'
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [135]:
data['Diff_Price'] = data['CompPrice'] - data['Price']
data.drop('CompPrice', axis = 1, inplace = True)

In [136]:
target = 'Sales'
x = data.drop(target, axis=1)
y = data[target]

In [137]:
dumy_col = ['ShelveLoc', 'Urban', 'US', 'Education']
x = pd.get_dummies(x, columns=dumy_col, drop_first=True)

In [138]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3)

In [139]:
# scaling
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.fit_transform(x_val)
y_val

151    10.77
271     4.55
10      9.01
68     13.39
186     8.68
       ...  
287     6.88
121    11.67
80      8.01
194     7.23
285     7.60
Name: Sales, Length: 120, dtype: float64

In [140]:
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant

def forward_stepwise_linear(x_train, y_train):

    # 변수목록, 선택된 변수 목록, 단계별 모델과 AIC 저장소 정의
    features = list(x_train)
    selected = []
    step_df = pd.DataFrame({ 'step':[], 'feature':[],'aic':[]})

    # 
    for s in range(0, len(features)) :
        result =  { 'step':[], 'feature':[],'aic':[]}

        # 변수 목록에서 변수 한개씩 뽑아서 모델에 추가
        for f in features :
            vars = selected + [f]
            x_tr = x_train[vars]
            model = OLS(y_train, add_constant(x_tr)).fit()
            result['step'].append(s+1)
            result['feature'].append(vars)
            result['aic'].append(model.aic)
        
        # 모델별 aic 집계
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop = True)

        # 만약 이전 aic보다 새로운 aic 가 크다면 멈추기
        if step_df['aic'].min() < temp['aic'].min() :
            break
        step_df = pd.concat([step_df, temp], axis = 0).reset_index(drop = True)

        # 선택된 변수 제거
        v = temp.loc[0,'feature'][s]
        features.remove(v)

        selected.append(v)
    
    # 선택된 변수와 step_df 결과 반환
    return selected, step_df

In [141]:
import statsmodels.api as sm

def forward_stepwise_logistic(x_train, y_train):

    # 변수목록, 선택된 변수 목록, 단계별 모델과 AIC 저장소 정의
    features = list(x_train) # 컬럼명
    selected = []
    step_df = pd.DataFrame({ 'step':[], 'feature':[],'aic':[]})

    for s in range(0, len(features)) :
        result =  { 'step':[], 'feature':[],'aic':[]}

        # 변수 목록에서 변수 한개씩 뽑아서 모델에 추가
        for f in features :
            vars = selected + [f]
            x_tr = x_train[vars]
            model = sm.Logit(y_train, x_tr).fit()
            result['step'].append(s+1)
            result['feature'].append(vars)
            result['aic'].append(model.aic)
        
        # 모델별 aic 집계
        temp = pd.DataFrame(result).sort_values('aic').reset_index(drop = True)

        # 만약 이전 aic보다 새로운 aic 가 크다면 멈추기
        if step_df['aic'].min() < temp['aic'].min() :
            break
        step_df = pd.concat([step_df, temp], axis = 0).reset_index(drop = True)

        # 선택된 변수 제거
        v = temp.loc[0,'feature'][s]
        features.remove(v)

        selected.append(v)
    
    # 선택된 변수와 step_df 결과 반환
    return selected, step_df

In [142]:
features, result = forward_stepwise_linear(x_train, y_train)
features

['Diff_Price',
 'ShelveLoc_Good',
 'Advertising',
 'ShelveLoc_Medium',
 'Age',
 'Income',
 'Urban_Yes',
 'Education_15']

In [143]:
x_train_f = x_train[features]
x_val_f = x_val[features]

In [144]:
k = int(y_val.count() ** 0.5)

params = {
    'model_line' : {
    }
    , 'model_knnc' : {
        'n_neighbors' : range(1, k+6)
        , 'metric': ['euclidean', 'manhattan']
    }
    , 'model_dec' : {
        'max_depth' : range(1,11)
        , 'min_samples_leaf': [20,30,50,70,100]
    }
    , 'model_svm' : {
        'C': range(1,11)
        , 'gamma' : np.linspace(0.01,1,20)
    }
    , 'model_ran' : {
        'max_features' : range(1, x_train.shape[1] + 1)
        , 'n_estimators' : range(10,151,10)
    }
    , 'model_xgb' : {
        'max_depth' : [3, 5, 8]
        , 'n_estimators':[50,100,150]
        , 'learning_rate':np.linspace(0.01,1,20)
    }
}

In [145]:
# model
model_line = LinearRegression()
model_knnc = KNeighborsRegressor()
model_dec = DecisionTreeRegressor()
model_svm = SVR()
model_ran = RandomForestRegressor()
model_xgb = XGBRegressor()
model = {'model_line':model_line, 'model_knnc':model_knnc, 'model_dec':model_dec, 'model_svm':model_svm, 'model_ran':model_ran, 'model_xgb':model_xgb}
# model = [
#     ['model_line', model_line]
#     , ['model_knnc', model_knnc]
#     , ['model_dec', model_dec]
#     , ['model_svm', model_svm]
#     , ['model_ran', model_ran]
#     , ['model_xgb', model_xgb]
#     ]

In [146]:
name = []
pred = []
rscore = []
mape = []
mae = []
rmse = []

for n in model:
    grid = GridSearchCV(model[n], params[n], cv=5)
    
    if n == 'model_knnc':
        grid.fit(x_train_s, y_train)
        pred_ = grid.predict(x_val_s)
    elif n == 'model_line':
        grid.fit(x_train_f, y_train)
        pred_ = grid.predict(x_val_f)
    else:
        grid.fit(x_train, y_train)
        pred_ = grid.predict(x_val)

    rscore_ = r2_score(y_val, pred_)
    mape_ = mean_absolute_percentage_error(y_val, pred_)
    mae_ = mean_absolute_error(y_val, pred_)
    rmse_ = mean_squared_error(y_val, pred_, squared=False)

    name.append(n)
    pred.append(pred_)
    rscore.append(rscore_)
    mape.append(mape_)
    mae.append(mae_)
    rmse.append(rmse_)
    


In [147]:
for i in range(len(model)):
    print(name[i])
    print(f'rscore : {rscore[i]}')
    print(f'mape : {mape[i]}')
    print(f'mae : {mae[i]}')
    print(f'rmse : {rmse[i]}')
    print('*' * 30)

model_line
rscore : 0.8148282927834269
mape : 0.1446576565895925
mae : 0.8911407609564745
rmse : 1.126266934590974
******************************
model_knnc
rscore : 0.3914328212340257
mape : 0.3402760492454009
mae : 1.6403863636363636
rmse : 2.0417768807042833
******************************
model_dec
rscore : 0.5017362893823694
mape : 0.27651714805115446
mae : 1.4951372348354548
rmse : 1.8474965997053634
******************************
model_svm
rscore : -0.019831908270699916
mape : 0.4431765641162025
mae : 2.115487068556428
rmse : 2.6431287384958866
******************************
model_ran
rscore : 0.6969901358856981
mape : 0.2236155149709766
mae : 1.1788833333333328
rmse : 1.4407293452610368
******************************
model_xgb
rscore : 0.7405089476252938
mape : 0.18746314886717225
mae : 1.0822550286451975
rmse : 1.333261113491894
******************************


# 분류모델링

In [148]:
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/credit_all.csv'
data = pd.read_csv(path)
data.loc[data['Payment'] == 4, 'Payment'] = 3
data.head()

Unnamed: 0,Creditability,AccountBalance,Duration,Payment,Purpose,CreditAmount,Employment,SexMarital,CurrentAddress,MostValuableAsset,Age,Apartment,CreditCount,Occupation,Dependents,Telephone,ForeignWorker
0,1,3,24,2,0,1249,2,4,2,1,28,2,1,3,1,1,1
1,1,2,9,2,0,276,3,4,4,1,22,1,1,2,1,1,1
2,1,1,18,3,2,1049,2,2,4,2,21,1,1,3,1,1,1
3,1,1,24,3,1,6419,5,2,4,4,44,3,2,4,2,2,1
4,1,3,12,2,2,1424,5,2,4,1,55,2,1,4,1,2,1


In [149]:
target = 'Creditability'
x = data.drop(target, axis = 1)
y = data.loc[:, target]

In [150]:
dummy_vars = ['Employment', 'CurrentAddress', 'CreditCount', 'Dependents', 'Telephone', 'AccountBalance', 'Payment', 'Purpose', 'SexMarital', 'MostValuableAsset', 'Apartment','Occupation','ForeignWorker']
x = pd.get_dummies(x, columns = dummy_vars, drop_first = True)
x.head()

Unnamed: 0,Duration,CreditAmount,Age,Employment_2,Employment_3,Employment_4,Employment_5,CurrentAddress_2,CurrentAddress_3,CurrentAddress_4,...,SexMarital_4,MostValuableAsset_2,MostValuableAsset_3,MostValuableAsset_4,Apartment_2,Apartment_3,Occupation_2,Occupation_3,Occupation_4,ForeignWorker_2
0,24,1249,28,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,0,0
1,9,276,22,0,1,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,18,1049,21,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,24,6419,44,0,0,0,1,0,0,1,...,0,0,0,1,0,1,0,0,1,0
4,12,1424,55,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,1,0


In [151]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = .3)

In [152]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)