In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('max_rows', None)
pd.set_option('max_columns', None)

# 1. 데이터 로드

In [3]:
# 실습 환경은 인코딩 없이 진행 됨
train = pd.read_csv('data/X_train.csv', encoding='euc-kr') 
target = pd.read_csv('data/y_train.csv')
test = pd.read_csv('data/X_test.csv', encoding='euc-kr')
print(train.shape, target.shape, test.shape)
df = train.merge(target, on='cust_id', how='left')
df = pd.concat([df, test])
print(df.shape)

(3500, 10) (3500, 2) (2482, 10)
(3500, 11)


# 2. 전처리 & Feature Engineering

In [None]:
print(df.info())
print(df.describe(include="all") )

In [None]:
print(sum(df['환불금액'].isna())/df.shape[0], df['환불금액'].min())
df['환불금액'] = df['환불금액'].fillna(0)
print(df.isna().sum().sum())

In [8]:
# object - 주구매상품, 주구매지점 
#print(df['주구매상품'].value_counts())
#print(df['주구매지점'].value_counts())

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
df['주구매상품'] = enc.fit_transform(df['주구매상품'])
df['주구매지점'] = enc.fit_transform(df['주구매지점'])

In [None]:
df.corr()[(abs(df.corr()) > 0.7) & (abs(df.corr()) != 1)]

In [10]:
df.drop('최대구매액', axis=1, inplace=True)

In [11]:
df = pd.get_dummies(df, columns=['주구매상품','주구매지점'], drop_first=True)

# 3. 분류 알고리즘 사용

In [12]:
# 데이터 분리
from sklearn.model_selection import train_test_split

In [13]:
train = df[~df.gender.isna()]
test = df[df.gender.isna()]

In [14]:
target = train.gender
train.drop(['cust_id'], axis=1, inplace=True)

train.drop(['gender'], axis=1, inplace=True)
test.drop(['gender'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2450, 70) (1050, 70) (2450,) (1050,)


In [15]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier

In [16]:
#dir(sklearn.ensemble)
#help(sklearn.ensemble.RandomForestClassifier)
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

# 4. 초매개변수 최적화

In [17]:
from sklearn.model_selection import GridSearchCV

In [19]:
parmas = {'n_estimators':[100,200], 'criterion':["gini", "entropy"]}
rfc_gs = GridSearchCV(rfc, param_grid=parmas, scoring='roc_auc', cv=3, refit=True, return_train_score=True)
rfc_gs.fit(X_train, y_train)
print(rfc_gs.best_params_)
rfc = RandomForestClassifier(criterion= 'gini', n_estimators= 200)

{'criterion': 'gini', 'n_estimators': 200}


# 5. 모형 앙상블

In [20]:
vot = VotingClassifier(estimators=[
    ('rfc',rfc), ('gbc',gbc)
], voting='soft')
vot.fit(X_train, y_train)

VotingClassifier(estimators=[('rfc', RandomForestClassifier(n_estimators=200)),
                             ('gbc', GradientBoostingClassifier())],
                 voting='soft')

# 6. 예측

In [22]:
from sklearn.metrics import roc_auc_score

In [23]:
pred = vot.predict(X_test)
print(roc_auc_score(pred, y_test))

0.6102262288445669


# 7. 제출 

In [24]:
pred = vot.predict_proba(test.drop('cust_id', axis=1, inplace=False))
pred.shape

(2482, 2)

In [25]:
pred # 1일 확률 

array([[0.66555393, 0.33444607],
       [0.82725971, 0.17274029],
       [0.79211151, 0.20788849],
       ...,
       [0.33111937, 0.66888063],
       [0.66668592, 0.33331408],
       [0.49444108, 0.50555892]])

In [26]:
pred[:,1]

array([0.33444607, 0.17274029, 0.20788849, ..., 0.66888063, 0.33331408,
       0.50555892])

In [27]:
test['gender'] = pred[:,1]

In [29]:
# 답안 제출 예시
# 수험번호.csv 생성
test[['cust_id', 'gender']].to_csv("0000.csv", index=False)