# 샘플링과 모델링 연습

## 0.환경준비

### 01.라이브러리 로딩

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import pandas as pd
import numpy as np
from datetime import date

### 02.데이터 로딩


In [None]:
from Accuinsight.Lifecycle.tensorflow import accuinsight
# Accuinsight helper object; later cells call accu.get_file(...) and accu.autolog(...) on it.
accu = accuinsight()

#### 1) from HDFS
* hdfs_host, hdfs_file_path만 변경한 후 실행

In [None]:
# HDFS connection settings written into the JSON descriptor in the next cell.
# Host address of the HDFS cluster — adjust for your environment.
hdfs_host = '10.31.200.106'
# Path of the pre-processed CSV file on HDFS — adjust for your environment.
hdfs_file_path = '/user/hadoop/LabTest/pre_processed_data/part-00000-d73dd82a-42ae-4608-ac6a-4b36b671909b-c000.csv'

In [None]:
import json
import os
from collections import OrderedDict

# Descriptor of the HDFS source: where the file lives and which column is the
# target. Key order is kept as listed (host, port, filePath, target).
a = OrderedDict(
    [
        ('host', hdfs_host),
        ('port', '8020'),
        ('filePath', hdfs_file_path),
        ('target', 'isFraud'),
    ]
)

# The descriptor is saved next to the notebook, in the current working directory.
json_file_name = 'hdfs_FD_info.json'
storage_info_json_path = os.path.join(os.getcwd(), json_file_name)

# Serialize with tab indentation and write it out as UTF-8 text.
with open(storage_info_json_path, 'w', encoding='utf-8') as save_file:
    save_file.write(json.dumps(a, indent='\t'))

In [None]:
# Pass the JSON descriptor to Accuinsight (presumably it loads the data set the descriptor points to — confirm).
# NOTE(review): this hard-coded path only matches the file written by the previous cell when the
# notebook's working directory is /home/work; consider passing storage_info_json_path instead.
accu.get_file('/home/work/hdfs_FD_info.json')

#### 2) from S3
* aws_access_key_id, aws_secret_access_key 확인 필요

In [None]:
# boto3 is the package used to connect to S3 and download the data
!pip3 install boto3

In [None]:
import os

import boto3

# Credentials are read from the environment instead of being hard-coded in the
# notebook. When a variable is unset, None is passed and boto3 falls back to its
# default credential chain (shared credentials file, instance/role credentials, ...).
# SECURITY: the key pair that used to be embedded in this cell was committed in
# plain text and must be treated as leaked — rotate it in AWS IAM.
accessKey = os.environ.get('AWS_ACCESS_KEY_ID')
secretKey = os.environ.get('AWS_SECRET_ACCESS_KEY')

bucket_name = 'accu-trial'  # S3 bucket name
file_name = 'data.csv'  # local path + file name the object is saved as
key = 'accu-trial03/LabTest/pre_processed_data/part-00000-1b58514c-e0ae-43c9-b9d1-1a8a0049a0cb-c000.csv'  # object key (path + file name) to download

# A single session supplies both the low-level client and the resource handle,
# so the credentials are configured in exactly one place.
session = boto3.Session(aws_access_key_id=accessKey, aws_secret_access_key=secretKey)
client = session.client('s3')
s3 = session.resource('s3')

# Download the object into the current working directory as file_name.
client.download_file(bucket_name, key, file_name)

#### 3) 워크스페이스로부터 데이터 불러오기

In [None]:
# Load the CSV from the workspace into a DataFrame.
# NOTE(review): the S3 cell saves the download as 'data.csv' relative to the working directory;
# this absolute path matches it only when the working directory is /home/work — confirm.
path = '/home/work/data.csv'
data = pd.read_csv(path)

In [None]:
# Show how many rows carry each isFraud value in the full data set.
data['isFraud'].value_counts()

In [None]:
# Work on a smaller random subset of the data.
# min() guards against `data` having fewer than 200000 rows (DataFrame.sample
# raises ValueError when n exceeds the row count without replacement), and the
# fixed random_state makes the subset reproducible across runs.
data2 = data.sample(n=min(200000, len(data)), random_state=42)
# Class counts of the target within the subset.
data2['isFraud'].value_counts()

## 1.데이터 탐색

In [None]:
# (rows, columns) of the sampled data.
data2.shape

## 2.전처리 
이미 기본 전처리가 되어 있습니다.  
스케일링만 수행합니다.

In [None]:
# 전처리를 위한 함수 불러오기
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

### 2.1 데이터 정리

In [None]:
# Columns that are removed from the sampled frame before modeling.
col_drop = ['type', 'isType1', 'isType2', 'isType3', 'isType4']
# Drop them in place so data2 keeps referring to the same object.
data2.drop(columns=col_drop, inplace=True)

### 2.2 데이터 분할


In [None]:
# Separate the label column (y) from the feature columns (x).
target = 'isFraud'
y = data2[target]
x = data2.drop(columns=[target])

In [None]:
# Split into train / validation / test: 20% is held out for test, then 20% of
# the remainder for validation.
# stratify keeps the isFraud class ratio identical in every subset (an
# unstratified split of a rare label can leave a subset with very few
# positives), and random_state makes the split reproducible.
train_val_x, test_x, train_val_y, test_y = train_test_split(
    x, y, test_size=.2, stratify=y, random_state=42)
train_x, val_x, train_y, val_y = train_test_split(
    train_val_x, train_val_y, test_size=.2, stratify=train_val_y, random_state=42)
# Convert the label Series to plain NumPy arrays.
train_y, val_y, test_y = train_y.values, val_y.values, test_y.values

In [None]:
# Shapes of the training features and training labels.
train_x.shape, train_y.shape

### 2.3 스케일링

In [None]:
# Learn the min/max of every feature from the training split only, then apply
# that same scaling to the training, validation and test features.
scaler = MinMaxScaler().fit(train_x)
train_x, val_x, test_x = (
    scaler.transform(features) for features in (train_x, val_x, test_x)
)


## 3.모델링

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

* 있는 데이터 그대로 모델링을 수행해 봅시다.

In [None]:
# Store the number of feature columns in x.
# NOTE(review): this name shadows the built-in input(); build_model() reads it,
# so a rename has to be done in both places together.
input = train_x.shape[1]

In [None]:
def build_model(input_dim=None, hidden1=None, hidden2=None, lr=None):
    """Build and compile a binary classifier with two hidden layers.

    The network is Dense(relu) -> Dense(relu) -> Dense(1, sigmoid), compiled
    with binary cross-entropy loss, the Adam optimizer and accuracy as metric.

    Every argument is optional. An argument left as ``None`` falls back to the
    notebook-level variable the original zero-argument version read at call
    time, so ``build_model()`` behaves exactly as before.

    Args:
        input_dim: Number of input features. Defaults to the global ``input``.
        hidden1: Units in the first hidden layer. Defaults to ``h1_nodes``.
        hidden2: Units in the second hidden layer. Defaults to ``h2_nodes``.
        lr: Adam learning rate. Defaults to ``learning_rate``.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    # Resolve defaults from the notebook globals only when not given explicitly.
    input_dim = input if input_dim is None else input_dim
    hidden1 = h1_nodes if hidden1 is None else hidden1
    hidden2 = h2_nodes if hidden2 is None else hidden2
    lr = learning_rate if lr is None else lr

    model = keras.Sequential([
        layers.Dense(hidden1, activation='relu', input_shape=(input_dim,)),
        layers.Dense(hidden2, activation='relu'),
        # Single sigmoid unit: the output is one value in (0, 1) per row.
        layers.Dense(1, activation='sigmoid')
    ])

    # The learning rate is passed positionally, as in the original call.
    optimizer = tf.keras.optimizers.Adam(lr)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [None]:
# Must be run inside the Accuinsight environment.
accu.autolog('fraud-detect', best_weights = True, model_monitor = False)  # NOTE(review): the original comment said "using model-monitor" but model_monitor is False — confirm intent

In [None]:
# Hyperparameters: the first three are read by build_model(), epochs by model.fit().
learning_rate = 0.01  # learning rate passed to the Adam optimizer
h1_nodes = 8  # units in the first hidden layer
h2_nodes = 8  # units in the second hidden layer
epochs = 10  # number of training epochs

In [None]:
# Training: build the model and fit it on the training split, evaluating on the
# validation split during training.
model = build_model()
model.fit(train_x, train_y, epochs=epochs, validation_data=(val_x, val_y))

In [None]:
# 3. Predict
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output it thresholded the predicted probability at 0.5, so do that
# explicitly; ravel() flattens the (n, 1) result to match test_y's shape.
test_pred = (model.predict(test_x) > 0.5).astype('int32').ravel()
print(accuracy_score(test_y, test_pred))
print('-----------------------')
print(confusion_matrix(test_y, test_pred))
print('-----------------------')
print(classification_report(test_y, test_pred))

## 4.(Optional)Precision-Recall Curve & ROC

In [None]:
# Raw model outputs for the test set (the final layer is a sigmoid), displayed for inspection.
test_pred2 = model.predict(test_x)
test_pred2

예제 데이터를 가지고 간단히 실습합니다.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

# Small example data set used for the curve demonstrations below.
# Note that this cell rebinds x, y, target, train_x, test_x, train_y, test_y and
# test_pred, replacing the values computed for the fraud model earlier.
path = 'https://raw.githubusercontent.com/DA4BAM/dataset/master/titanic.1.csv'
titanic = pd.read_csv(
    path,
    usecols=['Survived', 'Pclass', 'Embarked', 'Sex', 'Fare', 'Age'],
)

# One-hot encode the categorical columns. A column with exactly two distinct
# values gets a single indicator (first level dropped); any other column gets
# one indicator per level.
col_cat = ['Sex', 'Embarked', 'Pclass']
for col in col_cat:
    is_binary = titanic[col].nunique(dropna=False) == 2
    dummies = pd.get_dummies(titanic[col], prefix=col, drop_first=is_binary)
    # Replace the original column with its indicator columns (appended at the end).
    titanic = pd.concat([titanic.drop(columns=col), dummies], axis=1)

# Features / label, then an 80/20 train-test split.
target = 'Survived'
y = titanic[target]
x = titanic.drop(columns=target)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.2)

# Fit a logistic regression and keep the per-class probabilities for the test set.
model_lr = LogisticRegression().fit(train_x, train_y)
test_pred = model_lr.predict_proba(test_x)

In [None]:
# Draw the precision-recall curve directly from the fitted model.
# plot_precision_recall_curve() was removed in scikit-learn 1.2; use
# PrecisionRecallDisplay.from_estimator where it exists (scikit-learn >= 1.0)
# and fall back to the legacy helper on older installations.
from sklearn.metrics import PrecisionRecallDisplay

if hasattr(PrecisionRecallDisplay, 'from_estimator'):
    PrecisionRecallDisplay.from_estimator(model_lr, train_x, train_y)
else:
    plot_precision_recall_curve(model_lr, train_x, train_y)

그런데 모델과 예측값을 통해서 어떻게 그래프를 그리게 되는 것일까요?

In [None]:
# Plot precision and recall against the decision threshold.
# precision and recall hold one more element than thresholds: their final
# entries (precision 1, recall 0) have no corresponding threshold. Drop the
# LAST element, not the first, so that index i of each array refers to
# thresholds[i].
precision, recall, thresholds = precision_recall_curve(test_y, test_pred[:,1])
plt.plot(thresholds, precision[:-1], label = 'precision')
plt.plot(thresholds, recall[:-1], label= 'recall')
plt.xlabel('threshold')
plt.legend()
plt.show()

In [None]:
# Draw the ROC curve directly from the fitted model.
# plot_roc_curve() was removed in scikit-learn 1.2; use
# RocCurveDisplay.from_estimator where it exists (scikit-learn >= 1.0) and fall
# back to the legacy helper on older installations.
from sklearn.metrics import RocCurveDisplay

if hasattr(RocCurveDisplay, 'from_estimator'):
    RocCurveDisplay.from_estimator(model_lr, train_x, train_y)
else:
    plot_roc_curve(model_lr, train_x, train_y)

In [None]:
# Shapes of the test labels and of the predict_proba output for the test set.
test_y.shape, test_pred.shape

In [None]:
# Plot the false-positive rate and true-positive rate against the threshold.
fpr, tpr, thresholds = roc_curve(test_y, test_pred[:,1])
# thresholds[0] is an artificial sentinel meaning "nothing predicted positive"
# (max score + 1, or inf in newer scikit-learn), so it is left out of the plot
# to keep the x-axis within the real score range. The curves are labelled with
# what they actually are (the legend previously said precision / recall).
plt.plot(thresholds[1:], fpr[1:], label = 'fpr')
plt.plot(thresholds[1:], tpr[1:], label= 'tpr')
plt.xlabel('threshold')
plt.legend()
plt.show()

In [None]:
# Display the true-positive-rate array returned by roc_curve.
tpr