# 지도학습(Classification)

In [1]:
import numpy as np
import pandas as pd

# 모델 라이브러리 선언
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split

# 모델 정확도 라이브러리 선언
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import matplotlib as plt

## 1. 데이터 불러오기

In [2]:
#CSV 파일을 읽어 DataFrame 변수에 저장하기
csData = pd.read_csv("../Python_ST_EX/dataset/customer.csv")
csData.head()

Unnamed: 0,deposit,stock,label
0,1400000000,45000000,normal
1,1450000000,72000000,diamond
2,1500000000,61000000,diamond
3,1370000000,56000000,normal
4,1920000000,48000000,diamond


In [3]:
np.random.seed(1303)

## 2. 데이터 분리

In [4]:
# feature, label 컬럼 설정
featuresColumn = ['deposit', 'stock']
labelColumn = ['label']

# feature 데이터, label 데이터 분리
featuresData = csData[featuresColumn]
labelData = csData[labelColumn]

# train_test_split함수를 활용해 feature / label 데이터 자동 분리 7:3
feature_train, feature_test, label_train, label_test = train_test_split(featuresData, labelData, test_size = 0.3)

## 3. 모델선언 및 학습

In [5]:
# Define Model
model = svm.SVC()
# Learning (Feature & Label)
model.fit(feature_train, label_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## 4. 예측

In [6]:
# 테스트데이터 Feature만 가지고 예측하기
predict = model.predict(feature_test)
predict

array(['normal', 'diamond', 'diamond', ..., 'normal', 'normal', 'diamond'],
      dtype=object)

## 5. 데이터 정리

In [7]:
# 데이터프레임으로 변환
predictData = pd.DataFrame(predict)
# 컬럼명 주입
predictData.columns = ["PREDICT"]
predictData.head()

Unnamed: 0,PREDICT
0,normal
1,diamond
2,diamond
3,diamond
4,diamond


In [8]:
resultData = pd.concat([feature_test.reset_index(drop = True), label_test.reset_index(drop = True), predictData], axis = 1)
resultData.head()

Unnamed: 0,deposit,stock,label,PREDICT
0,1280000000,47000000,normal,normal
1,1270000000,80000000,diamond,diamond
2,1530000000,52000000,diamond,diamond
3,1850000000,35000000,diamond,diamond
4,1550000000,66000000,diamond,diamond


## 6. 결과 검증

In [9]:
# 결과 테스트하기
ac_score = accuracy_score(label_test, predict)
cl_report = classification_report(label_test, predict)

# 결과 리포트하기
print("Accuracy =", ac_score)
print("result =\n", cl_report)

Accuracy = 0.9883333333333333
result =
               precision    recall  f1-score   support

     diamond       0.98      1.00      0.99      3486
      normal       1.00      0.97      0.99      1755
         vip       1.00      0.97      0.99       759

   micro avg       0.99      0.99      0.99      6000
   macro avg       0.99      0.98      0.99      6000
weighted avg       0.99      0.99      0.99      6000



In [10]:
bmiData = pd.read_csv('../Python_ST_EX/dataset/bmi.csv')
bmiData.head()

Unnamed: 0,height,weight,label
0,140,45,normal
1,145,72,fat
2,150,61,fat
3,137,56,fat
4,192,48,thin


In [11]:
# feature, label 컬럼 설정
featuresColumnBmi = ['height', 'weight']
labelColumnBmi = ['label']

# feature 데이터, label 데이터 분리
featuresDataBmi = bmiData[featuresColumnBmi]
labelDataBmi = bmiData[labelColumnBmi]

# train_test_split함수를 활용해 feature / label 데이터 자동 분리 7:3
featureBmi_train, featureBmi_test, labelBmi_train, labelBmi_test = train_test_split(featuresDataBmi, labelDataBmi, test_size = 0.3)

In [12]:
# Define Model
modelBmi = svm.SVC()
# Learning (Feature & Label)
modelBmi.fit(featureBmi_train, labelBmi_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
# 테스트데이터 Feature만 가지고 예측하기
predictBmi = modelBmi.predict(featureBmi_test)
predictBmi

array(['thin', 'thin', 'normal', ..., 'thin', 'fat', 'thin'], dtype=object)

In [14]:
# 데이터프레임으로 변환
predictBmiData = pd.DataFrame(predictBmi)
# 컬럼명 주입
predictBmiData.columns = ["PREDICT"]
predictBmiData.head()

Unnamed: 0,PREDICT
0,thin
1,thin
2,normal
3,fat
4,fat


In [15]:
resultData = pd.concat([featureBmi_test.reset_index(drop = True), labelBmi_test.reset_index(drop = True), predictBmiData], axis = 1)
resultData.head()

Unnamed: 0,height,weight,label,PREDICT
0,175,35,thin,thin
1,173,55,thin,thin
2,188,71,normal,normal
3,156,73,fat,fat
4,129,54,fat,fat


In [165]:
items = ['fat', 'normal', 'thin']
resultDict = {}
for i in items:
    resultDict[i] = {}
    for j in items:
        resultDict[i][j] = 0
compareData = resultData[['label', 'PREDICT']]

for i in range(0, len(compareData)):
    resultDict[compareData['PREDICT'][i]][compareData['label'][i]] += 1
resultBmiDataFrame = pd.DataFrame(resultDict, columns = items)

for eachItem in items:
    eachSum = 0
    correctValue = 0
    for eachLabel in resultDict[eachItem]:
        if (eachItem == eachLabel):
            correctValue = resultDict[eachItem][eachLabel]
        eachSum += resultDict[eachItem][eachLabel]
    resultBmiDataFrame.loc["precision", eachItem] = correctValue / eachSum

for eachItem in items:
    eachSum = 0
    correctValue = 0
    for eachLabel in resultDict[eachItem]:
        if (eachItem == eachLabel):
            correctValue = resultDict[eachLabel][eachItem]
        eachSum += resultDict[eachLabel][eachItem]
    resultBmiDataFrame.loc["recall", eachItem] = correctValue / eachSum

for eachItem in items:
    eachPrecision = resultBmiDataFrame.loc["precision", eachItem]
    eachRecall = resultBmiDataFrame.loc["recall", eachItem]
    resultBmiDataFrame.loc["f1score", eachItem] = 2 * eachPrecision * eachRecall / (eachPrecision + eachRecall)
    
resultBmiDataFrame

Unnamed: 0,fat,normal,thin
fat,2314.0,2.0,0.0
normal,0.0,1743.0,2.0
thin,0.0,1.0,1938.0
precision,1.0,0.998282,0.998969
recall,0.999136,0.998854,0.999484
f1score,0.999568,0.998568,0.999227


In [159]:
a = {
    "a" : ["a1", "a2", "a3", "a4", "a5"],
    "b1" : ["rla", "ehd", "wns", "dlqslek", "agag"]
}
b = {
    "a" : ["a1", "a2", "a3", "a4", "a6"],
    "b2" : ["123132", "2144", "352", "4737", "353368"]
}
ad = pd.DataFrame(a)
bd = pd.DataFrame(b)

In [160]:
ad

Unnamed: 0,a,b1
0,a1,rla
1,a2,ehd
2,a3,wns
3,a4,dlqslek
4,a5,agag


In [161]:
bd

Unnamed: 0,a,b2
0,a1,123132
1,a2,2144
2,a3,352
3,a4,4737
4,a6,353368


In [162]:
pd.merge(ad, bd, how = "left", left_on = "a", right_on = "a")

Unnamed: 0,a,b1,b2
0,a1,rla,123132.0
1,a2,ehd,2144.0
2,a3,wns,352.0
3,a4,dlqslek,4737.0
4,a5,agag,
