In [1]:
import pandas as pd
pd.set_option('display.max_row', 500)
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt  # 그래프 그리는 라이브러리
from sklearn.model_selection import train_test_split
from sklearn import metrics  # 평가를 위한 라이브러리
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVR, SVR

import seaborn as sns  # 시각화 라이브러리

In [2]:
train = pd.read_csv('data/data.csv', index_col = 'day_text')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 740 entries, 7.31금 to 10.31토
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   time_text    740 non-null    object 
 1   team_text    740 non-null    object 
 2   location     740 non-null    object 
 3   H_text       740 non-null    int64  
 4   HR_text      740 non-null    int64  
 5   S_text       740 non-null    int64  
 6   ST_text      740 non-null    int64  
 7   DU_text      740 non-null    int64  
 8   ER_text      740 non-null    int64  
 9   result_text  740 non-null    int64  
 10  temp         740 non-null    float64
 11  rain         740 non-null    float64
 12  wind         740 non-null    float64
 13  d_wind       740 non-null    int64  
 14  humidity     740 non-null    int64  
 15  atmosphere   740 non-null    float64
 16  surface      740 non-null    float64
dtypes: float64(5), int64(9), object(3)
memory usage: 104.1+ KB


In [4]:
train.corr() # 상관관계

Unnamed: 0,H_text,HR_text,S_text,ST_text,DU_text,ER_text,result_text,temp,rain,wind,d_wind,humidity,atmosphere,surface
H_text,1.0,0.391909,0.092599,-0.098078,0.086495,-0.012345,0.422553,-0.012543,-0.015214,-0.041693,0.011638,0.040668,0.017977,-0.040077
HR_text,0.391909,1.0,-0.033677,-0.059332,-0.049183,-0.004062,0.269664,0.013775,-0.005094,-0.021086,0.000106,0.030995,0.006119,-0.013063
S_text,0.092599,-0.033677,1.0,0.004025,-0.021023,-0.039375,0.112246,0.015431,0.010188,0.025437,0.040763,0.003682,0.027279,0.013697
ST_text,-0.098078,-0.059332,0.004025,1.0,-0.226409,0.011206,-0.251901,0.068817,-0.09409,-0.025293,0.039729,0.035899,0.107218,0.058981
DU_text,0.086495,-0.049183,-0.021023,-0.226409,1.0,0.026558,-0.030695,-0.007008,-0.050524,0.054661,0.0095,-0.057874,-0.017506,-0.007002
ER_text,-0.012345,-0.004062,-0.039375,0.011206,0.026558,1.0,-0.152449,0.030468,0.049563,0.016587,0.008236,0.001046,-0.027363,0.031891
result_text,0.422553,0.269664,0.112246,-0.251901,-0.030695,-0.152449,1.0,-0.010046,0.000882,0.00408,-0.002168,0.002019,-0.00091,-0.015586
temp,-0.012543,0.013775,0.015431,0.068817,-0.007008,0.030468,-0.010046,1.0,0.059063,0.273238,0.161045,0.232028,0.227804,0.885897
rain,-0.015214,-0.005094,0.010188,-0.09409,-0.050524,0.049563,0.000882,0.059063,1.0,0.02142,-0.006092,0.260888,-2.5e-05,0.044553
wind,-0.041693,-0.021086,0.025437,-0.025293,0.054661,0.016587,0.00408,0.273238,0.02142,1.0,0.087908,-0.21815,0.11132,0.211262


# wind 랑 안타 관계 시각화

In [5]:
# bins = [0,2,4,6,8]  # 구간 만들기
# labels = ['0~2','2~4','4~6', '6~8']
# train['wind'] = pd.cut(train['wind'], bins = bins, labels = labels)
# test['wind'] = pd.cut(test['wind'], bins = bins, labels = labels)

In [6]:
# sns.countplot(data = train, x = 'wind', hue='H_text')

In [7]:
# sns.countplot(data = train, x = 'H_text', hue='wind')

## 장소 원핫 인코딩

In [8]:
train.drop('time_text', axis = 1 , inplace = True)

In [9]:
cat_feature = ['location','team_text']

In [10]:
for i in cat_feature:
    dummy = pd.get_dummies(train[i], prefix = i)
    train = pd.concat([train, dummy], axis = 1)
    train.drop(i, axis=1, inplace = True)
    

In [11]:
y_train = train['H_text']
X_train = train.drop('H_text', axis = 1)


X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(555, 31) (185, 31) (555,) (185,)


In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 555 entries, 9.6일 to 10.22목
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   HR_text             555 non-null    int64  
 1   S_text              555 non-null    int64  
 2   ST_text             555 non-null    int64  
 3   DU_text             555 non-null    int64  
 4   ER_text             555 non-null    int64  
 5   result_text         555 non-null    int64  
 6   temp                555 non-null    float64
 7   rain                555 non-null    float64
 8   wind                555 non-null    float64
 9   d_wind              555 non-null    int64  
 10  humidity            555 non-null    int64  
 11  atmosphere          555 non-null    float64
 12  surface             555 non-null    float64
 13  location_광주         555 non-null    uint8  
 14  location_대구         555 non-null    uint8  
 15  location_대전         555 non-null    uint8  
 16  locatio

In [13]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 185 entries, 8.25화 to 10.3토
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   HR_text             185 non-null    int64  
 1   S_text              185 non-null    int64  
 2   ST_text             185 non-null    int64  
 3   DU_text             185 non-null    int64  
 4   ER_text             185 non-null    int64  
 5   result_text         185 non-null    int64  
 6   temp                185 non-null    float64
 7   rain                185 non-null    float64
 8   wind                185 non-null    float64
 9   d_wind              185 non-null    int64  
 10  humidity            185 non-null    int64  
 11  atmosphere          185 non-null    float64
 12  surface             185 non-null    float64
 13  location_광주         185 non-null    uint8  
 14  location_대구         185 non-null    uint8  
 15  location_대전         185 non-null    uint8  
 16  locatio

In [14]:
set(X_train.columns) - set(X_test.columns)

set()

# KNN

In [15]:
knn_model = KNeighborsClassifier(n_neighbors = 3)
result = cross_val_score(knn_model,
                        X_train,
                        y_train,
                        cv = 5)
result



array([0.07207207, 0.04504505, 0.07207207, 0.09009009, 0.06306306])

In [16]:
result.mean()

0.06846846846846848

## KNN Scaler

In [17]:
scaler = StandardScaler()

In [18]:
scaler.fit(X_train)  # 평균과 표준편차로 scaling 한 값을 컴퓨터가 기억

StandardScaler()

In [19]:
transform_X_train = scaler.transform(X_train)
transform_X_train

array([[-0.86246882, -0.69124603, -1.0432453 , ..., -0.30417049,
        -0.35139975, -0.32830539],
       [ 0.10454167, -0.69124603,  0.3934821 , ..., -0.30417049,
        -0.35139975, -0.32830539],
       [-0.86246882,  0.41434921, -1.0432453 , ..., -0.30417049,
        -0.35139975,  3.04594448],
       ...,
       [ 0.10454167, -0.69124603,  0.03430025, ..., -0.30417049,
        -0.35139975, -0.32830539],
       [-0.86246882,  0.41434921,  0.3934821 , ..., -0.30417049,
        -0.35139975, -0.32830539],
       [-0.86246882,  0.41434921, -0.3248816 , ..., -0.30417049,
        -0.35139975, -0.32830539]])

In [20]:
transform_X_test = scaler.transform(X_test)
transform_X_test

array([[-0.86246882,  1.51994444, -0.3248816 , ..., -0.30417049,
         2.84576188, -0.32830539],
       [ 0.10454167,  0.41434921,  0.75266394, ..., -0.30417049,
        -0.35139975, -0.32830539],
       [-0.86246882, -0.69124603, -0.68406345, ..., -0.30417049,
        -0.35139975, -0.32830539],
       ...,
       [-0.86246882,  0.41434921,  1.47102764, ..., -0.30417049,
        -0.35139975, -0.32830539],
       [ 0.10454167, -0.69124603,  0.03430025, ..., -0.30417049,
        -0.35139975, -0.32830539],
       [ 1.07155217, -0.69124603, -1.76160899, ..., -0.30417049,
        -0.35139975, -0.32830539]])

In [21]:
result = cross_val_score(knn_model,
                        transform_X_train,
                        y_train,
                        cv = 5)
result



array([0.10810811, 0.11711712, 0.10810811, 0.09009009, 0.11711712])

In [22]:
result.mean()

0.10810810810810811

In [23]:
y_pred = knn_model.fit(transform_X_train, y_train).predict(transform_X_test)

In [24]:
print("총 승패 갯수 %d 에서 틀린 예측 갯수 : %d" % (transform_X_test.shape[0], (y_test != y_pred).sum()))

총 승패 갯수 185 에서 틀린 예측 갯수 : 161


In [25]:
print("예측 정확도: {:.2f}".format(np.mean(y_pred == y_test)))

예측 정확도: 0.13


### SVM Model 회귀분석

In [26]:
svm_model = LinearSVR(C=300)

In [27]:
y_pred = svm_model.fit(X_train, y_train).predict(X_test)



In [28]:
print("총 승패 갯수 %d 에서 틀린 예측 갯수 : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

총 승패 갯수 185 에서 틀린 예측 갯수 : 185


In [29]:
print("예측 정확도: {:.2f}".format(np.mean(y_pred == y_test)))

예측 정확도: 0.00


In [30]:
print("스코어 :{:.2f}".format(svm_model.score(X_test,y_test)))

스코어 :-0.01


In [31]:
from sklearn.metrics import mean_squared_error
y_pred = svm_model.predict(X_test)
print(mean_squared_error(y_test,y_pred))
np.sqrt(mean_squared_error(y_test,y_pred))

14.507495700292855


3.8088706594334303

In [45]:
print("테스트 세트 예측: {}".format(svm_model.predict(X_test)))

테스트 세트 예측: [ 8.05250357 11.8432526   5.30607518  4.31181785  8.89557094  9.58867757
  6.63932289 13.39384037  9.26785712  8.64742152 10.19456352  5.58550694
  6.64407845 10.64784737  8.39462017 10.16547992  5.93098644 12.85028597
  7.24964707  8.10520969 11.87656946  6.59447914  9.90466101 10.33775518
 10.29737497  2.88438309 13.40641838 10.74338696 11.61906626  8.39613138
 10.598677    7.15738922  8.44780053  7.3292367   9.97687048  5.25986279
  8.91153239 13.71786185  5.82305015 11.8478284   9.82638447  6.2354922
 15.89890943  7.39826767  6.56574981 11.7750027  12.27120684  7.58145231
 10.39616164  9.83061517  6.36385323  9.27225069  6.49377513  8.80043622
  8.48019012  7.74233037  5.17469017  6.25890615 11.53330677  9.15760281
  9.83180345  7.55943197 11.04832215 10.35905296  9.15282418  5.88298577
  9.45064065 10.01058863  7.17079099 13.6141551   6.74669662 10.76500684
  8.90088389 11.87695534  6.74505077  6.90913721  4.95227915  5.07031895
  7.26643584  7.22246253  8.21983562 10.3

In [46]:
print("테스트 세트 정확도: {:.2f}".format(svm_model.score(X_test, y_test)))

테스트 세트 정확도: -0.01


### Naive Bayes 분류 모델

In [32]:
from sklearn.naive_bayes import GaussianNB

In [33]:
nb = GaussianNB()

In [34]:
y_pred = nb.fit(X_train, y_train).predict(X_test)

In [35]:
print("총 승패 갯수 %d 에서 틀린 예측 갯수 : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

총 승패 갯수 185 에서 틀린 예측 갯수 : 182


In [36]:
print("예측 정확도: {:.2f}".format(np.mean(y_pred == y_test)))

예측 정확도: 0.02


### Decision Tree 모델 - 랜덤 포레스트, 그래디언트 부스팅, 배깅, 엑스트라 트리, 에이다 부스트, 앙상블

In [37]:
tree_model = DecisionTreeClassifier(max_depth = 7) # depth

In [38]:
result = cross_val_score(tree_model,
                        X_train,
                        y_train,
                        cv = 10)
result



array([0.125     , 0.17857143, 0.08928571, 0.01785714, 0.125     ,
       0.07272727, 0.12727273, 0.10909091, 0.09090909, 0.07272727])

In [39]:
result.mean()

0.10084415584415585

In [40]:
y_pred = tree_model.fit(X_train, y_train).predict(X_test)

In [41]:
print("총 승패 갯수 %d 에서 틀린 예측 갯수 : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

총 승패 갯수 185 에서 틀린 예측 갯수 : 163


In [42]:
print("예측 정확도: {:.2f}".format(np.mean(y_pred == y_test)))

예측 정확도: 0.12


#### 시각화

In [43]:
# !pip install graphviz
import graphviz

In [44]:
from sklearn.tree import export_graphviz
export_graphviz(tree_model, out_file="tree.dot", class_names=["안타","안타"],
               feature_names=X_train.wind, impurity=False, filled=True)

ValueError: Length of feature_names, 555 does not match number of features, 31