In [1]:
import pandas as pd
# Raise pandas' display limits so wide/long frames are shown without truncation.
# The full option names are spelled out: abbreviated keys (e.g. 'display.max_row')
# only resolve through pandas' pattern matching and stop working as soon as
# another option with a similar name exists.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt  # 그래프 그리는 라이브러리
from sklearn.model_selection import train_test_split
from sklearn import metrics  # 평가를 위한 라이브러리
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import seaborn as sns  # 시각화 라이브러리
#정수 원핫인코딩
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
def _load_split(csv_path):
    """Read one CSV split, using its 'day_text' column as the row index."""
    return pd.read_csv(csv_path, index_col='day_text')


# Load both splits with the same index convention.
train = _load_split('data/train.csv')
test = _load_split('data/test.csv')

###### 결측치 알아보기

In [3]:
# Count missing values per column of the training frame (isna is the same check as isnull).
train.isna().sum()

team_text      0
H_text         0
HR_text        0
S_text         0
ST_text        0
DU_text        0
ER_text        0
result_text    0
location       0
temp           0
rain           0
humidity       0
dtype: int64

In [4]:
# Print the column dtypes and non-null counts of the training frame as loaded.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 4.7 to 7.7
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   team_text    7000 non-null   object 
 1   H_text       7000 non-null   int64  
 2   HR_text      7000 non-null   int64  
 3   S_text       7000 non-null   int64  
 4   ST_text      7000 non-null   int64  
 5   DU_text      7000 non-null   int64  
 6   ER_text      7000 non-null   int64  
 7   result_text  7000 non-null   int64  
 8   location     7000 non-null   object 
 9   temp         7000 non-null   float64
 10  rain         7000 non-null   float64
 11  humidity     7000 non-null   float64
dtypes: float64(3), int64(7), object(2)
memory usage: 710.9+ KB


##### 데이터의 특징과 타겟 분리 (타겟 = result_text) -> Keras 모델이 원하는 형식

##### team_text / location 특징 처리: 숫자로 인코딩하지 않고 두 컬럼을 제거

In [5]:
# Remove the string-valued 'team_text' column from both splits, in place.
# NOTE(review): the column is discarded rather than encoded, so team identity
# is not available to the model.
for frame in (train, test):
    frame.drop(columns='team_text', inplace=True)

In [6]:
# Remove the string-valued 'location' column from both splits, in place.
# NOTE(review): the column is discarded rather than encoded.
for frame in (train, test):
    frame.drop(columns='location', inplace=True)

In [7]:
# Separate the label column from the feature columns for each split.
# The X_* frames are new DataFrames returned by drop(); train/test keep the label.
target_column = 'result_text'

X_train, y_train = train.drop(columns=target_column), train[target_column]
X_test, y_test = test.drop(columns=target_column), test[target_column]

In [11]:
# Re-check the training frame's columns and dtypes after the string columns were dropped.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 4.7 to 7.7
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   H_text       7000 non-null   int64  
 1   HR_text      7000 non-null   int64  
 2   S_text       7000 non-null   int64  
 3   ST_text      7000 non-null   int64  
 4   DU_text      7000 non-null   int64  
 5   ER_text      7000 non-null   int64  
 6   result_text  7000 non-null   int64  
 7   temp         7000 non-null   float64
 8   rain         7000 non-null   float64
 9   humidity     7000 non-null   float64
dtypes: float64(3), int64(7)
memory usage: 601.6+ KB


In [12]:
# Same check for the test frame, so its column dtypes can be compared against train's.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3398 entries, 7.7 to 10.26토
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   H_text       3398 non-null   int64  
 1   HR_text      3398 non-null   int64  
 2   S_text       3398 non-null   int64  
 3   ST_text      3398 non-null   int64  
 4   DU_text      3398 non-null   int64  
 5   ER_text      3398 non-null   int64  
 6   result_text  3398 non-null   int64  
 7   temp         3398 non-null   float64
 8   rain         3398 non-null   float64
 9   humidity     3398 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 292.0+ KB


In [13]:
# Cast every integer column of the training frame to float, one column at a time.
# NOTE(review): X_train / y_train were split off from `train` in an earlier cell,
# so this cast changes `train` only and does not reach the arrays fed to the model.
train_int_columns = (
    'H_text',
    'HR_text',
    'S_text',
    'ST_text',
    'DU_text',
    'ER_text',
    'result_text',
)
for column in train_int_columns:
    train[column] = train[column].astype('float')

In [14]:
# Confirm the cast: every column of the training frame should now report float64.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7000 entries, 4.7 to 7.7
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   H_text       7000 non-null   float64
 1   HR_text      7000 non-null   float64
 2   S_text       7000 non-null   float64
 3   ST_text      7000 non-null   float64
 4   DU_text      7000 non-null   float64
 5   ER_text      7000 non-null   float64
 6   result_text  7000 non-null   float64
 7   temp         7000 non-null   float64
 8   rain         7000 non-null   float64
 9   humidity     7000 non-null   float64
dtypes: float64(10)
memory usage: 601.6+ KB


In [15]:
# Cast every integer column of the test frame to float, one column at a time.
# 'humidity' is included here because it is integer-typed in the test split.
# NOTE(review): X_test / y_test were split off from `test` in an earlier cell,
# so this cast changes `test` only and does not reach the arrays fed to the model.
test_int_columns = (
    'H_text',
    'HR_text',
    'S_text',
    'ST_text',
    'DU_text',
    'ER_text',
    'result_text',
    'humidity',
)
for column in test_int_columns:
    test[column] = test[column].astype('float')

In [16]:
# Confirm the cast: every column of the test frame should now report float64.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3398 entries, 7.7 to 10.26토
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   H_text       3398 non-null   float64
 1   HR_text      3398 non-null   float64
 2   S_text       3398 non-null   float64
 3   ST_text      3398 non-null   float64
 4   DU_text      3398 non-null   float64
 5   ER_text      3398 non-null   float64
 6   result_text  3398 non-null   float64
 7   temp         3398 non-null   float64
 8   rain         3398 non-null   float64
 9   humidity     3398 non-null   float64
dtypes: float64(10)
memory usage: 292.0+ KB


### Keras Model

In [29]:
# Import from tensorflow.keras, matching the Keras imports at the top of the
# notebook, so the notebook does not mix two Keras entry points.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected binary classifier: four ReLU hidden layers narrowing
# 400 -> 200 -> 100 -> 20, then a single sigmoid unit producing a value in (0, 1).
#
# input_shape describes ONE sample and excludes the batch axis. X_train is a
# 2-D table (rows x feature columns), so a sample is a flat vector with one
# entry per feature column; the previous (1, 9) declared an extra axis that the
# data does not have. The width is read from X_train instead of hard-coding 9.
# Only the first layer takes input_shape; later layers infer their input size
# from the layer before them.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(400, activation='relu', input_shape=(n_features,)))
model.add(Dense(200, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [30]:
# Configure training: Adam optimizer, binary cross-entropy loss (single sigmoid
# output), and accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [31]:
# Report accuracy on both splits. evaluate() returns [loss, accuracy] for the
# metrics configured in compile(), so index 1 is the accuracy.
# NOTE(review): in the visible cell order this runs before any model.fit call,
# so the numbers describe the model's initial (untrained) weights.
train_scores = model.evaluate(X_train, y_train)
print("Train Accuracy: %.4f" % train_scores[1])

test_scores = model.evaluate(X_test, y_test)
print("Test Accuracy: %.4f" % test_scores[1])

Train Accuracy: 0.5006
Test Accuracy: 0.4997


In [None]:
# Train for 50 epochs with mini-batches. The previous batch_size=1 performed one
# weight update per row (7000 updates per epoch on this training set), which is
# very slow and prints a progress line for every one of those steps.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)

# Measure accuracy again now that the model has actually been trained.
# verbose=0 hides the per-batch progress bar so only the two summary lines print.
print("Train Accuracy: %.4f" % model.evaluate(X_train, y_train, verbose=0)[1])
print("Test Accuracy: %.4f" % model.evaluate(X_test, y_test, verbose=0)[1])