In [74]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from glob import glob

In [276]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [76]:
import cufflinks as cf
# Put cufflinks into offline mode before any DataFrame.iplot call in this notebook.
# NOTE(review): connected=True is passed through unchanged; confirm its effect in the cufflinks docs.
cf.go_offline(connected=True)

### Logistic Regression

In [289]:
def logisticReg(XTrain, yTrain, XValid, yValid):
    """Fit a logistic-regression classifier and predict labels for ``XValid``.

    Args:
        XTrain: Feature matrix used for fitting.
        yTrain: Labels used for fitting.
        XValid: Feature matrix to predict on.
        yValid: Not used by this function.

    Returns:
        The classifier's ``predict`` output for ``XValid``.
    """
    model = LogisticRegression(random_state=0)
    model.fit(XTrain, yTrain)
    return model.predict(XValid)

### Decision tree

In [250]:
def decisionT(XTrain, yTrain, XValid, yValid, mD, cT):
    """Fit a decision tree and predict labels for ``XValid``.

    Args:
        XTrain: Feature matrix used for fitting.
        yTrain: Labels used for fitting.
        XValid: Feature matrix to predict on.
        yValid: Not used by this function.
        mD: Passed to the classifier as ``max_depth``.
        cT: Passed to the classifier as ``criterion``.

    Returns:
        The classifier's ``predict`` output for ``XValid``.
    """
    classifier = tree.DecisionTreeClassifier(criterion=cT, max_depth=mD)
    return classifier.fit(XTrain, yTrain).predict(XValid)

### Random Forest

In [278]:
def randomF(XTrain,yTrain, XValid, yValid, mD):
    """Fit a random forest and predict labels for ``XValid``.

    Args:
        XTrain: Feature matrix used for fitting.
        yTrain: Labels used for fitting.
        XValid: Feature matrix to predict on.
        yValid: Not used by this function.
        mD: Passed to the classifier as ``n_estimators`` (the number of
            trees), not as a depth limit, despite the parameter's name.

    Returns:
        The classifier's ``predict`` output for ``XValid``.
    """
    forest = RandomForestClassifier(n_estimators=mD)
    return forest.fit(XTrain, yTrain).predict(XValid)

### DNN

In [198]:
import tensorflow as tf
from sklearn import preprocessing

In [273]:
def DNN(XTrain,yTrain, XValid, yValid, lr, bs):
    """Train a fully connected binary classifier and score ``XValid``.

    The network is a stack of ReLU dense layers (256-200-128-64-32-16) with
    two 20% dropout layers and a single sigmoid output unit. It is trained
    for 20 epochs with Adagrad and binary cross-entropy. Inputs are used as
    given; no feature scaling is applied here.

    Args:
        XTrain: Feature matrix used for fitting.
        yTrain: Binary labels used for fitting.
        XValid: Feature matrix to score; its second dimension sets the
            network's input width.
        yValid: Not used by this function.
        lr: Adagrad learning rate.
        bs: Mini-batch size passed to ``fit``.

    Returns:
        Flattened array of sigmoid outputs for ``XValid``; callers threshold
        it to obtain class labels.
    """
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(XValid.shape[1], )),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(200, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and no longer accepted by newer Keras optimizers.
    optimizer = tf.keras.optimizers.Adagrad(learning_rate=lr)

    model.compile(optimizer=optimizer, loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.fit(XTrain, yTrain, epochs=20, batch_size=bs, verbose=0)

    return model.predict(XValid).flatten()

In [186]:
# Load the training games and drop draws (rows whose WIN value is 0.5).
train = (
    pd.read_csv('kbo_train.csv')
    .loc[lambda games: games['WIN'] != 0.5]
    .reset_index(drop=True)
)

# Same draw filter for the held-out test games.
test = (
    pd.read_csv("kbo_test.csv")
    .loc[lambda games: games['WIN'] != 0.5]
    .reset_index(drop=True)
)

In [111]:
# train = train.sample(len(train)).copy()
# test = test.sample(len(test)).copy()

#### Tuning hyperparameters

In [122]:
# 10-fold cross-validation splitter reused by every tuning loop in this notebook.
# No shuffle or random_state argument is passed, so the splitter's defaults apply.
kf = KFold(n_splits=10)

In [291]:
# Accumulator for per-fold validation accuracies.
accuracyList = list()

In [292]:
 # Reset the accumulator here so that re-running this cell does not append
 # to fold scores left over from a previous run.
 accuracyList = []
 for trainIdx, validIdx in kf.split(train):
     # Shuffle row order inside each fold; fold membership is decided by kf.
     train_ = train.iloc[trainIdx].sample(len(trainIdx))
     valid_ = train.iloc[validIdx].sample(len(validIdx))

     # Features: every column except the last two. Label: the last column.
     XTrain = train_.iloc[:, :-2]
     yTrain = train_.iloc[:, -1]

     XTest = valid_.iloc[:, :-2]
     yTest = valid_.iloc[:, -1]

     # Fraction of validation rows predicted correctly in this fold.
     lg = logisticReg(XTrain, yTrain, XTest, yTest)
     accuracyList.append(np.mean(yTest == lg))

In [293]:
np.mean(accuracyList)

0.5532660483724314

In [261]:
# One list per recorded column of the decision-tree grid search.
dtResult = {column: [] for column in ("maxDepth", "accuracy", "criterion")}

In [262]:
for mD in tqdm(range(5, 101, 5)):
    for cT in ['gini', 'entropy']:
        # Cross-validate one (max depth, criterion) combination.
        accuracyList = []
        for trainIdx, validIdx in kf.split(train):
            # Shuffle row order inside each fold.
            train_ = train.iloc[trainIdx].sample(n=len(trainIdx))
            valid_ = train.iloc[validIdx].sample(n=len(validIdx))

            # Features: every column except the last two. Label: the last column.
            XTrain, yTrain = train_.iloc[:, :-2], train_.iloc[:, -1]
            XTest, yTest = valid_.iloc[:, :-2], valid_.iloc[:, -1]

            dt = decisionT(XTrain, yTrain, XTest, yTest, mD, cT)
            accuracyList.append(np.mean(yTest == dt))

        # Record the grid point together with its mean fold accuracy.
        dtResult['maxDepth'].append(mD)
        dtResult['criterion'].append(cT)
        dtResult['accuracy'].append(np.mean(accuracyList))

100%|██████████| 20/20 [00:27<00:00,  1.37s/it]


In [267]:
# Turn the collected grid-search records into a frame and plot accuracy
# against depth, one line per split criterion.
dtResult = pd.DataFrame(dtResult)

dtResult.iplot(
    mode='lines',
    x='maxDepth',
    y='accuracy',
    categories='criterion',
    xTitle='depth',
    yTitle='accuracy',
)


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



In [280]:
# One list per recorded column of the random-forest sweep.
rfResult = {column: [] for column in ("maxDepth", "accuracy")}

In [282]:
for mD in tqdm(range(5, 101, 5)):
    # Cross-validate one value of mD, which is forwarded to randomF.
    accuracyList = []
    for trainIdx, validIdx in kf.split(train):
        # Shuffle row order inside each fold.
        train_ = train.iloc[trainIdx].sample(n=len(trainIdx))
        valid_ = train.iloc[validIdx].sample(n=len(validIdx))

        # Features: every column except the last two. Label: the last column.
        XTrain, yTrain = train_.iloc[:, :-2], train_.iloc[:, -1]
        XTest, yTest = valid_.iloc[:, :-2], valid_.iloc[:, -1]

        rf = randomF(XTrain, yTrain, XTest, yTest, mD)
        accuracyList.append(np.mean(yTest == rf))

    # Record the swept value together with its mean fold accuracy.
    rfResult['maxDepth'].append(mD)
    rfResult['accuracy'].append(np.mean(accuracyList))

100%|██████████| 20/20 [01:09<00:00,  3.48s/it]


In [283]:
rfResult = pd.DataFrame(rfResult)

# The 'maxDepth' column holds the values that randomF passed to
# RandomForestClassifier(n_estimators=...), so the axis is labelled with
# what was actually swept rather than 'depth'.
rfResult.iplot(mode='lines', x='maxDepth', y='accuracy',
               xTitle='n_estimators', yTitle='accuracy')

In [284]:
# One list per recorded column of the DNN grid search.
dnResult = {column: [] for column in ("learningRate", "batchSize", "accuracy")}

In [285]:
for lr in tqdm([0.05, 0.01, 0.005, 0.001]):
    for bS in [10, 50, 100, 200]:
        # Cross-validate one (learning rate, batch size) combination.
        accuracyList = []
        for trainIdx, validIdx in kf.split(train):
            # Shuffle row order inside each fold.
            train_ = train.iloc[trainIdx].sample(n=len(trainIdx))
            valid_ = train.iloc[validIdx].sample(n=len(validIdx))

            # Features: every column except the last two. Label: the last column.
            XTrain, yTrain = train_.iloc[:, :-2], train_.iloc[:, -1]
            XTest, yTest = valid_.iloc[:, :-2], valid_.iloc[:, -1]

            # DNN returns sigmoid scores; threshold at 0.5 to get 0/1 labels.
            dn = DNN(XTrain, yTrain, XTest, yTest, lr, bS)
            accuracyList.append(np.mean(yTest == (dn > 0.5) * 1))

        # Record the grid point together with its mean fold accuracy.
        dnResult['learningRate'].append(lr)
        dnResult['batchSize'].append(bS)
        dnResult['accuracy'].append(np.mean(accuracyList))

100%|██████████| 4/4 [06:29<00:00, 97.38s/it] 


In [286]:
# Turn the collected grid-search records into a frame and plot accuracy
# against batch size, one line per learning rate.
dnResult = pd.DataFrame(dnResult)

dnResult.iplot(
    mode='lines',
    x='batchSize',
    y='accuracy',
    categories='learningRate',
    xTitle='batchSize',
    yTitle='accuracy',
)


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



### predict

In [296]:
# Full training set: features are every column except the last two,
# the label is the last column.
XTrain, yTrain = train.iloc[:, :-2], train.iloc[:, -1]

In [295]:
# Held-out test set, split into features and label the same way as the training set.
XTest, yTest = test.iloc[:, :-2], test.iloc[:, -1]

In [297]:
# Refit logistic regression on the full training set.
lR = LogisticRegression(random_state=0)
lR.fit(XTrain, yTrain)

# Accuracy on the held-out test set.
logisticResult = np.mean(yTest == lR.predict(XTest))

In [302]:
# Refit the decision tree on the full training set with the chosen settings.
dtR = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5)
dtR.fit(XTrain, yTrain)

# Accuracy on the held-out test set.
decisionResult = np.mean(yTest == dtR.predict(XTest))

In [314]:
# Refit the random forest on the full training set with the chosen tree count.
rfR = RandomForestClassifier(n_estimators=5)
rfR.fit(XTrain, yTrain)

# Keep the test accuracy in a variable, like the logistic-regression and
# decision-tree cells do, and end the cell with it so the value is displayed.
randomForestResult = np.mean(rfR.predict(XTest) == yTest)
randomForestResult

RandomForestClassifier(n_estimators=5)

In [304]:
# Final DNN: same architecture as the DNN() helper, trained on the full training set.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(XTrain.shape[1], )),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# `learning_rate` is the supported keyword; the `lr` alias is deprecated
# and no longer accepted by newer Keras optimizers.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.05)

model.compile(optimizer=optimizer, loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(XTrain, yTrain, epochs=20, batch_size=100, verbose=0)

<tensorflow.python.keras.callbacks.History at 0x7f493845d9d0>

In [313]:
# Test accuracy of the final DNN: flatten the sigmoid scores, threshold at 0.5
# to get 0/1 labels, and compare with the true labels.
# Note: this rebinds dnResult, which earlier held the grid-search results.
dnResult = np.mean(yTest == (model.predict(XTest).flatten() > 0.5) * 1)

0.5178997613365155

### trash


In [None]:
# Scratch exploration of a subset of columns.
# NOTE(review): `df` is not defined before this cell in the visible notebook; confirm where it comes from.
temp = df[['RUN','R_ENE','RUN_VS','R_ENE_VS',"WIN_RATIO","WIN_RATIO_ENE","WIN_RATIO_VS","WIN_RATIO_ENE_VS","G_ID","WIN"]]

# Rows where WIN_RATIO exceeds WIN_RATIO_VS by more than 0.4 (positions 10-19 of the matches).
temp[temp['WIN_RATIO'] - temp['WIN_RATIO_VS']>0.4].iloc[10:20,:]

# Pairwise correlations; the stray leading space on this line was an IndentationError.
temp.corr()

In [None]:
# Scratch experiment: for each data file, repeat each model ten times on the
# standardized features and on a 10-component PCA projection of them.
# NOTE(review): `dataNames`, `p` and a callable `rf` are not defined in the visible
# notebook, and the decisionT/DNN calls below do not match the helper signatures
# defined earlier in this file. Confirm before running.
final=[]
for dn in tqdm(dataNames):
    # Read the file for this iteration. The original always read dataNames[0],
    # so every result was computed on the first file but labelled with `dn`.
    df = pd.read_csv(dn)
    df = df[df['WIN'] !=0]

    X = df.iloc[:,:-2]
    X = pd.DataFrame(StandardScaler().fit_transform(X))
    y = df.iloc[:,-1]
    y = y.replace(-1,0)

    result = {}
    result['Dt'] = []
    result['Rf'] = []
    result['Dn'] = []
    result['Dt_PCA'] = []
    result['Rf_PCA'] = []
    result['Dn_PCA'] = []
    for i in range(10):

        result['Dt'].append(decisionT(X,y,p))

        result['Rf'].append(rf(X,y,0.2))

        result['Dn'].append(DNN(X,y,0.2))

        # Re-standardize and project onto 10 principal components.
        X_ = pd.DataFrame(StandardScaler().fit_transform(X))

        pca = PCA(n_components=10)

        X_ = pd.DataFrame(pca.fit_transform(X_))

        result['Dt_PCA'].append(decisionT(X_,y,0.2))

        result['Rf_PCA'].append(rf(X_,y,0.2))

        result['Dn_PCA'].append(DNN(X_,y,0.2))

    final.append((result,dn))

In [None]:
# Per-fold validation accuracy of each model family.
# validResult is initialised here because no visible cell defines it.
validResult = {'dt': [], 'rf': [], 'dnn': []}

for trainIdx, validIdx in kf.split(train):
    # Shuffle row order inside each fold.
    train_ = train.iloc[trainIdx].sample(len(trainIdx))
    valid_ = train.iloc[validIdx].sample(len(validIdx))

    # Features: every column except the last two. Label: the last column.
    XTrain = train_.iloc[:,:-2]
    yTrain = train_.iloc[:,-1]

    XTest = valid_.iloc[:,:-2]
    yTest = valid_.iloc[:,-1]

    # The helpers require their hyperparameters as arguments; the values below
    # are the ones used for the final models in the "predict" section.
    # NOTE(review): confirm these are the intended settings for this comparison.
    dt = decisionT(XTrain,yTrain,XTest,yTest, 5, 'entropy')
    validResult['dt'].append(np.mean(yTest==dt))

    rf = randomF(XTrain,yTrain,XTest,yTest, 5)
    validResult['rf'].append(np.mean(yTest==rf))

    # DNN returns sigmoid scores; threshold at 0.5 to get 0/1 labels.
    dn = DNN(XTrain,yTrain,XTest,yTest, 0.05, 100)
    validResult['dnn'].append(np.mean(yTest ==(dn>0.5)*1))
    
    