## Blueprint

1. Check data shape and type.
2. Convert character values to numbers.
3. Split the training dataset into *ourtrain* and *ourtest*.
4. Dimensionality reduction.
5. Try various models (logistic regression, decision tree, random forest, SVM, KNN, Naive Bayes).
6. Apply to the test dataset.


## 1. Data Investigation

In [2]:
# Import packages
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Read the training and test CSV files (paths are relative to the working directory)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Print the (rows, columns) shape of the training frame, then leave
# train.head() as the last expression so the notebook renders the first rows.
print(train.shape)
train.head()

(900000, 33)


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,...,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,...,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0,1
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,...,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1,1


* id(int64): 0, 1, 2, ..., 899999 (ordered)
* f_00(float64): -4.59 ~ 4.74
* f_01(float64): -4.68 ~ 4.81
* f_02(float64): -4.64 ~ 4.96 
* f_03(float64): -4.65 ~ 4.45
* f_04(float64): -4.74 ~ 4.94
* f_05(float64): -4.75 ~ 4.97
* f_06(float64): -4.84 ~ 4.82
* f_07(int64): 0, 1, 2, ..., 15 (not ordered)
* f_08(int64): 0, 1, 2, ..., 16 (not ordered)
* f_09(int64): 0, 1, 2, ..., 14 (not ordered)
* f_10(int64): 0, 1, 2, ..., 14 (not ordered)
* f_11(int64): 0, 1, 2, ..., 13 (not ordered)
* f_12(int64): 0, 1, 2, ..., 16 (not ordered)
* f_13(int64): 0, 1, 2, ..., 12 (not ordered)
* f_14(int64): 0, 1, 2, ..., 14 (not ordered)
* f_15(int64): 0, 1, 2, ..., 14 (not ordered)
* f_16(int64): 0, 1, 2, ..., 15 (not ordered)
* f_17(int64): 0, 1, 2, ..., 14 (not ordered)
* f_18(int64): 0, 1, 2, ..., 13 (not ordered)
* f_19(float64): -11.28 ~ 12.07
* f_20(float64): -11.25 ~ 11.47
* f_21(float64): -13.31 ~ 14.45
* f_22(float64): -11.85 ~ 11.34
* f_23(float64): -12.30 ~ 12.24
* f_24(float64): -11.41 ~ 12.38
* f_25(float64): -11.91 ~ 12.52
* f_26(float64): -14.30 ~ 12.91
* f_27(object): 'AAAAAAABAB', ..., 'BMBDEADPAC' (-> 10 characters)
* f_28(float64): -1229.75 ~ 1229.56
* f_29(int64): 0, 1
* f_30(int64): 0, 1, 2
* target(int64): 0, 1

In [49]:
# Print the (rows, columns) shape of the test frame, then leave
# test.head() as the last expression so the notebook renders the first rows.
print(test.shape)
test.head()

(700000, 32)


Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30
0,900000,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,...,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,BAAABADLAC,99.478419,0,0
1,900001,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,...,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,AFABBAEGCB,-65.993825,1,0
2,900002,0.30399,2.44511,0.246515,0.818248,0.359731,-1.331845,1.358622,3,3,...,-7.026098,1.312277,-5.157192,1.714005,0.585032,0.066898,BBACABBKEE,-87.405622,0,1
3,900003,0.154053,0.260126,-1.367092,-0.093175,-1.111034,-0.948481,1.11922,0,0,...,-0.594532,-3.939475,1.75457,-2.364007,-1.00332,3.893099,AEBEAACQCC,-281.29346,0,0
4,900004,-1.651904,-0.424266,-0.667356,-0.322124,-0.089462,0.181705,1.784983,2,2,...,0.084906,-0.985736,-0.130467,-3.557893,1.210687,1.861884,AEBBBBDABF,25.629415,0,2


#### - Correlation

In [5]:
# Pairwise correlation between the numeric columns, rounded to 2 decimals.
# The string column f_27 is excluded explicitly: pandas 2.0+ no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead.
cor = train.select_dtypes(include="number").corr().round(2)
# cor[((cor > 0.4) | (cor < -0.4)) & (cor != 1)].count()  # no correlation was found

## 2. Numerize *f_27* column

In [28]:
# Return the 'n'th character (1-based) of every string in a column
def nth_char(column, n):
    """Collect the n-th character of each string in ``column``.

    Parameters
    ----------
    column : iterable of str
        The strings to read from (e.g. a pandas Series of codes).
    n : int
        1-based position of the character to extract.

    Returns
    -------
    list
        One entry per input string, in input order. Strings with fewer than
        ``n`` characters contribute ``None`` so the result stays aligned with
        the input rows.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1 (1-based position), got {n}")

    index = n - 1
    # Index each string directly. The previous running counter was only reset
    # on a match, so a short string shifted the position used for the strings
    # that followed it.
    return [text[index] if len(text) > index else None for text in column]

In [49]:
# Break f_27 apart: one array per character position (1 through 10)
(
    f_27_01, f_27_02, f_27_03, f_27_04, f_27_05,
    f_27_06, f_27_07, f_27_08, f_27_09, f_27_10,
) = (np.array(nth_char(train.f_27, position)) for position in range(1, 11))

In [53]:
# Gather the per-position arrays into a single frame, one column per position
dff_27 = pd.DataFrame(
    dict(
        f_27_01=f_27_01,
        f_27_02=f_27_02,
        f_27_03=f_27_03,
        f_27_04=f_27_04,
        f_27_05=f_27_05,
        f_27_06=f_27_06,
        f_27_07=f_27_07,
        f_27_08=f_27_08,
        f_27_09=f_27_09,
        f_27_10=f_27_10,
    )
)

In [55]:
# Print the (rows, columns) shape of the split frame, then leave
# dff_27.head() as the last expression so the notebook renders the first rows.
print(dff_27.shape)
dff_27.head()

(900000, 10)


Unnamed: 0,f_27_01,f_27_02,f_27_03,f_27_04,f_27_05,f_27_06,f_27_07,f_27_08,f_27_09,f_27_10
0,A,B,A,B,D,A,D,B,A,B
1,A,C,A,C,C,A,D,C,E,B
2,A,A,A,E,A,B,C,K,A,D
3,B,D,B,B,A,A,C,B,C,B
4,B,D,B,C,B,B,C,H,F,E


In [67]:
# Inspect the data: list the distinct letters seen at each character position
for position in range(1, 11):
    column_name = f"f_27_{position:02d}"
    print(f"{column_name}: ", dff_27[column_name].unique())

f_27_01:  ['A' 'B']
f_27_02:  ['B' 'C' 'A' 'D' 'E' 'G' 'F' 'H' 'I' 'J' 'K' 'L' 'M' 'N']
f_27_03:  ['A' 'B']
f_27_04:  ['B' 'C' 'E' 'D' 'H' 'I' 'G' 'A' 'F' 'J' 'K' 'L' 'M' 'N' 'O']
f_27_05:  ['D' 'C' 'A' 'B' 'F' 'E' 'G' 'I' 'H' 'J' 'K' 'L' 'M' 'O']
f_27_06:  ['A' 'B']
f_27_07:  ['D' 'C' 'A' 'E' 'B' 'G' 'H' 'F' 'I' 'J' 'M' 'K' 'L' 'N' 'O']
f_27_08:  ['B' 'C' 'K' 'H' 'E' 'P' 'D' 'A' 'S' 'T' 'J' 'Q' 'N' 'R' 'G' 'M' 'F' 'O'
 'I' 'L']
f_27_09:  ['A' 'E' 'C' 'F' 'D' 'H' 'B' 'G' 'I' 'J' 'L' 'K' 'M' 'N' 'O']
f_27_10:  ['B' 'D' 'E' 'A' 'C' 'K' 'F' 'G' 'H' 'J' 'I' 'L' 'M' 'N' 'O']


In [None]:
# Numerize the split dataframe


3. Split the training dataset into *ourtrain* and *ourtest*.
4. Dimensionality reduction. (PCA?)
5. Try various models (logistic regression, decision tree, random forest, SVM, KNN, Naive Bayes).
6. Apply to the test dataset.