In [None]:
# Colab-only step: open the browser file picker so kaggle.json can be uploaded
# into the runtime's working directory.
# NOTE(review): files.upload() returns the uploaded bytes, and as the last
# expression of the cell they are echoed into the output (API key included) —
# clear this cell's output before sharing the notebook.
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

# 모듈 호출 및 환경 세팅

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import f1_score

# statistical library for hypothesis testing
# from statsmodels.stats.weightstats import ztest

# Install the Kaggle CLI, then copy the uploaded kaggle.json into ~/.kaggle.
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/ 
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json 

# Download the files of the kakr-4th-competition Kaggle competition.
!kaggle competitions download -c kakr-4th-competition

Downloading test.csv to /content
  0% 0.00/683k [00:00<?, ?B/s]
100% 683k/683k [00:00<00:00, 93.9MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/43.4k [00:00<?, ?B/s]
100% 43.4k/43.4k [00:00<00:00, 38.9MB/s]
Downloading train.csv.zip to /content
  0% 0.00/399k [00:00<?, ?B/s]
100% 399k/399k [00:00<00:00, 128MB/s]


In [None]:
# !kaggle datasets list

In [None]:
# List the contents of the working directory.
!ls

kaggle.json  sample_data  sample_submission.csv  test.csv  train.csv.zip


In [None]:
# Extract train.csv from the downloaded archive.
# NOTE(review): on a re-run with train.csv already present, unzip may stop to ask
# about overwriting — consider `unzip -o`; confirm against the unzip manual.
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [None]:
# Read the three competition tables from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# 데이터 검증

In [None]:
# train['race'].value_counts()

In [None]:
# ax = sns.violinplot(y="capital_loss", x="income",data=train, palette="muted", split=True)

In [None]:
# ax = sns.violinplot(y="capital_loss",data=test, palette="muted")

# 데이터 전처리 수행

In [None]:
# 1) Drop columns that are not used as model features
def col_reduction(df):
    """Return a copy of ``df`` without the columns the model does not use.

    Removes ``id``, ``fnlwgt``, ``education``, ``relationship``,
    ``native_country`` and ``workclass``.

    The caller's frame is left untouched (no in-place drop), so the raw data
    stays available and the calling cell can be executed again without a
    ``KeyError`` for columns that were already removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains all six columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame without those columns.

    Raises
    ------
    KeyError
        If any of the six columns is missing from ``df``.
    """
    unused_columns = ['id', 'fnlwgt', 'education', 'relationship', 'native_country', 'workclass']

    return df.drop(columns=unused_columns)

# 2) Binarise marital_status
def mar_st(df):
    """Replace ``marital_status`` with a 0/1 flag, modifying ``df`` in place.

    The flag is 1 where the value is exactly ``'Married-civ-spouse'`` and 0
    for every other value. The same frame object is returned.
    """
    is_civ_spouse = df['marital_status'].eq('Married-civ-spouse')
    df['marital_status'] = is_civ_spouse.astype(int)
    return df

# 3) Transform capital_gain / capital_loss
def capital(df):
    """Add indicator columns for capital gain/loss and log-transform both, in place.

    * ``cap_gain_high`` is 1 where ``capital_gain`` is non-zero.
    * ``cap_loss_high`` is 1 where ``capital_loss`` is at least 1700.
    * ``capital_gain`` and ``capital_loss`` are replaced by their natural log;
      a value of 0 is kept as 0 instead of being passed to the logarithm.

    The same frame object is returned.
    """
    def _log_keep_zero(amount):
        # log(0) is undefined, so zero amounts stay zero.
        if amount != 0:
            return np.log(amount)
        return 0

    # The indicators are computed before the raw amounts are overwritten.
    df['cap_gain_high'] = df['capital_gain'].ne(0).astype(int)
    df['cap_loss_high'] = df['capital_loss'].ge(1700).astype(int)

    for amount_col in ('capital_gain', 'capital_loss'):
        df[amount_col] = df[amount_col].map(_log_keep_zero)

    return df

# 4) Bucket age into labelled ranges
def age(df):
    """Replace the numeric ``age`` column with a string age-range label, in place.

    Buckets are left-closed / right-open:

    * below 20      -> ``'~20'``
    * [20, 25) ...  -> ``'20~25'``, ``'25~30'``, ..., ``'60~65'``
    * 65 and above  -> ``'~65'``

    ``pd.cut`` does the binning in one vectorised step, so no temporary
    ``age_range`` column has to be created through masked ``.loc`` assignments.
    The result is converted back to plain objects (not a categorical) so that a
    later ``pd.get_dummies`` only creates columns for labels that occur and
    orders them by label text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``age`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``age`` holding the range labels. Missing
        ages stay missing.
    """
    lower_bounds = list(range(20, 65, 5))  # 20, 25, ..., 60

    bin_edges = [-np.inf] + lower_bounds + [65, np.inf]
    bin_labels = (
        ['~20']
        + ['{}~{}'.format(low, low + 5) for low in lower_bounds]
        + ['~65']
    )

    # right=False makes every bin [low, high): 20 falls in '20~25', 65 in '~65'.
    age_bucket = pd.cut(df['age'], bins=bin_edges, labels=bin_labels, right=False)
    df['age'] = age_bucket.astype(object)

    return df
    
# 5) One-hot encoding은 만들지 않았다.

# 6) Derive a new feature from education_num
def edu(df):
    """Add ``edu_num_high`` (1 where ``education_num`` >= 13, else 0) in place.

    The same frame object is returned; ``education_num`` itself is unchanged.
    """
    meets_threshold = df['education_num'].ge(13)
    df['edu_num_high'] = meets_threshold.astype(int)
    return df

# 7) Derive a new feature from hours_per_week
def hpw(df):
    """Add ``hpw_high`` (1 where ``hours_per_week`` >= 50, else 0) in place.

    The same frame object is returned; ``hours_per_week`` itself is unchanged.
    """
    works_long_hours = df['hours_per_week'].ge(50)
    df['hpw_high'] = works_long_hours.astype(int)
    return df

# 8) Min-max scale a single column
def mm_feature(df, feature):
    """Min-max scale column ``feature`` of ``df`` in place.

    A fresh ``MinMaxScaler`` is fitted on that column only.

    Returns
    -------
    tuple
        ``(df, scaler)`` — the same frame object with the scaled column, and
        the fitted scaler so the identical transform can be applied elsewhere.
    """
    # The scaler expects a 2-D array: one row per sample, one column.
    column_2d = df[feature].values.reshape(-1, 1)

    scaler = MinMaxScaler()
    scaler.fit(column_2d)
    df[feature] = scaler.transform(column_2d)

    return df, scaler

# 9) Split off the target: only done for train, since test has no target column.
def target_handle(df):
    """Split a one-hot encoded training frame into features and target.

    ``df`` is modified in place: ``income_>50K`` is copied into a new
    ``income`` column and both ``income_>50K`` and ``income_<=50K`` are
    removed.

    Returns
    -------
    tuple
        ``(X_df, y_df)`` — a new frame with every column except ``income``,
        and the ``income`` column itself.
    """
    df['income'] = df['income_>50K']
    df.drop(columns=['income_>50K', 'income_<=50K'], inplace=True)

    features = df.drop(columns=['income'])
    labels = df['income']

    return features, labels

def main(df):
    """Run preprocessing steps 1) to 7) on ``df`` and return the result.

    Order: column reduction, marital-status flag, capital transforms, age
    bucketing, one-hot encoding of the remaining non-numeric columns, then the
    ``edu_num_high`` and ``hpw_high`` indicators.

    NOTE(review): ``pd.get_dummies`` is applied to whatever frame is passed in,
    so train and test are encoded independently; their dummy columns only match
    when both contain the same category values — verify at the call site.
    """
    prepared = df
    for step in (col_reduction, mar_st, capital, age):
        prepared = step(prepared)

    encoded = pd.get_dummies(prepared)

    return hpw(edu(encoded))

In [None]:
# Preview the raw training rows before preprocessing.
train.head(3)

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K


In [None]:
# Apply the preprocessing pipeline
## main: steps 1) ~ 7). The pipeline runs on copies so the raw `train` / `test`
## frames are never modified and this cell can be re-run without reloading the CSVs.
train_prep = main(train.copy())
X_test = main(test.copy())

## 8) min-max scaling: fit on train only, then reuse the fitted scalers on test
train_prep, mm_scaler1 = mm_feature(train_prep, 'education_num')
train_prep, mm_scaler2 = mm_feature(train_prep, 'hours_per_week')

X_test['education_num'] = mm_scaler1.transform(X_test['education_num'].values.reshape(-1,1))
X_test['hours_per_week'] = mm_scaler2.transform(X_test['hours_per_week'].values.reshape(-1,1))

## 9) X, y split
X_train, y_train = target_handle(train_prep)

## One-hot encoding ran separately on train and test. Align test to the training
## columns (same set, same order); a dummy column missing from test is filled with 0.
## This is a no-op when both frames already have identical columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Min-max scale the log-transformed capital columns.
# Each column gets its own scaler, fitted on the training data only; the test
# column is then transformed with that same fitted scaler. Calling fit_transform
# on the test set would re-fit on test data and put train and test on different
# scales.
capital_scalers = {}

for capital_col in ('capital_gain', 'capital_loss'):
    scaler = MinMaxScaler()

    X_train[capital_col] = scaler.fit_transform(X_train[capital_col].values.reshape(-1,1))
    X_test[capital_col] = scaler.transform(X_test[capital_col].values.reshape(-1,1))

    # Keep the fitted scaler in case the same transform is needed again later.
    capital_scalers[capital_col] = scaler

# 데이터 검증

In [None]:
X_train.head(3)

Unnamed: 0,education_num,marital_status,capital_gain,capital_loss,hours_per_week,cap_gain_high,cap_loss_high,age_20~25,age_25~30,age_30~35,age_35~40,age_40~45,age_45~50,age_50~55,age_55~60,age_60~65,age_~20,age_~65,occupation_?,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,edu_num_high,hpw_high
0,0.533333,1,0.0,0.0,0.602041,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,1
1,0.266667,0,0.0,0.0,0.193878,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,0.6,0,0.0,0.0,0.153061,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [None]:
X_train.describe()

Unnamed: 0,education_num,marital_status,capital_gain,capital_loss,hours_per_week,cap_gain_high,cap_loss_high,age_20~25,age_25~30,age_30~35,age_35~40,age_40~45,age_45~50,age_50~55,age_55~60,age_60~65,age_~20,age_~65,occupation_?,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,edu_num_high,hpw_high
count,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0,26049.0
mean,0.605891,0.459519,0.064561,0.042147,0.402481,0.084187,0.034358,0.120542,0.128565,0.132711,0.131329,0.118738,0.101079,0.077815,0.056317,0.040808,0.050712,0.041384,0.057891,0.114515,0.000269,0.125226,0.124611,0.030097,0.041921,0.060732,0.10081,0.004722,0.126838,0.020039,0.115091,0.028485,0.048754,0.009751,0.030711,0.095052,0.007831,0.856655,0.32888,0.67112,0.249683,0.200084
std,0.171174,0.498368,0.214434,0.189775,0.126141,0.277674,0.182151,0.325601,0.334725,0.339269,0.337767,0.323486,0.301439,0.267885,0.230537,0.197848,0.219413,0.199179,0.233542,0.318442,0.016391,0.330981,0.330284,0.170858,0.200412,0.238842,0.301083,0.068555,0.332798,0.140137,0.319138,0.166356,0.215358,0.098266,0.172538,0.293292,0.08815,0.350431,0.469815,0.469815,0.432838,0.400071
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.533333,0.0,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.6,0.0,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
75%,0.733333,1.0,0.0,0.0,0.44898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
X_test.head(3)

Unnamed: 0,education_num,marital_status,capital_gain,capital_loss,hours_per_week,cap_gain_high,cap_loss_high,age_20~25,age_25~30,age_30~35,age_35~40,age_40~45,age_45~50,age_50~55,age_55~60,age_60~65,age_~20,age_~65,occupation_?,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,edu_num_high,hpw_high
0,0.6,0,0.0,0.0,0.397959,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
1,0.533333,1,0.0,0.0,0.5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
2,0.6,0,0.0,0.0,0.244898,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [None]:
X_test.describe()

Unnamed: 0,education_num,marital_status,capital_gain,capital_loss,hours_per_week,cap_gain_high,cap_loss_high,age_20~25,age_25~30,age_30~35,age_35~40,age_40~45,age_45~50,age_50~55,age_55~60,age_60~65,age_~20,age_~65,occupation_?,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,edu_num_high,hpw_high
count,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0
mean,0.603327,0.461609,0.060788,0.040424,0.402192,0.079699,0.034705,0.118704,0.121622,0.135289,0.131143,0.12024,0.102273,0.080928,0.060964,0.037623,0.051597,0.039619,0.051443,0.120854,0.000307,0.128532,0.125921,0.032248,0.04269,0.064496,0.102733,0.003993,0.128378,0.019502,0.100123,0.028563,0.050215,0.008753,0.036701,0.099509,0.010289,0.844748,0.338452,0.661548,0.240018,0.191953
std,0.172869,0.498562,0.208234,0.186345,0.125413,0.270847,0.183046,0.323465,0.326873,0.342058,0.337582,0.325266,0.30303,0.272745,0.239283,0.190297,0.221229,0.195078,0.220918,0.325982,0.017524,0.334707,0.331786,0.176672,0.202174,0.245654,0.303634,0.063066,0.334536,0.138293,0.300187,0.166587,0.218405,0.093155,0.188042,0.299367,0.100918,0.362172,0.473219,0.473219,0.427127,0.393867
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.533333,0.0,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.6,0.0,0.0,0.0,0.397959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
75%,0.733333,1.0,0.0,0.0,0.44898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# 모델링 수행

In [None]:
# Hold out 20% of the training data for validation.
# A fixed random_state keeps the split — and every score computed from it —
# identical across kernel restarts.
num_test = 0.20
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X_train, y_train, test_size=num_test, random_state=23
)

In [None]:
# Show the row counts of the four pieces produced by the hold-out split.
for split_part in (X1_train, X1_test, y1_train, y1_test):
    print(len(split_part))

20839
5210
20839
5210


In [None]:
# Baseline 1: logistic regression, scored on the hold-out split.
# NOTE(review): micro-averaged F1 on a single-label task should coincide with
# plain accuracy — confirm this is the metric intended for the competition.
logreg = LogisticRegression().fit(X1_train, y1_train)  # fit() returns the estimator
y_pred = logreg.predict(X1_test)
print(f1_score(y1_test, y_pred, average='micro'))

0.8493282149712092


In [None]:
# Baseline 2: support vector classifier with default settings, scored on the hold-out split.
svc = SVC().fit(X1_train, y1_train)  # fit() returns the estimator
y_pred = svc.predict(X1_test)
print(f1_score(y1_test, y_pred, average='micro'))

0.8470249520153551


In [None]:
# Baseline 3: gradient boosting, scored on the hold-out split.
# random_state is fixed so the printed score can be reproduced on a re-run.
gradientboostingclassifier = GradientBoostingClassifier(random_state=23)
gradientboostingclassifier.fit(X1_train, y1_train)
y_pred = gradientboostingclassifier.predict(X1_test)
print(f1_score(y1_test, y_pred, average='micro'))

0.861996161228407


In [None]:
# Baseline 4: random forest, scored on the hold-out split.
# random_state is fixed so the printed score can be reproduced on a re-run.
randomforestclassifier = RandomForestClassifier(random_state=23)
randomforestclassifier.fit(X1_train, y1_train)
y_pred = randomforestclassifier.predict(X1_test)
print(f1_score(y1_test, y_pred, average='micro'))

0.8516314779270632


In [None]:
# Baseline 5: Gaussian naive Bayes, scored on the hold-out split.
gaussianNB = GaussianNB().fit(X1_train, y1_train)  # fit() returns the estimator
y_pred = gaussianNB.predict(X1_test)
print(f1_score(y1_test, y_pred, average='micro'))

0.5357005758157389


In [None]:
len(X_train.columns)

42

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Small fully connected binary classifier: two hidden ReLU layers of 32 units
# and a single sigmoid output unit.
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(len(X_train.columns),)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X1_train, y1_train,epochs=10, batch_size=4, verbose=1)

# `predict_classes` is deprecated (see the warning this cell used to print) and
# is no longer available in newer Keras releases. For a sigmoid output the
# documented replacement is to threshold the predicted probability at 0.5.
# The result keeps the (n_samples, 1) shape that predict_classes returned.
y_pred = (model.predict(X1_test) > 0.5).astype("int32")

print(f1_score(y1_test, y_pred, average='micro'))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
0.8523992322456814


In [None]:
# seed = 7
# np.random.seed(seed)

# kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# # k = 5인 KFold와 Fold별 정확도를 담을 list 생성
# kfold = KFold(n_splits=5) # default = 3

# n_iter = 0

# # KFold.split( ) 호출: Fold 별 학습, 검증 Data의 row index를 array로 반환  
# for train_index, test_index in kfold.split(X_train):
    
#     # kfold.split( )으로 반환된 인덱스를 이용하여 학습용, 검증용 테스트 데이터 추출
#     X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
#     y_train_, y_val = y_train[train_index], y_train[test_index]
    
#     # 학습 및 예측 
#     svm_clf3.fit(X_train_ , y_train_)    
#     pred = svm_clf3.predict(X_val)
#     n_iter += 1
    
#     # 반복 시 마다 정확도 측정 
#     accuracy = np.round(f1_score(y_val, pred, average='micro'), 4)
#     train_size = X_train_.shape[0]
#     test_size = X_val.shape[0]
#     print('\n#{0} 교차 검증 정확도 :{1}, 학습 데이터 크기: {2}, 검증 데이터 크기: {3}'
#           .format(n_iter, accuracy, train_size, test_size))
#     cv_accuracy.append(accuracy)
    
# # 개별 iteration별 정확도를 합하여 평균 정확도 계산 
# print('\n## 평균 검증 정확도:', np.mean(cv_accuracy))

# 훈련 및 테스트 수행

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Final model: same architecture as in the validation run, now trained on the
# full training set and applied to the competition test set.
model = Sequential()
model.add(Dense(32, activation='relu', input_shape=(len(X_train.columns),)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X_train, y_train,epochs=10, batch_size=4, verbose=1)

# `predict_classes` is deprecated and no longer available in newer Keras
# releases; threshold the sigmoid probability at 0.5 instead. The result keeps
# the (n_samples, 1) integer shape that predict_classes returned.
y_pred = (model.predict(X_test) > 0.5).astype("int32")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
sample_submission.head(3)

Unnamed: 0,id,prediction
0,0,0
1,1,0
2,2,0


In [None]:
# Build the submission file.
# The ids come from the official sample submission instead of being fabricated
# from the positional index of the prediction array, which is only correct when
# the ids happen to be 0..n-1.
# NOTE(review): this assumes sample_submission.csv lists rows in the same order
# as test.csv — confirm against the competition data description.
predictions = np.asarray(y_pred).ravel().astype(int)  # flatten (n, 1) -> (n,)

# A length mismatch between ids and predictions raises here instead of silently
# writing a misaligned file.
submission = pd.DataFrame({
    'id': sample_submission['id'].values,
    'prediction': predictions,
})
submission.to_csv('submission.csv',index=False)