# In [3]:
# Load Data

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

# Read the three competition splits plus the submission template.  The
# authorization-date column is passed to read_csv's date parser.
train = pd.read_csv('PJT002_train.csv', parse_dates=["dt_of_athrztn"])
val = pd.read_csv('PJT002_validation.csv', parse_dates=["dt_of_athrztn"])
test = pd.read_csv('PJT002_test.csv', parse_dates=["dt_of_athrztn"])
sub = pd.read_csv('PJT002_submission.csv')

train.head()

# Row/column counts of every frame, in load order.
for frame in (train, val, test, sub):
    print(frame.shape)

train.columns.values

val.columns.values

# Do train and validation expose the same set of columns?
# sorted() returns new lists; list.sort() sorts in place and returns None, so
# comparing its return values would always be True.
sorted(train.columns.values) == sorted(val.columns.values)

train.isna().sum()

train.info(verbose=True)

# Encode the target flag: 'N' -> 0, 'Y' -> 1 (any other value maps to NaN).
binary_y = {'N': 0, 'Y': 1}

train['fr_yn'] = train['fr_yn'].map(binary_y)
val['fr_yn'] = val['fr_yn'].map(binary_y)



# Preprocessing

## Part 1

### bldng_us: building use

bldng_us_train = train["bldng_us"]
bldng_us_test = test["bldng_us"]

bldng_us_train.head()

# Distinct usage labels per split.
bldng_us_train.unique()

bldng_us_test.unique()

# Number of rows with a missing usage label.
train.loc[bldng_us_train.isnull()].shape

test.loc[bldng_us_test.isnull()].shape

# Frequency of each usage label (missing values excluded).
bldng_us_train.value_counts()

bldng_us_test.value_counts()

# Mean of the target per usage label.
train.pivot_table(index="bldng_us", values="fr_yn")

# Structure types found among the rows labelled '단독주택'.
train.loc[bldng_us_train == '단독주택', "bldng_archtctr"].value_counts()









# Disabled experiment: usage labels common to all three splits.
#bldng_us_columns = set(train["bldng_us"].unique()).intersection(set(test["bldng_us"].unique())).intersection(set(val["bldng_us"].unique()))

# Disabled experiment: blank out usage labels with fewer than 150 training rows.
#bldng_us_columns = list(train["bldng_us"].unique())

#for b in bldng_us_columns:
#    if (train["bldng_us"].values == b).sum() < 150:
#        train.loc[train["bldng_us"].values == b, "bldng_us"] = None
#        val.loc[val["bldng_us"].values == b, "bldng_us"] = None
#        test.loc[test["bldng_us"].values == b, "bldng_us"] = None

train["bldng_us"].value_counts()

pd.pivot_table(train, index="bldng_us", values="fr_yn")

# Neighborhood living facilities are excluded: actual use differs too much
# between class 1 and class 2 (checked against the validation split).
excluded_bldng_us = ["제1종근린생활시설", "제2종근린생활시설"]
for frame in (train, val, test):
    frame.loc[frame["bldng_us"].isin(excluded_bldng_us), "bldng_us"] = None

# One indicator column per remaining usage label; each split is encoded from
# its own values, so the resulting column sets may differ between splits.
train = train.join(pd.get_dummies(train["bldng_us"]))
val = val.join(pd.get_dummies(val["bldng_us"]))
test = test.join(pd.get_dummies(test["bldng_us"]))

# The raw column should be dropped in principle, but doing so lowered the score.
#train = train.drop("bldng_us", 1)
#val = val.drop("bldng_us", 1)
#test = test.drop("bldng_us", 1)

train.columns.values

# Do train and validation still expose the same columns?  sorted() is required
# here: list.sort() returns None, which made this comparison always True.
sorted(train.columns.values) == sorted(val.columns.values)

# Report every train column that is missing from test.
test_columns = set(test.columns.values)
for a in train.columns.values:
    if a not in test_columns:
        print(a)









### bldng_archtctr: building structure

train["bldng_archtctr"].head()

train_structures = train["bldng_archtctr"].unique()
train_structures

test_structures = test["bldng_archtctr"].unique()
test_structures

# Structure labels seen in train but absent from test, then absent from val.
for label in train_structures:
    if label not in test_structures:
        print(label)

val_structures = val["bldng_archtctr"].unique()
for label in train_structures:
    if label not in val_structures:
        print(label)

# Number of rows with a missing structure label in each split.
for frame in (train, val, test):
    print(frame.loc[frame["bldng_archtctr"].isnull()].shape)

# Frequency of each structure label (missing values excluded).
train["bldng_archtctr"].value_counts()

# Mean of the target per structure label.
train.pivot_table(index="bldng_archtctr", values="fr_yn")

# Structure labels are collapsed into three groups; labels that appear in none
# of the lists (and missing values) stay unassigned.
A = ["목구조", "일반목구조", "벽돌구조", "블록구조", "석구조", "조적구조"]
B = ["강파이프구조", "경량철골구조", "기타강구조", "기타구조", "기타조적구조"]
C = ["일반철골구조", "철골철근콘크리트구조", "철골콘크리트구조", "철근콘크리트구조"]

# Flatten the three lists into one label -> group lookup.
archtctr_group = {
    label: group
    for group, labels in (("A", A), ("B", B), ("C", C))
    for label in labels
}

for frame in (train, val, test):
    frame["bldng_archtctr_encoded"] = frame["bldng_archtctr"].map(archtctr_group)

# Mean of the target per structure group.
pd.pivot_table(train, index="bldng_archtctr_encoded", values="fr_yn")

# Disabled experiment: blank out structure labels with fewer than 150 training rows.
#bldng_archtctr_columns = list(train["bldng_archtctr"].unique())

#for b in bldng_archtctr_columns:
#    if (train["bldng_archtctr"].values == b).sum() < 150:
#        train.loc[train["bldng_archtctr"].values == b, "bldng_archtctr"] = None
#        val.loc[val["bldng_archtctr"].values == b, "bldng_archtctr"] = None
#        test.loc[test["bldng_us"].values == b, "bldng_archtctr"] = None

# One indicator column per group, then remove the raw and the grouped column.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
archtctr_source_cols = ["bldng_archtctr", "bldng_archtctr_encoded"]

train = train.join(pd.get_dummies(train["bldng_archtctr_encoded"]))
train = train.drop(archtctr_source_cols, axis=1)

val = val.join(pd.get_dummies(val["bldng_archtctr_encoded"]))
val = val.drop(archtctr_source_cols, axis=1)

test = test.join(pd.get_dummies(test["bldng_archtctr_encoded"]))
test = test.drop(archtctr_source_cols, axis=1)

### bldng_cnt: number of buildings

bldng_cnt = train["bldng_cnt"]

bldng_cnt.head()

bldng_cnt.value_counts()

bldng_cnt.describe()

# Fewer than 10 buildings.
bldng_cnt_under10 = train.loc[bldng_cnt < 10]
bldng_cnt_under10["bldng_cnt"].describe()

bldng_cnt_under10["fr_yn"].describe()

plt.hist(bldng_cnt_under10["bldng_cnt"], rwidth=0.9)

sns.countplot(x="bldng_cnt", hue="fr_yn", data=bldng_cnt_under10)

bldng_cnt_under10.pivot_table(index="bldng_cnt", values="fr_yn")

# Fewer than 20 buildings.
bldng_cnt_under20 = train.loc[bldng_cnt < 20]
bldng_cnt_under20["bldng_cnt"].describe()

sns.countplot(x="bldng_cnt", hue="fr_yn", data=bldng_cnt_under20)

bldng_cnt_under20.pivot_table(index="bldng_cnt", values="fr_yn")

# Fewer than 50 buildings.
bldng_cnt_under50 = train.loc[bldng_cnt < 50]
bldng_cnt_under50["bldng_cnt"].describe()

bldng_cnt_under50.pivot_table(index="bldng_cnt", values="fr_yn")

# Strictly between 50 and 250 buildings.
bldng_cnt_over50_under250 = train.loc[(bldng_cnt > 50) & (bldng_cnt < 250)]
bldng_cnt_over50_under250["bldng_cnt"].describe()

bldng_cnt_over50_under250["fr_yn"].describe()

plt.scatter(
    bldng_cnt_over50_under250["bldng_cnt"],
    bldng_cnt_over50_under250["fr_yn"],
    s=50,
)

# Strictly between 50 and 75 buildings.
bldng_cnt_over50_under75 = train.loc[(bldng_cnt > 50) & (bldng_cnt < 75)]
bldng_cnt_over50_under75["fr_yn"].describe()

def _bin_bldng_cnt(frame):
    """Label each row of ``frame`` with a size bin derived from its own ``bldng_cnt``.

    Adds a ``bldng_cnt_encoded`` column in place: counts below 11 are "small",
    counts above 10 and below 75 are "middle", counts above 74 are "big".
    Rows with a missing count are left unlabeled.
    """
    cnt = frame["bldng_cnt"]
    frame.loc[cnt < 11, "bldng_cnt_encoded"] = "small"
    frame.loc[(cnt > 10) & (cnt < 75), "bldng_cnt_encoded"] = "middle"
    frame.loc[cnt > 74, "bldng_cnt_encoded"] = "big"


def _one_hot_bldng_cnt(frame):
    """Return ``frame`` with one indicator column per size bin.

    The raw count and the intermediate label column are removed.
    """
    dummies = pd.get_dummies(frame["bldng_cnt_encoded"])
    # `axis` by keyword: pandas 2.0 no longer accepts it positionally.
    return frame.join(dummies).drop(["bldng_cnt", "bldng_cnt_encoded"], axis=1)


_bin_bldng_cnt(train)

# Target distribution inside each size bin.
sns.countplot(data=train, x="bldng_cnt_encoded", hue="fr_yn")

train = _one_hot_bldng_cnt(train)

_bin_bldng_cnt(val)
val = _one_hot_bldng_cnt(val)

# Every mask is built from the frame being labelled; the "small" bin of test
# was previously selected with a condition computed on val.
_bin_bldng_cnt(test)
test = _one_hot_bldng_cnt(test)



### bldng_ar & ttl_ar & lnd_ar: building area, total floor area, land area

# Validation score was slightly better without this preprocessing.

train["bldng_ar"].head(10)

splits = (train, val, test)

# Rows with a missing building area, per split.
for frame in splits:
    print(frame.loc[frame["bldng_ar"].isnull()].shape)

# Rows where each area column is exactly zero, per split.
for area_col in ("bldng_ar", "ttl_ar", "lnd_ar"):
    for frame in splits:
        print(frame.loc[frame[area_col] == 0].shape)

# Building area vs. total floor area for rows with a non-zero building area.
bldng_ar_notnull = train.loc[train["bldng_ar"] != 0]
sns.lmplot(x="bldng_ar", y="ttl_ar", hue="fr_yn", data=bldng_ar_notnull, fit_reg=False)

# Same view restricted to the low end of both areas.
low_bldng_ar = train.loc[
    (train["bldng_ar"] > 0) & (train["bldng_ar"] < 4000)
    & (train["ttl_ar"] > 0) & (train["ttl_ar"] < 3000)
]
low_bldng_ar.shape

sns.lmplot(x="bldng_ar", y="ttl_ar", hue="fr_yn", data=low_bldng_ar, fit_reg=False)

# Add the land-area restriction and compare against land area.
low_bldng_ar = train.loc[
    (train["bldng_ar"] > 0) & (train["bldng_ar"] < 4000)
    & (train["ttl_ar"] > 0) & (train["ttl_ar"] < 3000)
    & (train["lnd_ar"] > 0) & (train["lnd_ar"] < 4000)
]

sns.lmplot(x="bldng_ar", y="lnd_ar", hue="fr_yn", data=low_bldng_ar, fit_reg=False, size=10)

sns.lmplot(x="ttl_ar", y="lnd_ar", hue="fr_yn", data=low_bldng_ar, fit_reg=False, size=10)

# Tighter limits on all three areas.
low_low_bldng_ar = train.loc[
    (train["bldng_ar"] > 0) & (train["bldng_ar"] < 1500)
    & (train["ttl_ar"] > 0) & (train["ttl_ar"] < 2000)
    & (train["lnd_ar"] > 0) & (train["lnd_ar"] < 2000)
]
low_low_bldng_ar.shape

sns.lmplot(x="bldng_ar", y="ttl_ar", hue="fr_yn", data=low_low_bldng_ar, fit_reg=False, size=10)



















train[train["bldng_ar"] == 0].shape

train["bldng_ar"].describe()

train["ttl_ar"].mean()

train["ttl_ar"].median()

# Replace zero areas with the column median of the same split.
# NOTE(review): the median is taken over the whole column, zeros included —
# confirm whether the zero rows should be excluded from it.
for frame in (train, val, test):
    for area_col in ("bldng_ar", "ttl_ar"):
        frame.loc[frame[area_col] == 0, area_col] = frame[area_col].median()

# Disabled experiment: floor-count proxy.
#train["floors"] = train["ttl_ar"] / train["bldng_ar"]
#train["floors"].head()

# Land area is not used as a feature.
# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.drop("lnd_ar", axis=1)
val = val.drop("lnd_ar", axis=1)
test = test.drop("lnd_ar", axis=1)



### dt_of_athrztn: building authorization date

def _year_prefix(dates):
    """Return the first four characters of each value's string form as a number.

    Values whose prefix is not numeric (including missing values) become NaN.
    """
    return pd.to_numeric(dates.astype(str).str[:4], errors="coerce")


def _authorization_year(dates):
    """Return the authorization year for each entry of ``dates`` as floats.

    A four-character prefix above 3000 is treated as the start of a
    two-digit-year date and mapped to 1900 + its first two digits; every other
    numeric prefix is taken as the year itself.  Unparseable values are NaN.
    """
    prefix = _year_prefix(dates)
    return prefix.where(~(prefix > 3000), 1900 + prefix // 100)


train["dt_of_athrztn"].head()

# How many training rows carry a four-digit year, a two-digit year, or no
# usable date at all.
train_prefix = _year_prefix(train["dt_of_athrztn"])
print(train[train_prefix < 3000].shape)
print(train[train_prefix > 3000].shape)
print(train[train_prefix.isnull()].shape)

# The same year extraction is applied to every split, so two-digit years are
# corrected in val and test exactly as in train.  Rows stay in place; no
# split/append round trip is needed.
for frame in (train, val, test):
    frame["year_athrztn"] = _authorization_year(frame["dt_of_athrztn"])

year_athrztn_median = train["year_athrztn"].median()
year_athrztn_median

# Missing years are filled with the median year of the same split, then the
# column is truncated to integers.
for frame in (train, val, test):
    years = frame["year_athrztn"]
    frame["year_athrztn"] = years.fillna(years.median()).astype(int)

train.shape

# Two-way age flag: authorized before 2000 -> "old", otherwise "new".
for frame in (train, val, test):
    frame["year_athrztn_encoded"] = np.where(frame["year_athrztn"] < 2000, "old", "new")

pd.pivot_table(train, index="year_athrztn_encoded", values="fr_yn")

# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.drop("dt_of_athrztn", axis=1)
val = val.drop("dt_of_athrztn", axis=1)
test = test.drop("dt_of_athrztn", axis=1)



## part 3

### ele_energy_us_YYYYMM 전기 에너지 사용량 (YYYY년 M월)



























## part5

### trgt_crtr: fire inspection target criteria

train[train["trgt_crtr"].notnull()].shape

# Frequency of each criterion per split (missing values excluded).
train["trgt_crtr"].value_counts()

pd.pivot_table(train, index="trgt_crtr", values="fr_yn")

val["trgt_crtr"].value_counts()

test["trgt_crtr"].value_counts()

# One 0/1 flag per criterion of interest; rows with any other (or missing)
# criterion get 0 in all three flags.
trgt_crtr_flags = [
    ("자동화재탐지설치대상", "auto_fr"),
    ("옥내소화전설치대상", "fireplug"),
    ("스프링클러,물분무등설치대상", "sprinkler"),
]

for frame in (train, val, test):
    for criterion, flag in trgt_crtr_flags:
        frame[flag] = (frame["trgt_crtr"] == criterion).astype(float)

# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.drop("trgt_crtr", axis=1)
val = val.drop("trgt_crtr", axis=1)
test = test.drop("trgt_crtr", axis=1)



### fr_fghtng_fclt_spcl_css_5_yn & fr_fghtng_fclt_spcl_css_6_yn: fire-fighting facility special case no. 5 / no. 6 flags

# Values are blank, N, or NA.

train[train["fr_fghtng_fclt_spcl_css_5_yn"].isnull()].shape

train[train["fr_fghtng_fclt_spcl_css_5_yn"].notnull()].shape

train[train["fr_fghtng_fclt_spcl_css_5_yn"]=="N"].shape

test[test["fr_fghtng_fclt_spcl_css_5_yn"].isnull()].shape

test[test["fr_fghtng_fclt_spcl_css_5_yn"].notnull()].shape

# Presence indicator for the no. 5 flag (train only; inspected, then dropped below).
train["css_5_yn_encoded"] = train["fr_fghtng_fclt_spcl_css_5_yn"].notnull().astype(float)

pd.pivot_table(train, index="css_5_yn_encoded", values="fr_yn")

# Presence indicator for the no. 6 flag.  Each split is encoded from its own
# column; val and test were previously masked with conditions built from train.
for frame in (train, val, test):
    frame["css_6_yn_encoded"] = frame["fr_fghtng_fclt_spcl_css_6_yn"].notnull().astype(float)

pd.pivot_table(train, index="css_6_yn_encoded", values="fr_yn")

# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
css_source_cols = ["fr_fghtng_fclt_spcl_css_5_yn", "fr_fghtng_fclt_spcl_css_6_yn"]
train = train.drop(css_source_cols, axis=1)
val = val.drop(css_source_cols, axis=1)
test = test.drop(css_source_cols, axis=1)

# The no. 5 indicator exists only on train and is not kept as a feature.
train = train.drop("css_5_yn_encoded", axis=1)



### us_yn: in-use flag

# Missing / present counts in train and test.
print(train[train["us_yn"].isnull()].shape)
print(train[train["us_yn"].notnull()].shape)
print(test[test["us_yn"].isnull()].shape)
print(test[test["us_yn"].notnull()].shape)

pd.pivot_table(train, index="us_yn", values="fr_yn")

train["us_yn"].value_counts()

test["us_yn"].value_counts()

# One indicator column per observed value, then remove the raw column.
# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.join(pd.get_dummies(train["us_yn"])).drop("us_yn", axis=1)
val = val.join(pd.get_dummies(val["us_yn"])).drop("us_yn", axis=1)
test = test.join(pd.get_dummies(test["us_yn"])).drop("us_yn", axis=1)

### dngrs_thng_yn: dangerous-goods target flag

# Missing / present counts in train and test.
print(train[train["dngrs_thng_yn"].isnull()].shape)
print(train[train["dngrs_thng_yn"].notnull()].shape)
print(test[test["dngrs_thng_yn"].isnull()].shape)
print(test[test["dngrs_thng_yn"].notnull()].shape)

train["dngrs_thng_yn"].value_counts()

test["dngrs_thng_yn"].value_counts()

pd.pivot_table(train, index="dngrs_thng_yn", values="fr_yn")

# Presence indicator: 1 when the flag is filled in, 0 when it is missing.
# Each split is encoded from its own column; val and test were previously
# masked with conditions built from train.
for frame in (train, val, test):
    frame["dngrs_thng_yn_encoded"] = frame["dngrs_thng_yn"].notnull().astype(float)

pd.pivot_table(train, index="dngrs_thng_yn_encoded", values="fr_yn")

# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.drop("dngrs_thng_yn", axis=1)
val = val.drop("dngrs_thng_yn", axis=1)
test = test.drop("dngrs_thng_yn", axis=1)



### slf_fr_brgd_yn: own fire brigade flag

# Missing / present counts in train and test.
print(train[train["slf_fr_brgd_yn"].isnull()].shape)
print(train[train["slf_fr_brgd_yn"].notnull()].shape)
print(test[test["slf_fr_brgd_yn"].isnull()].shape)
print(test[test["slf_fr_brgd_yn"].notnull()].shape)

train["slf_fr_brgd_yn"].value_counts()

test["slf_fr_brgd_yn"].value_counts()

# Fill missing values on train only so they show up as their own row in the
# pivot below; the column is dropped afterwards.
train.loc[train["slf_fr_brgd_yn"].isnull(), "slf_fr_brgd_yn"] = 0

pd.pivot_table(train, index="slf_fr_brgd_yn", values="fr_yn")

# The column is not used as a feature.
# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.drop("slf_fr_brgd_yn", axis=1)
val = val.drop("slf_fr_brgd_yn", axis=1)
test = test.drop("slf_fr_brgd_yn", axis=1)



### blk_dngrs_thng_mnfctr_yn: bulk dangerous-goods manufacturing site flag

# Missing / present counts in train and test.
print(train[train["blk_dngrs_thng_mnfctr_yn"].isnull()].shape)
print(train[train["blk_dngrs_thng_mnfctr_yn"].notnull()].shape)
print(test[test["blk_dngrs_thng_mnfctr_yn"].isnull()].shape)
print(test[test["blk_dngrs_thng_mnfctr_yn"].notnull()].shape)

train["blk_dngrs_thng_mnfctr_yn"].value_counts()

test["blk_dngrs_thng_mnfctr_yn"].value_counts()

# Fill missing values on train only so they show up as their own row in the
# pivot below; the column is dropped afterwards.
train.loc[train["blk_dngrs_thng_mnfctr_yn"].isnull(), "blk_dngrs_thng_mnfctr_yn"] = 0
pd.pivot_table(train, index="blk_dngrs_thng_mnfctr_yn", values="fr_yn")

# The column is not used as a feature.
# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.drop("blk_dngrs_thng_mnfctr_yn", axis=1)
val = val.drop("blk_dngrs_thng_mnfctr_yn", axis=1)
test = test.drop("blk_dngrs_thng_mnfctr_yn", axis=1)



### cltrl_hrtg_yn: cultural heritage flag

# Missing / present counts in train and test.
print(train[train["cltrl_hrtg_yn"].isnull()].shape)
print(train[train["cltrl_hrtg_yn"].notnull()].shape)
print(test[test["cltrl_hrtg_yn"].isnull()].shape)
print(test[test["cltrl_hrtg_yn"].notnull()].shape)

train["cltrl_hrtg_yn"].value_counts()

test["cltrl_hrtg_yn"].value_counts()

# The column is not used as a feature.
# `axis` by keyword: pandas 2.0 no longer accepts it positionally.
train = train.drop("cltrl_hrtg_yn", axis=1)
val = val.drop("cltrl_hrtg_yn", axis=1)
test = test.drop("cltrl_hrtg_yn", axis=1)



# Train





# Separate features from the target.  `axis` is passed by keyword: pandas 2.0
# no longer accepts it positionally.
X_train = train.drop(['fr_yn', 'dt_of_fr'], axis=1)
y_train = train['fr_yn']
X_val = val.drop(['fr_yn', 'dt_of_fr'], axis=1)
y_val = val['fr_yn']
test = test.drop(['dt_of_fr'], axis=1)

# Stack the three feature frames so that every remaining text column is
# integer-coded with one shared vocabulary.  Columns present in only some of
# the frames are filled with NaN by the concatenation.
n_train = len(X_train)
n_val = len(X_val)
df_all = pd.concat([X_train, X_val, test])

categorical_cols = df_all.select_dtypes(['object']).columns
for col in categorical_cols:
    # Missing values receive the code -1.
    df_all[col] = pd.Categorical(df_all[col]).codes

# Split back by explicit row counts; slicing with -len(test) would return an
# empty validation frame whenever test has no rows.  Remaining NaNs become -1.
X_train = df_all.iloc[:n_train].fillna(-1)
X_val = df_all.iloc[n_train:n_train + n_val].fillna(-1)
test = df_all.iloc[n_train + n_val:].fillna(-1)

from  sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_jobs=-1, n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

# F1 score on the validation split.
f1_score(y_val, y_pred)















