In [2]:
import pandas as pd
from datetime import datetime

In [3]:
# 学習用データと評価用データを読み込む
train_org = pd.read_csv('../data/smfg_train.csv', index_col=0)
test_org = pd.read_csv('../data/smfg_test.csv', index_col=0)

In [4]:
# train由来は0、test由来は1とする列を追加
train_org['is_test'] = 0
test_org['is_test'] = 1

# ユニオン前にカラムを揃える
# testには目的変数（health）が含まれていないので、ダミー変数を追加
test_org['health'] = -1  # 仮の値を割り当て

# trainとtestをユニオン（結合）
dataset = pd.concat([train_org, test_org], ignore_index=True)
dataset[['created_at', 'health', 'tree_dbh', 'steward', 'spc_common', 'nta_name', 'is_test']].head()

Unnamed: 0,created_at,health,tree_dbh,steward,spc_common,nta_name,is_test
0,2015-06-29,1,14,,English oak,Douglas Manor-Douglaston-Little Neck,0
1,2016-09-21,1,5,3or4,crimson king maple,Bedford Park-Fordham North,0
2,2015-09-13,2,26,,English oak,Annadale-Huguenot-Prince's Bay-Eltingville,0
3,2016-05-09,0,15,,honeylocust,Charleston-Richmond Valley-Tottenville,0
4,2016-06-24,1,23,,London planetree,Central Harlem North-Polo Grounds,0


### ここから

In [25]:
df = dataset.copy()
df.head()

Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_common,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,is_test
0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,English oak,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,0
1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,crimson king maple,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,0
2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,English oak,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,0
3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,honeylocust,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,0
4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,London planetree,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,0


In [26]:
df['health'].value_counts()


health
-1    19702
 1    15751
 0     3535
 2      698
Name: count, dtype: int64

In [27]:
df.isnull().sum()


created_at        0
tree_dbh          0
curb_loc          0
health            0
steward       29409
guards        29510
sidewalk          0
user_type         0
problems      24288
spc_common        0
spc_latin         0
nta               0
nta_name          0
borocode          0
boro_ct           0
boroname          0
zip_city          0
cb_num            0
st_senate         0
st_assem          0
cncldist          0
is_test           0
dtype: int64

In [28]:
df.fillna('None', inplace=True)

In [29]:
df.isnull().sum()


created_at    0
tree_dbh      0
curb_loc      0
health        0
steward       0
guards        0
sidewalk      0
user_type     0
problems      0
spc_common    0
spc_latin     0
nta           0
nta_name      0
borocode      0
boro_ct       0
boroname      0
zip_city      0
cb_num        0
st_senate     0
st_assem      0
cncldist      0
is_test       0
dtype: int64

In [30]:
df['problems'].value_counts()


problems
None                                    24288
Stones                                   4455
BranchLights                             2045
StonesBranchLights                       1529
BranchOther                               793
                                        ...  
StonesTrunkLightsBranchLights              15
StonesRootOtherTrunkOtherBranchOther       15
RootOtherWiresRopeTrunkLights              12
TrunkLightsBranchLightsBranchOther         12
WiresRopeTrunkLightsBranchLights            9
Name: count, Length: 74, dtype: int64

In [31]:
df['spc_common'].value_counts()


spc_common
London planetree       4339
pin oak                4254
cherry                 3345
Japanese zelkova       3016
littleleaf linden      2598
                       ... 
European beech            6
Kentucky yellowwood       5
pond cypress              3
Chinese chestnut          3
Himalayan cedar           1
Name: count, Length: 120, dtype: int64

In [32]:
df = pd.get_dummies(df, columns=['spc_common'], prefix='name', drop_first = True)
df.head()


Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,is_test,name_American beech,name_American elm,name_American hophornbeam,name_American hornbeam,name_American linden,name_Amur cork tree,name_Amur maackia,name_Amur maple,name_Atlantic white cedar,name_Atlas cedar,name_Callery pear,name_Chinese chestnut,name_Chinese elm,name_Chinese fringetree,name_Chinese tree lilac,name_Cornelian cherry,name_English oak,name_European beech,name_European hornbeam,name_Himalayan cedar,name_Japanese hornbeam,name_Japanese maple,name_Japanese snowbell,name_Japanese tree lilac,name_Japanese zelkova,name_Kentucky coffeetree,name_Kentucky yellowwood,name_London planetree,name_Norway maple,name_Norway spruce,name_Ohio buckeye,name_Oklahoma redbud,name_Persian ironwood,name_Schumard's oak,name_Shantung maple,name_Siberian elm,name_Sophora,name_Turkish hazelnut,name_arborvitae,name_ash,name_bald cypress,name_bigtooth aspen,name_black cherry,name_black locust,name_black oak,name_black walnut,name_blackgum,name_blue spruce,name_bur oak,name_catalpa,name_cherry,name_cockspur hawthorn,name_common hackberry,name_crab apple,name_crepe myrtle,name_crimson king maple,name_cucumber magnolia,name_dawn redwood,name_eastern cottonwood,name_eastern hemlock,name_eastern redbud,name_eastern redcedar,name_empress tree,name_false cypress,name_flowering dogwood,name_ginkgo,name_golden raintree,name_green ash,name_hardy rubber tree,name_hawthorn,name_hedge maple,name_holly,name_honeylocust,name_horse chestnut,name_katsura tree,name_kousa dogwood,name_littleleaf linden,name_magnolia,name_maple,name_mulberry,name_northern red oak,name_pagoda dogwood,name_paper birch,name_paperbark maple,name_pignut hickory,name_pin oak,name_pine,name_pitch pine,name_pond cypress,name_purple-leaf plum,name_quaking aspen,name_red horse chestnut,name_red maple,name_red pine,name_river birch,name_sassafras,name_sawtooth oak,name_scarlet oak,name_serviceberry,name_shingle oak,name_silver birch,name_silver linden,name_silver maple,name_southern magnolia,name_spruce,name_sugar maple,name_swamp white oak,name_sweetgum,name_sycamore maple,name_tartar maple,name_tree of heaven,name_trident maple,name_tulip-poplar,name_two-winged silverbell,name_weeping willow,name_white ash,name_white oak,name_white pine,name_willow oak
0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [33]:
df['curb_loc'] = df['curb_loc'].map({'OnCurb': 1, 'OffsetFromCurb': 0})
df['user_type'] = df['user_type'].map({'Volunteer': 1, 'NYC Parks Staff': 2, 'TreesCount Staff': 3})
df['steward'] = df['steward'].map({'1or2': 1, '3or4': 2, '4orMore': 3, 'None': 0})
df['guards'] = df['guards'].map({'Harmful': 2, 'Helpful': 3, 'Unsure': 1, 'None': 0})
df['sidewalk'] = df['sidewalk'].map({'Damage': 1, 'NoDamage': 0})
df.head()


Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,is_test,name_American beech,name_American elm,name_American hophornbeam,name_American hornbeam,name_American linden,name_Amur cork tree,name_Amur maackia,name_Amur maple,name_Atlantic white cedar,name_Atlas cedar,name_Callery pear,name_Chinese chestnut,name_Chinese elm,name_Chinese fringetree,name_Chinese tree lilac,name_Cornelian cherry,name_English oak,name_European beech,name_European hornbeam,name_Himalayan cedar,name_Japanese hornbeam,name_Japanese maple,name_Japanese snowbell,name_Japanese tree lilac,name_Japanese zelkova,name_Kentucky coffeetree,name_Kentucky yellowwood,name_London planetree,name_Norway maple,name_Norway spruce,name_Ohio buckeye,name_Oklahoma redbud,name_Persian ironwood,name_Schumard's oak,name_Shantung maple,name_Siberian elm,name_Sophora,name_Turkish hazelnut,name_arborvitae,name_ash,name_bald cypress,name_bigtooth aspen,name_black cherry,name_black locust,name_black oak,name_black walnut,name_blackgum,name_blue spruce,name_bur oak,name_catalpa,name_cherry,name_cockspur hawthorn,name_common hackberry,name_crab apple,name_crepe myrtle,name_crimson king maple,name_cucumber magnolia,name_dawn redwood,name_eastern cottonwood,name_eastern hemlock,name_eastern redbud,name_eastern redcedar,name_empress tree,name_false cypress,name_flowering dogwood,name_ginkgo,name_golden raintree,name_green ash,name_hardy rubber tree,name_hawthorn,name_hedge maple,name_holly,name_honeylocust,name_horse chestnut,name_katsura tree,name_kousa dogwood,name_littleleaf linden,name_magnolia,name_maple,name_mulberry,name_northern red oak,name_pagoda dogwood,name_paper birch,name_paperbark maple,name_pignut hickory,name_pin oak,name_pine,name_pitch pine,name_pond cypress,name_purple-leaf plum,name_quaking aspen,name_red horse chestnut,name_red maple,name_red pine,name_river birch,name_sassafras,name_sawtooth oak,name_scarlet oak,name_serviceberry,name_shingle oak,name_silver birch,name_silver linden,name_silver maple,name_southern magnolia,name_spruce,name_sugar maple,name_swamp white oak,name_sweetgum,name_sycamore maple,name_tartar maple,name_tree of heaven,name_trident maple,name_tulip-poplar,name_two-winged silverbell,name_weeping willow,name_white ash,name_white oak,name_white pine,name_willow oak
0,2015-06-29,14,1,1,0,0,1,1,,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2016-09-21,5,1,1,2,3,0,1,,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2015-09-13,26,1,2,0,0,0,1,StonesBranchLights,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2016-05-09,15,1,0,0,0,1,2,,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,2016-06-24,23,1,1,0,0,0,1,Stones,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [34]:
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,is_test,name_American beech,name_American elm,name_American hophornbeam,name_American hornbeam,name_American linden,name_Amur cork tree,name_Amur maackia,name_Amur maple,name_Atlantic white cedar,name_Atlas cedar,name_Callery pear,name_Chinese chestnut,name_Chinese elm,name_Chinese fringetree,name_Chinese tree lilac,name_Cornelian cherry,name_English oak,name_European beech,name_European hornbeam,name_Himalayan cedar,name_Japanese hornbeam,name_Japanese maple,name_Japanese snowbell,name_Japanese tree lilac,name_Japanese zelkova,name_Kentucky coffeetree,name_Kentucky yellowwood,name_London planetree,name_Norway maple,name_Norway spruce,name_Ohio buckeye,name_Oklahoma redbud,name_Persian ironwood,name_Schumard's oak,name_Shantung maple,name_Siberian elm,name_Sophora,name_Turkish hazelnut,name_arborvitae,name_ash,name_bald cypress,name_bigtooth aspen,name_black cherry,name_black locust,name_black oak,name_black walnut,name_blackgum,name_blue spruce,name_bur oak,name_catalpa,name_cherry,name_cockspur hawthorn,name_common hackberry,name_crab apple,name_crepe myrtle,name_crimson king maple,name_cucumber magnolia,name_dawn redwood,name_eastern cottonwood,name_eastern hemlock,name_eastern redbud,name_eastern redcedar,name_empress tree,name_false cypress,name_flowering dogwood,name_ginkgo,name_golden raintree,name_green ash,name_hardy rubber tree,name_hawthorn,name_hedge maple,name_holly,name_honeylocust,name_horse chestnut,name_katsura tree,name_kousa dogwood,name_littleleaf linden,name_magnolia,name_maple,name_mulberry,name_northern red oak,name_pagoda dogwood,name_paper birch,name_paperbark maple,name_pignut hickory,name_pin oak,name_pine,name_pitch pine,name_pond cypress,name_purple-leaf plum,name_quaking aspen,name_red horse chestnut,name_red maple,name_red pine,name_river birch,name_sassafras,name_sawtooth oak,name_scarlet oak,name_serviceberry,name_shingle oak,name_silver birch,name_silver linden,name_silver maple,name_southern magnolia,name_spruce,name_sugar maple,name_swamp white oak,name_sweetgum,name_sycamore maple,name_tartar maple,name_tree of heaven,name_trident maple,name_tulip-poplar,name_two-winged silverbell,name_weeping willow,name_white ash,name_white oak,name_white pine,name_willow oak
0,2015-06-29,14,1,1,0,0,1,1,,Quercus robur,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2016-09-21,5,1,1,2,3,0,1,,Acer platanoides 'Crimson King',BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2015-09-13,26,1,2,0,0,0,1,StonesBranchLights,Quercus robur,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2016-05-09,15,1,0,0,0,1,2,,Gleditsia triacanthos var. inermis,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,2016-06-24,23,1,1,0,0,0,1,Stones,Platanus x acerifolia,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [35]:
df.describe()


Unnamed: 0,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,borocode,boro_ct,cb_num,st_senate,st_assem,cncldist,is_test
count,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0,39686.0
mean,11.64781,0.939122,-0.06438,0.313335,0.660888,0.466386,1.724311,3.360203,3405672.0,343.18921,21.052487,51.744494,30.281485,0.496447
std,8.202106,0.239109,0.979576,0.571127,1.185038,0.498875,0.878662,1.227749,1235439.0,121.851725,7.198739,18.657595,14.773305,0.499994
min,1.0,0.0,-1.0,0.0,0.0,0.0,1.0,1.0,1000202.0,101.0,10.0,23.0,1.0,0.0
25%,5.0,1.0,-1.0,0.0,0.0,0.0,1.0,3.0,3005900.0,301.0,15.0,33.0,19.0,0.0
50%,9.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,4012100.0,402.0,22.0,55.0,31.0,0.0
75%,17.0,1.0,1.0,1.0,1.0,1.0,3.0,4.0,4119500.0,413.0,25.0,64.0,44.0,1.0
max,44.0,1.0,2.0,3.0,3.0,1.0,3.0,5.0,5031902.0,503.0,36.0,87.0,51.0,1.0


In [36]:
df.loc[df['guards'].isnull()]


Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,is_test,name_American beech,name_American elm,name_American hophornbeam,name_American hornbeam,name_American linden,name_Amur cork tree,name_Amur maackia,name_Amur maple,name_Atlantic white cedar,name_Atlas cedar,name_Callery pear,name_Chinese chestnut,name_Chinese elm,name_Chinese fringetree,name_Chinese tree lilac,name_Cornelian cherry,name_English oak,name_European beech,name_European hornbeam,name_Himalayan cedar,name_Japanese hornbeam,name_Japanese maple,name_Japanese snowbell,name_Japanese tree lilac,name_Japanese zelkova,name_Kentucky coffeetree,name_Kentucky yellowwood,name_London planetree,name_Norway maple,name_Norway spruce,name_Ohio buckeye,name_Oklahoma redbud,name_Persian ironwood,name_Schumard's oak,name_Shantung maple,name_Siberian elm,name_Sophora,name_Turkish hazelnut,name_arborvitae,name_ash,name_bald cypress,name_bigtooth aspen,name_black cherry,name_black locust,name_black oak,name_black walnut,name_blackgum,name_blue spruce,name_bur oak,name_catalpa,name_cherry,name_cockspur hawthorn,name_common hackberry,name_crab apple,name_crepe myrtle,name_crimson king maple,name_cucumber magnolia,name_dawn redwood,name_eastern cottonwood,name_eastern hemlock,name_eastern redbud,name_eastern redcedar,name_empress tree,name_false cypress,name_flowering dogwood,name_ginkgo,name_golden raintree,name_green ash,name_hardy rubber tree,name_hawthorn,name_hedge maple,name_holly,name_honeylocust,name_horse chestnut,name_katsura tree,name_kousa dogwood,name_littleleaf linden,name_magnolia,name_maple,name_mulberry,name_northern red oak,name_pagoda dogwood,name_paper birch,name_paperbark maple,name_pignut hickory,name_pin oak,name_pine,name_pitch pine,name_pond cypress,name_purple-leaf plum,name_quaking aspen,name_red horse chestnut,name_red maple,name_red pine,name_river birch,name_sassafras,name_sawtooth oak,name_scarlet oak,name_serviceberry,name_shingle oak,name_silver birch,name_silver linden,name_silver maple,name_southern magnolia,name_spruce,name_sugar maple,name_swamp white oak,name_sweetgum,name_sycamore maple,name_tartar maple,name_tree of heaven,name_trident maple,name_tulip-poplar,name_two-winged silverbell,name_weeping willow,name_white ash,name_white oak,name_white pine,name_willow oak


In [37]:
df.fillna('NULL', inplace=True)

In [38]:
df.loc[df['guards'].isnull()]


Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,spc_latin,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist,is_test,name_American beech,name_American elm,name_American hophornbeam,name_American hornbeam,name_American linden,name_Amur cork tree,name_Amur maackia,name_Amur maple,name_Atlantic white cedar,name_Atlas cedar,name_Callery pear,name_Chinese chestnut,name_Chinese elm,name_Chinese fringetree,name_Chinese tree lilac,name_Cornelian cherry,name_English oak,name_European beech,name_European hornbeam,name_Himalayan cedar,name_Japanese hornbeam,name_Japanese maple,name_Japanese snowbell,name_Japanese tree lilac,name_Japanese zelkova,name_Kentucky coffeetree,name_Kentucky yellowwood,name_London planetree,name_Norway maple,name_Norway spruce,name_Ohio buckeye,name_Oklahoma redbud,name_Persian ironwood,name_Schumard's oak,name_Shantung maple,name_Siberian elm,name_Sophora,name_Turkish hazelnut,name_arborvitae,name_ash,name_bald cypress,name_bigtooth aspen,name_black cherry,name_black locust,name_black oak,name_black walnut,name_blackgum,name_blue spruce,name_bur oak,name_catalpa,name_cherry,name_cockspur hawthorn,name_common hackberry,name_crab apple,name_crepe myrtle,name_crimson king maple,name_cucumber magnolia,name_dawn redwood,name_eastern cottonwood,name_eastern hemlock,name_eastern redbud,name_eastern redcedar,name_empress tree,name_false cypress,name_flowering dogwood,name_ginkgo,name_golden raintree,name_green ash,name_hardy rubber tree,name_hawthorn,name_hedge maple,name_holly,name_honeylocust,name_horse chestnut,name_katsura tree,name_kousa dogwood,name_littleleaf linden,name_magnolia,name_maple,name_mulberry,name_northern red oak,name_pagoda dogwood,name_paper birch,name_paperbark maple,name_pignut hickory,name_pin oak,name_pine,name_pitch pine,name_pond cypress,name_purple-leaf plum,name_quaking aspen,name_red horse chestnut,name_red maple,name_red pine,name_river birch,name_sassafras,name_sawtooth oak,name_scarlet oak,name_serviceberry,name_shingle oak,name_silver birch,name_silver linden,name_silver maple,name_southern magnolia,name_spruce,name_sugar maple,name_swamp white oak,name_sweetgum,name_sycamore maple,name_tartar maple,name_tree of heaven,name_trident maple,name_tulip-poplar,name_two-winged silverbell,name_weeping willow,name_white ash,name_white oak,name_white pine,name_willow oak


In [39]:
df.columns

Index(['created_at', 'tree_dbh', 'curb_loc', 'health', 'steward', 'guards',
       'sidewalk', 'user_type', 'problems', 'spc_latin',
       ...
       'name_tartar maple', 'name_tree of heaven', 'name_trident maple',
       'name_tulip-poplar', 'name_two-winged silverbell',
       'name_weeping willow', 'name_white ash', 'name_white oak',
       'name_white pine', 'name_willow oak'],
      dtype='object', length=140)

In [49]:
# 'is_test' 列を使って元の train と test データセットに戻す
train = df[df['is_test'] == 0].drop(columns=['is_test'])
test = df[df['is_test'] == 1].drop(columns=['is_test', 'health'])

In [50]:
columns_to_drop = ['created_at', 'problems', 'spc_latin', 'nta', 'nta_name', 'boroname', 'zip_city']
X = train.drop(columns_to_drop, axis=1)
y = train['health']


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)


In [60]:
y_train.value_counts(normalize=True)


health
1    0.788164
0    0.176875
2    0.034961
Name: proportion, dtype: float64

In [61]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)


(0.7881638644248732, 0.7882305844675741)

In [62]:
rfc_1 = RandomForestClassifier(random_state = 42)
rfc_1.fit(X_train, y_train)
rfc_1.score(X_train, y_train), rfc_1.score(X_test, y_test) 


(1.0, 0.9973979183346677)

In [56]:
rfc_2 = RandomForestClassifier(n_estimators = 100, max_depth = 6, random_state = 42)
rfc_2.fit(X_train, y_train)
rfc_2.score(X_train, y_train), rfc_2.score(X_test, y_test)


(0.9599679743795037, 0.9575660528422738)

In [57]:
dt_1 = DecisionTreeClassifier(random_state = 42)
dt_1.fit(X_train, y_train)
dt_1.score(X_train, y_train), dt_1.score(X_test, y_test) 


(1.0, 1.0)

In [58]:
dt_2 = DecisionTreeClassifier(max_depth = 5, 
                            min_samples_split = 10, 
                            min_samples_leaf = 3, 
                            random_state = 42)
dt_2.fit(X_train, y_train)
dt_2.score(X_train, y_train), dt_2.score(X_test, y_test) 


(1.0, 1.0)