In [5]:
import pandas as pd
from datetime import datetime

# Load the training and evaluation (test) data sets; in both files the first
# CSV column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/smfg_train.csv', '../data/smfg_test.csv')
)

In [7]:
# Preview the first rows of a few training columns.
# NOTE(review): 'created_at' is listed twice, so it is shown as two identical
# columns — presumably unintentional; confirm before removing.
train.head()[['created_at', 'created_at', 'health', 'tree_dbh', 'steward', 'spc_common', 'nta_name']]

Unnamed: 0,created_at,created_at.1,health,tree_dbh,steward,spc_common,nta_name
0,2015-06-29,2015-06-29,1,14,,English oak,Douglas Manor-Douglaston-Little Neck
1,2016-09-21,2016-09-21,1,5,3or4,crimson king maple,Bedford Park-Fordham North
2,2015-09-13,2015-09-13,2,26,,English oak,Annadale-Huguenot-Prince's Bay-Eltingville
3,2016-05-09,2016-05-09,0,15,,honeylocust,Charleston-Richmond Valley-Tottenville
4,2016-06-24,2016-06-24,1,23,,London planetree,Central Harlem North-Polo Grounds


In [8]:
# Preview the first rows of the same columns in the test set (no 'health' here).
# NOTE(review): 'created_at' is listed twice, so it is shown as two identical
# columns — presumably unintentional; confirm before removing.
test.head()[['created_at', 'created_at', 'tree_dbh', 'steward', 'spc_common', 'nta_name']]

Unnamed: 0,created_at,created_at.1,tree_dbh,steward,spc_common,nta_name
19984,2015-09-08,2015-09-08,15,,Callery pear,Sheepshead Bay-Gerritsen Beach-Manhattan Beach
19985,2015-10-15,2015-10-15,5,1or2,cherry,Woodlawn-Wakefield
19986,2016-08-03,2016-08-03,4,,littleleaf linden,Kew Gardens
19987,2015-08-06,2015-08-06,7,,dawn redwood,Brooklyn Heights-Cobble Hill
19988,2015-10-21,2015-10-21,6,1or2,purple-leaf plum,Bedford


In [9]:
# List the column labels available in the test set.
test.columns

Index(['created_at', 'tree_dbh', 'curb_loc', 'steward', 'guards', 'sidewalk',
       'user_type', 'problems', 'spc_common', 'spc_latin', 'nta', 'nta_name',
       'borocode', 'boro_ct', 'boroname', 'zip_city', 'cb_num', 'st_senate',
       'st_assem', 'cncldist'],
      dtype='object')

In [10]:
# Count the missing values in every training column.
train.isna().sum()

created_at        0
tree_dbh          0
curb_loc          0
health            0
steward       14883
guards        14943
sidewalk          0
user_type         0
problems      12243
spc_common        0
spc_latin         0
nta               0
nta_name          0
borocode          0
boro_ct           0
boroname          0
zip_city          0
cb_num            0
st_senate         0
st_assem          0
cncldist          0
dtype: int64

In [11]:
# Frequency of each 'steward' level. value_counts() leaves out missing values
# by default, so only the rows that have a steward value are counted here.
train['steward'].value_counts()


steward
1or2       3999
3or4       1079
4orMore      23
Name: count, dtype: int64

In [12]:
# Replace every missing value in the training frame with the string
# placeholder 'NULL' so that missingness becomes an ordinary category.
train = train.fillna('NULL')


In [13]:
# Number of distinct values in 'problems'. Missing entries were replaced by the
# string 'NULL' in the previous cell, so that placeholder is counted as one of
# the distinct values.
train['problems'].nunique()


74

In [14]:
# Binary flag: 0 where 'problems' holds the 'NULL' placeholder, 1 otherwise.
train['bool_problems'] = train['problems'].ne('NULL').astype('int64')

In [15]:
# Preview the first rows of selected training columns together with the
# bool_problems flag.
# NOTE(review): 'created_at' is listed twice, so it is shown as two identical
# columns — presumably unintentional; confirm before removing.
train.head()[[
    'created_at', 'created_at', 'health',
    'tree_dbh', 'steward', 'spc_common',
    'nta_name', 'bool_problems',
]]

Unnamed: 0,created_at,created_at.1,health,tree_dbh,steward,spc_common,nta_name,bool_problems
0,2015-06-29,2015-06-29,1,14,,English oak,Douglas Manor-Douglaston-Little Neck,0
1,2016-09-21,2016-09-21,1,5,3or4,crimson king maple,Bedford Park-Fordham North,0
2,2015-09-13,2015-09-13,2,26,,English oak,Annadale-Huguenot-Prince's Bay-Eltingville,1
3,2016-05-09,2016-05-09,0,15,,honeylocust,Charleston-Richmond Valley-Tottenville,0
4,2016-06-24,2016-06-24,1,23,,London planetree,Central Harlem North-Polo Grounds,1


In [16]:
# Apply the same preparation to the test set: fill missing values with the
# 'NULL' placeholder, then flag rows whose 'problems' is not that placeholder.
test = test.fillna('NULL')
test['bool_problems'] = test['problems'].ne('NULL').astype('int64')


In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. Passing `stratify` keeps the
# distribution of the target variable the same in both parts.
# NOTE: `train` is rebound to the reduced frame, so re-running this cell alone
# splits the already-split data again.
train, valid = train_test_split(
    train,
    test_size=0.2,
    stratify=train['health'],
    random_state=82,
)


In [19]:
# Features used by the model.
select_cols = ['tree_dbh', 'curb_loc', 'sidewalk', 'steward', 'guards', 'user_type', 'bool_problems']

# Split the training and validation frames into features and target.
x_train = train[select_cols]
y_train = train['health']
x_valid = valid[select_cols]
y_valid = valid['health']

# Categorical columns cannot be fed to the model as they are, so convert them
# to numeric indicator columns with one-hot encoding.
x_train = pd.get_dummies(x_train)

# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding each frame independently can yield different column
# sets (e.g. a rare category missing from the validation split). Align the
# validation and test frames to the training columns: same set, same order,
# absent categories become all-zero columns, and categories never seen in
# training are dropped.
x_valid = pd.get_dummies(x_valid).reindex(columns=x_train.columns, fill_value=0)
test = pd.get_dummies(test[select_cols]).reindex(columns=x_train.columns, fill_value=0)


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Fix the seed so the fitted forest, the validation score and the resulting
# predictions are reproducible across runs; 82 matches the seed used for the
# train/validation split.
model = RandomForestClassifier(random_state=82)
model.fit(x_train, y_train)

# Evaluate on the held-out validation rows with the macro-averaged F1 score
# (each class contributes equally, regardless of its frequency).
valid_predictions = model.predict(x_valid)
valid_f1 = f1_score(y_valid, valid_predictions, average='macro')
print(f"Validation F1 Score (Macro): {valid_f1}")


Validation F1 Score (Macro): 0.3185537225121036


In [21]:
# Predict the target for the one-hot encoded test features and print the
# first five predictions as a quick sanity check.
pred = model.predict(test)
print(pred[:5])


[1 1 1 1 1]


In [23]:
# Load the sample submission file: it has no header row and its first column
# is used as the index.
sample_submit = pd.read_csv(
    '../data/sample_submission.csv',
    header=None,
    index_col=0,
)
# Store the predictions under column label 1 and write the result without a
# header row.
sample_submit[1] = pred
sample_submit.to_csv('../submit/submit1.csv', header=None)