In [2]:
########## 2回目
import pandas as pd
import numpy as np
import sys
sys.path.append("../_utils") # システムパスに対象ディレクトリを追加
import utils

train = pd.read_csv('./data/train.csv', index_col=0)
test = pd.read_csv('./data/test.csv', index_col=0)
sample_submit = pd.read_csv('./data/sample_submission.csv', index_col=0, header=None)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# カテゴリカルな特徴量を数値に変換する
categorical_cols = ['curb_loc','steward','guards','sidewalk','user_type','problems','spc_common','spc_latin','nta','nta_name','boroname','zip_city']

train = utils.bulk_label_encoding(train, categorical_cols)
test = utils.bulk_label_encoding(test, categorical_cols)

# 訓練データとテストデータに分割
# stratifyを設定して目的変数の分布を保ったまま分割
train, valid = train_test_split(train, test_size=0.2, stratify=train['health'], random_state=82)

# 使用する特徴量の選択
target_cols =['guards','problems','steward','spc_common','sidewalk','user_type','spc_latin']

# 目的変数とそれ以外に学習用データを分割
x_train = train[target_cols]
y_train = train['health']
x_valid = valid[target_cols]
y_valid = valid['health']
x_test = test[target_cols]

# モデルを作成して訓練
model = RandomForestClassifier()
model.fit(x_train, y_train)
print(x_valid)
# validの特徴量で予測
valid_predictions = model.predict(x_valid)
valid_f1 = f1_score(y_valid, valid_predictions, average='macro')
print(f"Validation F1 Score (Macro): {valid_f1}")

# 予測結果をCSVファイルとして保存
pred = model.predict(x_test)
sample_submit[1] = pred
sample_submit.to_csv('./submit/submit2.csv', header=None)

       guards  problems  steward  spc_common  sidewalk  user_type  spc_latin
17481       1        73        1          51         0          2         85
19264       1        73        0          37         0          2        105
5000        3        15        3          56         0          1          7
8185        3        73        3          28         1          1         81
10579       3        73        3          19         0          2         22
...       ...       ...      ...         ...       ...        ...        ...
4565        3        27        3          93         0          2          9
1742        3         0        3          26         0          1         53
37          3        73        0          56         1          2          7
14201       1        73        1          11         0          2         89
15441       3        73        3           5         0          1        112

[3997 rows x 7 columns]
Validation F1 Score (Macro): 0.31658116227382654


In [3]:
########## 3回目
import pandas as pd
import numpy as np
import sys
sys.path.append("../_utils") # システムパスに対象ディレクトリを追加
import utils

train = pd.read_csv('./data/train.csv', index_col=0)
test = pd.read_csv('./data/test.csv', index_col=0)
sample_submit = pd.read_csv('./data/sample_submission.csv', index_col=0, header=None)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# 欠損値あるカラムを可能な限り穴埋め
train["steward"] = train["steward"].fillna(method = 'bfill')
train["guards"] = train["guards"].fillna(method = 'bfill')
train["problems"] = train["problems"].fillna(method = 'bfill')
test["steward"] = test["steward"].fillna(method = 'bfill')
test["guards"] = test["guards"].fillna(method = 'bfill')
test["problems"] = test["problems"].fillna(method = 'bfill')

# カテゴリカルな特徴量を数値に変換する
categorical_cols = ['curb_loc','steward','guards','sidewalk','user_type','problems','spc_common','spc_latin','nta','nta_name','boroname','zip_city']

train = utils.bulk_label_encoding(train, categorical_cols)
test = utils.bulk_label_encoding(test, categorical_cols)

# 訓練データとテストデータに分割
# stratifyを設定して目的変数の分布を保ったまま分割
train, valid = train_test_split(train, test_size=0.2, stratify=train['health'], random_state=82)

# 使用する特徴量の選択
target_cols =['guards','problems','steward','spc_common','sidewalk','user_type','spc_latin']

# 目的変数とそれ以外に学習用データを分割
x_train = train[target_cols]
y_train = train['health']
x_valid = valid[target_cols]
y_valid = valid['health']
x_test = test[target_cols]

# モデルを作成して訓練
model = RandomForestClassifier()
model.fit(x_train, y_train)
print(x_valid)
# validの特徴量で予測
valid_predictions = model.predict(x_valid)
valid_f1 = f1_score(y_valid, valid_predictions, average='macro')
print(f"Validation F1 Score (Macro): {valid_f1}")

# 予測結果をCSVファイルとして保存
pred = model.predict(x_test)
sample_submit[1] = pred
sample_submit.to_csv('./submit/submit3.csv', header=None)

       guards  problems  steward  spc_common  sidewalk  user_type  spc_latin
17481       1        27        1          51         0          2         85
19264       1         2        0          37         0          2        105
5000        1        15        0          56         0          1          7
8185        1         0        0          28         1          1         81
10579       1        48        0          19         0          2         22
...       ...       ...      ...         ...       ...        ...        ...
4565        1        27        1          93         0          2          9
1742        1         0        1          26         0          1         53
37          1        51        0          56         1          2          7
14201       1        27        1          11         0          2         89
15441       1         0        0           5         0          1        112

[3997 rows x 7 columns]
Validation F1 Score (Macro): 0.32622014408754235
