In [19]:
import pandas as pd

# Read the training and test tables from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Show the first rows of the training table as a quick sanity check.
df_train.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [20]:
import datetime
from sklearn.preprocessing import LabelEncoder

# Pull the target variable out of the training frame: pop() returns the
# "revenue" column and removes it from df_train in a single step.
revenue = df_train.pop("revenue")

In [21]:
# Stack train and test so that each preprocessing step is applied to both at once.
df_whole = pd.concat([df_train, df_test], axis=0)

# Split Open Date into year, month and day columns.
# The dates are written month/day/year (e.g. 07/17/1999); spelling the format
# out keeps ambiguous values such as 03/09/2013 from being read day-first and
# makes a malformed date raise instead of being guessed at.
df_whole["Open Date"] = pd.to_datetime(df_whole["Open Date"], format="%m/%d/%Y")

# The vectorized .dt accessor replaces a Python-level lambda call per row.
open_date = df_whole["Open Date"].dt
df_whole["Year"] = open_date.year
df_whole["Month"] = open_date.month
df_whole["Day"] = open_date.day

In [22]:
# Convert City names to integer codes. The encoder is fitted on the combined
# train+test frame, so every city seen in either set receives a code.
le = LabelEncoder()
city_codes = le.fit_transform(df_whole["City"])
df_whole["City"] = city_codes

# Show the first rows to check the encoded column.
df_whole.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P31,P32,P33,P34,P35,P36,P37,Year,Month,Day
0,0,1999-07-17,60,Big Cities,IL,4,5.0,4.0,4.0,2,...,3,4,5,5,4,3,4,1999,7,17
1,1,2008-02-14,4,Big Cities,FC,4,5.0,4.0,4.0,1,...,0,0,0,0,0,0,0,2008,2,14
2,2,2013-03-09,14,Other,IL,2,4.0,2.0,5.0,2,...,0,0,0,0,0,0,0,2013,3,9
3,3,2012-02-02,52,Other,IL,6,4.5,6.0,6.0,4,...,12,10,6,18,12,12,6,2012,2,2
4,4,2009-05-09,21,Other,IL,3,4.0,3.0,4.0,2,...,1,3,2,3,4,3,3,2009,5,9


In [23]:
# Convert City Group and Type to integer codes.
CITY_GROUP_CODES = {"Other": 0, "Big Cities": 1}
TYPE_CODES = {"FC": 0, "IL": 1, "DT": 2, "MB": 3}

for column, codes in (("City Group", CITY_GROUP_CODES), ("Type", TYPE_CODES)):
    # Series.map silently turns any value missing from the dict into NaN.
    # Check first so an unexpected category -- or an accidental re-run of this
    # cell on already-encoded data -- fails here with a clear message.
    unmapped = set(df_whole[column].dropna().unique()) - set(codes)
    assert not unmapped, "{}: no integer code for {}".format(
        column, sorted(unmapped, key=str)
    )
    df_whole[column] = df_whole[column].map(codes)

# Split back into train and test. The row count is captured before df_train is
# rebound, so the test slice does not depend on the order of the two
# assignments below.
n_train = df_train.shape[0]
df_train = df_whole.iloc[:n_train]
df_test = df_whole.iloc[n_train:]

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Feature columns used for training: everything except the row identifier and
# the raw date (which is already represented by Year / Month / Day).
excluded_columns = {"Id", "Open Date"}
df_train_columns = [name for name in df_train.columns if name not in excluded_columns]

# Fit a random forest on the training features.
rf_params = dict(
    n_estimators=200,
    max_depth=5,
    max_features=0.5,
    random_state=449,
    n_jobs=-1,
)
rf = RandomForestRegressor(**rf_params)

train_features = df_train[df_train_columns]
rf.fit(train_features, revenue)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features=0.5, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
           oob_score=False, random_state=449, verbose=0, warm_start=False)

In [27]:
# Predict revenue for the test rows, using the same feature columns as training.
test_features = df_test[df_train_columns]
prediction = rf.predict(test_features)

# Pair each test Id with its prediction and write the submission file.
submission = pd.DataFrame({"Id": df_test["Id"], "Prediction": prediction})
submission.to_csv("TFI_submission.csv", index=False)