In [12]:
#%%
import numpy as np
import pandas as pd
import pickle
import category_encoders as ce

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [13]:

db_genba = pd.read_csv("train_genba.tsv",encoding="utf8",sep="\t")
db_goto = pd.read_csv("train_goto.tsv",encoding="utf8",sep="\t")

#%%
db = pd.merge(db_genba, db_goto, on='pj_no', how='left')

#%%
Y = db["keiyaku_pr"]
X = db.drop("keiyaku_pr",axis=1)

X = X.fillna(0)
Y = Y.fillna(0)



In [14]:
#%%

#タイプがobject(文字列)の列list
list_cols = X.columns[X.dtypes == "object"]
list_cols = list(list_cols)

#%%

# OneHotEncodeしたい列を指定。Nullや不明の場合の補完方法も指定。
ce_oe = ce.OrdinalEncoder(cols=list_cols,handle_unknown='impute')
X = ce_oe.fit_transform(X)



#%%



In [16]:
N_train = int(len(X.index) * 0.8)
N_test = len(X.index) - N_train

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=N_test,shuffle = False)

tuned_parameters = [{'n_estimators':[200,300]}]
clf = GridSearchCV(
    RandomForestRegressor(),
    tuned_parameters,
    cv=3,
    scoring='neg_median_absolute_error',
    n_jobs=-1
)

clf.fit(X_train,list(Y_train))

clf = clf.best_estimator_


#%%


In [19]:
pred = clf.predict(X_test)

In [34]:
def cal_mape(A,B):
    return(np.sum(np.abs((A-B)/B))*(100/len(A)))

In [35]:
cal_mape(pred,Y_test.values)

12.192776300696613

In [61]:
db_genba_t = pd.read_csv("test_genba.tsv",encoding="utf8",sep="\t")
db_goto_t = pd.read_csv("test_goto.tsv",encoding="utf8",sep="\t")

#%%
X_t = pd.merge(db_genba_t, db_goto_t, on='pj_no', how='left')

X_t = ce_oe.fit_transform(X_t)
X_t = X_t.fillna(0)

In [69]:
pred = clf.predict(X_t)
pred = np.round(pred)
print(pred)

[20702799. 26339844. 26581019. ... 29668359. 29219233. 27205090.]


In [70]:
pred = pred.astype(np.int64)
print(pred)

[20702799 26339844 26581019 ... 29668359 29219233 27205090]


In [71]:
db_sample = pd.read_csv("sample_submit.tsv",encoding="utf8",sep="\t", names=('name', 'keiyaku_pr'))

In [72]:
db_sample["keiyaku_pr"] = pred

In [73]:
db_sample

Unnamed: 0,name,keiyaku_pr
0,test_0000,20702799
1,test_0001,26339844
2,test_0002,26581019
3,test_0003,27193308
4,test_0004,27032226
5,test_0005,26582044
6,test_0006,26957933
7,test_0007,26803754
8,test_0008,26380478
9,test_0009,28603233


In [75]:
db_sample.to_csv('sample_submit.tsv',header=False,index=False,sep="\t")