データでモデルを学習させるノートブック（予測値とモデル出力まで）

# 岡本編集

## 必要なライブラリインポート

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import cohen_kappa_score, make_scorer
import pandas as pd
import pickle

## データ読み込み

In [30]:
# Read the preprocessed training data and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer='./data/processed_train.csv')
train_df.head(n=5)

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2


In [31]:
# Read the preprocessed test data and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer='./data/processed_test.csv')
test_df.head(n=5)

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0


## データ分割

In [32]:
def hold_out(df, columns, test_size=0.2, random_state=42):
    """Split ``df`` into hold-out train/test features and targets.

    Args:
        df: DataFrame that contains a ``score`` column (used as the target)
            in addition to the feature columns.
        columns: Column labels to drop from ``df``; whatever remains is the
            feature matrix. Include ``'score'`` here so the target is not
            also used as a feature.
        test_size: Forwarded to ``train_test_split``. The default of 0.2
            matches the previously hard-coded value.
        random_state: Seed forwarded to ``train_test_split``. The default of
            42 matches the previously hard-coded value.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` in the order produced by
        ``train_test_split``.
    """
    features = df.drop(columns=columns)
    target = df['score']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [33]:
# Drop the identifier, raw-text and target columns; the remaining columns
# become the features of the hold-out split.
X_train, X_test, y_train, y_test = hold_out(
    df=train_df,
    columns=['essay_id', 'full_text', 'score'],
)

## そのまま学習（デフォルト設定のベースライン）

In [34]:
# Baseline model: a random forest with library-default hyperparameters and a
# fixed seed so the run is repeatable.
clf = RandomForestClassifier(random_state=42)

In [35]:
# Fit the baseline forest on the training split only.
clf.fit(X_train, y_train)

In [36]:
# Baseline predictions for the training split and the held-out split.
train_pred, test_pred = clf.predict(X_train), clf.predict(X_test)

In [37]:
# Cohen's kappa of the baseline model on the training split
print(f'訓練データ精度：{cohen_kappa_score(y_train, train_pred)}')
# Cohen's kappa of the baseline model on the held-out split
print(f'テストデータ精度：{cohen_kappa_score(y_test, test_pred)}')

訓練データ精度：0.9922238694627876
テストデータ精度：0.30994044532622256


## ハイパーパラメーターチューニング

### 評価指標作成

In [38]:
# Wrap Cohen's kappa as a scorer object so it can be passed as `scoring=`.
# NOTE(review): this is the unweighted kappa. If the target metric is a
# weighted kappa (e.g. quadratic), `weights=` would need to be forwarded
# through make_scorer -- confirm against the evaluation metric.
kappa_score = make_scorer(cohen_kappa_score)

### GridSearchCV

In [39]:
# Search space: 4 forest sizes x 5 depth limits (20 candidates), with the
# seed pinned to a single value.
param_grid = {
    'n_estimators': [100, 300, 500, 1000],
    'max_depth': [1, 3, 5, 7, 10],
    'random_state': [42],
}

In [40]:
# 5-fold cross-validated grid search over `param_grid`, ranking candidates
# with the kappa scorer.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring=kappa_score,
    cv=5,
)

In [41]:
# Run the grid search on the training split only; the held-out split stays
# unseen until evaluation.
grid_search.fit(X_train, y_train)

In [42]:
# Report the tuned model: score on each split (evaluated with the scorer
# passed as `scoring`), the winning hyperparameters, and the best mean
# cross-validation score.
print(f'train_set_score: {grid_search.score(X_train,y_train)}')
print(f'test_set_score: {grid_search.score(X_test,y_test)}')
print(f'best_params: {grid_search.best_params_}')
print(f'best_cross_validation: {grid_search.best_score_}')

train_set_score: 0.41671774042756526
test_set_score: 0.38981859502169713
best_params: {'max_depth': 5, 'n_estimators': 500, 'random_state': 42}
best_cross_validation: 0.4073161086468035


## 精度確認

In [43]:
# Predictions from the tuned search object for both splits; these overwrite
# the baseline predictions held in the same names.
train_pred, test_pred = grid_search.predict(X_train), grid_search.predict(X_test)

In [44]:
# Cohen's kappa of the tuned model on the training split
print(f'訓練データ精度：{cohen_kappa_score(y_train, train_pred)}')
# Cohen's kappa of the tuned model on the held-out split
print(f'テストデータ精度：{cohen_kappa_score(y_test, test_pred)}')

訓練データ精度：0.41671774042756526
テストデータ精度：0.38981859502169713


## 予測値出力

In [63]:
# Predict scores for the submission set after removing the identifier and
# raw-text columns (there is no 'score' column to drop here).
pred = grid_search.predict(
    test_df.drop(['essay_id', 'full_text'], axis=1)
)

In [64]:
# Build the submission table by pairing each essay_id with its prediction
# positionally. `pred` is a plain array in test_df row order, so pairing by
# position avoids the index-label alignment that pd.concat(axis=1) performs,
# which would yield NaN rows if test_df ever had a non-default index.
submit = pd.DataFrame({
    'essay_id': test_df.essay_id.to_numpy(),
    'score': pred,
})
submit

Unnamed: 0,essay_id,score
0,000d118,4
1,000fe60,3
2,001ab80,4


In [62]:
# Write the submission file without the DataFrame index column.
submit.to_csv(path_or_buf='submission.csv', index=False)

## モデル出力

In [65]:
# Persist the fitted search object. The context manager guarantees the file
# handle is flushed and closed even if pickling fails.
# NOTE: pickle files can execute arbitrary code on load; only unpickle this
# file from a trusted source.
with open('./model/RandomForestClassifier_1.pkl', 'wb') as model_file:
    pickle.dump(grid_search, model_file)