データを学習させるノートブック（予測値とモデル出力まで）

# 岡本編集

## 必要なライブラリインポート

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import cohen_kappa_score, make_scorer
import pandas as pd
import pickle
import numpy as np

## データ読み込み

In [9]:
train_df = pd.read_csv('./data/processed_train.csv')
train_df.head()

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2


In [10]:
test_df = pd.read_csv('./data/processed_test.csv')
test_df.head()

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0


## データ分割

In [11]:
def hold_out(df,columns):
    x = df.drop(columns=columns)
    y = df.score
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [12]:
X_train, X_test, y_train, y_test = hold_out(train_df, ['essay_id', 'full_text', 'score'])

## RandomForestClassifier + GridSearchCV

### 評価指標作成

In [None]:
kappa_score = make_scorer(cohen_kappa_score)

### 探索空間定義・学習

In [None]:
param_grid = {'n_estimators': [100,300,500,1000], 'max_depth': [1,3,5,7,10], 'random_state': [42]}

In [None]:
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring=kappa_score)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
print(f'train_set_score: {grid_search.score(X_train,y_train)}')
print(f'test_set_score: {grid_search.score(X_test,y_test)}')
print(f'best_params: {grid_search.best_params_}')
print(f'best_cross_validation: {grid_search.best_score_}')

### 精度確認

In [None]:
train_pred = grid_search.predict(X_train)
test_pred = grid_search.predict(X_test)

In [None]:
#訓練データの精度
print(f'訓練データ精度：{cohen_kappa_score(y_train, train_pred)}')
#テストデータ精度
print(f'テストデータ精度：{cohen_kappa_score(y_test, test_pred)}')

### 予測値出力

In [None]:
pred = grid_search.predict(test_df.drop(columns=['essay_id','full_text']))

In [None]:
submit = pd.concat([test_df.essay_id,pd.DataFrame(pred,columns=['score'])],axis=1)
submit

In [None]:
submit.to_csv('submission.csv',index=False)

### モデル出力

In [None]:
pickle.dump(grid_search,open('./model/RandomForestClassifier_1.pkl','wb'))

## RandomForest(GridSearchのパラメーターを使用)

In [None]:
clf = RandomForestClassifier(random_state=42,max_depth=5,n_estimators=500)

### 学習

In [None]:
clf.fit(X_train, y_train)

In [None]:
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

### 予測精度

In [None]:
#訓練データの精度
print(f'訓練データ精度：{cohen_kappa_score(y_train, train_pred)}')
#テストデータ精度
print(f'テストデータ精度：{cohen_kappa_score(y_test, test_pred)}')

### モデル出力

In [None]:
pickle.dump(clf,open('./model/RandomForestClassifier_2.pkl','wb'))

### 予測値出力

In [None]:
pred = clf.predict(train_df.drop(columns=['essay_id','full_text','score']))

In [None]:
#予測値を訓練データに結合
processed_train_df = pd.concat([train_df,pd.DataFrame(pred,columns=['pred'])],axis=1)
processed_train_df

In [None]:
processed_train_df.isnull().sum()

### 予測値格納後データフレーム格納

In [None]:
#予測値を結合した訓練データを"processed_df_by_rf"としてcsv出力
processed_train_df.to_csv('./data/processed_df_by_rf.csv',index=False)

## LogisticRegressor + GridSearchCV

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
bow_converter = CountVectorizer(token_pattern='(?u)\\b\\w+\\b')

In [47]:
X = bow_converter.fit_transform(train_df.full_text)

In [81]:
my_array = np.zeros((0,2))
for word, count in zip(bow_converter.get_feature_names_out(), X.toarray()[0, :]):
    tmp_array = np.array([word, count])
    my_array = np.vstack((my_array, tmp_array))

In [83]:
my_array.shape

(64613, 2)

In [85]:
word_df = pd.DataFrame(my_array,columns=['word','count'])

In [89]:
word_df.sort_values('count',ascending=False,inplace=True)

In [90]:
word_df

Unnamed: 0,word,count
4309,are,9
32592,is,8
61843,vauban,7
8961,can,7
64459,you,6
...,...,...
21580,equppied,0
21581,equptment,0
21582,equtmeant,0
21583,equvalent,0


In [98]:
word_df = word_df.astype({'count':'int'})

In [100]:
word_df.dtypes

word     object
count     int64
dtype: object