データを加工し、エッセイのスコアを予測するモデルを学習・評価するノートブック

# ライブラリのインポート

In [21]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk


# データ読み込み

In [22]:
# Data folder location. Defaults to "/data"; set the DATA_DIR environment
# variable to point the notebook at another folder without editing this cell.
dataPath = os.environ.get("DATA_DIR", "/data")
dataPath

'/data'

In [23]:
# Load the training data from train.csv inside the data folder
df = pd.read_csv(f"{dataPath}/train.csv")

In [24]:
# Display the loaded DataFrame
df

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


# 前処理

In [25]:
# Pull out the text column and the score column
texts, scores = df["full_text"], df["score"]

In [26]:
# Inspect the data
texts

0        Many people have car where they live. The thin...
1        I am a scientist at NASA that is discussing th...
2        People always wish they had the same technolog...
3        We all heard about Venus, the planet without a...
4        Dear, State Senator\n\nThis is a letter to arg...
                               ...                        
17302    the story " The Challenge of Exploing Venus " ...
17303    Technology has changed a lot of ways that we l...
17304    If you don't like sitting around all day than ...
17305    In "The Challenge of Exporing Venus," the auth...
17306    Venus is worthy place to study but dangerous. ...
Name: full_text, Length: 17307, dtype: object

In [27]:
# Download the NLTK 'punkt' data package
# NOTE(review): presumably needed by word_tokenize; newer NLTK releases may
# look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
# Lowercase every text, then split it into tokens
lowercased = (raw.lower() for raw in texts)
tokenized_texts = [word_tokenize(doc) for doc in lowercased]

In [29]:
# Preview only the first tokenized text; echoing the whole list would write
# the tokens of every text into the cell output.
tokenized_texts[:1]

[['many',
  'people',
  'have',
  'car',
  'where',
  'they',
  'live',
  '.',
  'the',
  'thing',
  'they',
  'do',
  "n't",
  'know',
  'is',
  'that',
  'when',
  'you',
  'use',
  'a',
  'car',
  'alot',
  'of',
  'thing',
  'can',
  'happen',
  'like',
  'you',
  'can',
  'get',
  'in',
  'accidet',
  'or',
  'the',
  'smoke',
  'that',
  'the',
  'car',
  'has',
  'is',
  'bad',
  'to',
  'breath',
  'on',
  'if',
  'someone',
  'is',
  'walk',
  'but',
  'in',
  'vauban',
  ',',
  'germany',
  'they',
  'dont',
  'have',
  'that',
  'proble',
  'because',
  '70',
  'percent',
  'of',
  'vauban',
  "'s",
  'families',
  'do',
  'not',
  'own',
  'cars',
  ',',
  'and',
  '57',
  'percent',
  'sold',
  'a',
  'car',
  'to',
  'move',
  'there',
  '.',
  'street',
  'parkig',
  ',',
  'driveways',
  'and',
  'home',
  'garages',
  'are',
  'forbidden',
  'on',
  'the',
  'outskirts',
  'of',
  'freiburd',
  'that',
  'near',
  'the',
  'french',
  'and',
  'swiss',
  'borders',
  '

In [30]:
# Train the Word2Vec model on the tokenized texts
# NOTE(review): multi-threaded training (workers > 1) may not be exactly
# reproducible between runs — confirm against the gensim docs if that matters.
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [31]:
# Function that computes the vector for each text
def text_to_vector(text, model):
    """Average the word vectors of the tokens in ``text``.

    Args:
        text: Iterable of tokens (an already tokenized text, not a raw string).
        model: Object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of the tokens found in ``model.wv``, or all zeros when no
        token is found (including empty ``text``).
    """
    vectors = [model.wv[word] for word in text if word in model.wv]
    if not vectors:
        # Use float32 zeros so the fallback has the same dtype as the averaged
        # embeddings; a float64 row would upcast the whole stacked matrix.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [32]:
# Compute one vector per tokenized text and stack them into a single array
doc_vectors = (text_to_vector(tokens, word2vec_model) for tokens in tokenized_texts)
text_vectors = np.array(list(doc_vectors))

In [33]:
# Inspect the data
text_vectors

array([[ 1.7239848e-01,  6.2031221e-01, -2.4378542e-02, ...,
        -2.0997146e-01,  2.0425516e-01,  1.0026397e-02],
       [-2.8495753e-01,  3.1978268e-02,  2.0065497e-01, ...,
        -1.8763369e-01,  1.6444379e-01, -4.2046809e-01],
       [ 4.9929059e-01,  5.5229056e-01,  9.9139325e-02, ...,
        -3.1018013e-01, -5.2972108e-02, -1.4470154e-01],
       ...,
       [-2.2777826e-04,  2.0795637e-01, -5.8998758e-01, ...,
        -2.1518406e-01,  1.0861227e-01, -6.9553487e-02],
       [-9.9636674e-02, -1.1509982e+00,  3.5189712e-01, ...,
        -2.1143794e-01,  1.3887411e-01, -4.3226078e-01],
       [ 1.5850356e-01,  1.5751782e-01,  2.1229060e-01, ...,
        -6.8230134e-01,  8.5946344e-02,  6.5155298e-02]], dtype=float32)

# 学習

In [34]:
# Split the data into a training set and a test set (20% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    text_vectors,
    scores,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Train a random forest classifier on the training split
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
classifier.fit(X_train, y_train)

# 予測と精度確認

In [36]:
# Predict on the test data
y_pred = classifier.predict(X_test)

In [37]:
# Evaluate the model with weighted kappa, once per weighting scheme
kappa_linear, kappa_quadratic = (
    cohen_kappa_score(y_test, y_pred, weights=scheme)
    for scheme in ("linear", "quadratic")
)

In [38]:
# Report both weighted kappa scores
for label, value in (
    ("Weighted Kappa 線型重み付け：", kappa_linear),
    ("Weighted Kappa 二乗重み付け：", kappa_quadratic),
):
    print(label, value)

Weighted Kappa 線型重み付け： 0.34576085577220805
Weighted Kappa 二乗重み付け： 0.5068541564065192


In [39]:
# (For reference) also evaluate the model by plain accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.4431
