データを加工するノートブック

# 秋野編集

## ライブラリのインポート

In [30]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk


## データ読み込み

In [31]:
# Location of the data folder
dataPath = "/data"
dataPath

'/data'

In [32]:
# Load the training data
df = pd.read_csv(dataPath + "/train.csv")
# Keep the target column as its own Series
scores = df["score"]

## 前処理

In [33]:
# Download the NLTK 'punkt' data package
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [34]:
# 各テキストのベクトルを計算する関数
def text_to_vector(text, model):
    """Average the embeddings of the tokens in ``text``.

    Args:
        text: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and lookup by token)
            and ``vector_size``.

    Returns:
        The element-wise mean of the embeddings of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    known_vectors = []
    for token in text:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # No known token: fall back to a zero vector of the embedding size.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [85]:
def get_text_vectors(train_df, vector_size=100, window=5, min_count=1, workers=4, seed=1):
    """Train a Word2Vec model on the essays and embed every essay with it.

    Each entry of ``train_df["full_text"]`` is lower-cased and tokenized with
    ``word_tokenize``. A Word2Vec model is trained on the resulting token
    lists, and every essay is then reduced to a single vector with
    ``text_to_vector``.

    Args:
        train_df: DataFrame with a ``full_text`` column of strings.
        vector_size: Embedding dimensionality passed to ``Word2Vec``.
        window: Context window size passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.
        workers: Number of training threads passed to ``Word2Vec``.
        seed: Random seed passed to ``Word2Vec`` (1 is gensim's own default,
            so the default call behaves as before).

    Returns:
        tuple: ``(text_vectors, word2vec_model)`` where ``text_vectors`` is an
        array with one entry per essay and ``word2vec_model`` is the trained
        model, to be reused for embedding other data.

    Note:
        Per the gensim documentation, a fully reproducible run also needs
        ``workers=1`` (and a fixed ``PYTHONHASHSEED``). With several workers
        the embedding can differ between runs, so a classifier fitted on one
        run's vectors should not be used with another run's model.
    """
    # Only the essay texts are needed here (scores are not used).
    texts = train_df["full_text"]

    # Lower-case and tokenize every essay.
    tokenized_texts = [word_tokenize(text.lower()) for text in texts]

    # Train the Word2Vec model on the tokenized essays.
    word2vec_model = Word2Vec(
        tokenized_texts,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
    )

    # One vector per essay.
    text_vectors = np.array([text_to_vector(tokens, word2vec_model) for tokens in tokenized_texts])

    return text_vectors, word2vec_model

# Tokenize the essays in the DataFrame and get their text vectors, keeping the trained Word2Vec model
text_vectors, word2vec_model = get_text_vectors(df)

In [93]:
# Wrap the essay vectors in a DataFrame and attach the target as a "score" column.
text_vectors_df = pd.DataFrame(text_vectors).assign(score=df["score"])

In [86]:
def get_text_vectors_with_model(train_df, word2vec_model):
    """Embed essays with an already trained Word2Vec model.

    Args:
        train_df: DataFrame with a ``full_text`` column of strings.
        word2vec_model: Trained model handed to ``text_to_vector``.

    Returns:
        numpy.ndarray: one vector per entry of ``train_df["full_text"]``.
    """
    essay_vectors = []
    for essay in train_df["full_text"]:
        # Lower-case and tokenize, then average the token embeddings.
        tokens = word_tokenize(essay.lower())
        essay_vectors.append(text_to_vector(tokens, word2vec_model))
    return np.array(essay_vectors)

In [71]:
# # テキストとスコアを取得
# texts = df["full_text"]
# scores = df["score"]

In [72]:
# # テキストをトークン化
# tokenized_texts = [word_tokenize(text.lower()) for text in texts]

In [73]:
# # Word2Vecモデルの訓練
# word2vec_model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

In [40]:
# # 各テキストのベクトルを計算
# text_vectors = np.array([text_to_vector(text, word2vec_model) for text in tokenized_texts])

## 学習

In [79]:
def train_by_randomForest(text_vectors_df, n_estimators, random_state=42):
    """Fit a random-forest classifier on the text vectors and report hold-out kappa.

    The frame is split 80/20 into training and hold-out parts, a
    ``RandomForestClassifier`` is fitted on the training part, and the
    quadratic weighted kappa on the hold-out part is printed.

    Args:
        text_vectors_df: DataFrame whose ``"score"`` column is the target and
            whose remaining columns are the features.
        n_estimators: Number of trees in the forest.
        random_state: Seed used for both the train/test split and the forest.

    Returns:
        RandomForestClassifier: the fitted model.
    """
    feature_columns = [col for col in text_vectors_df.columns if col != "score"]
    features = text_vectors_df[feature_columns]
    # Use a 1-D Series as the target: a single-column DataFrame makes
    # fit() warn that a column-vector y was passed.
    target = text_vectors_df["score"]

    # Split the data into a training set and a hold-out set.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state
    )

    # Train the random-forest classifier with the requested settings.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate on the hold-out set.
    y_pred = model.predict(X_test)
    kappa_quadratic = cohen_kappa_score(y_test, y_pred, weights="quadratic")
    print("Weighted Kappa 二乗重み付け：", kappa_quadratic)

    return model

# Train the classifier on the essay vectors
model = train_by_randomForest(text_vectors_df, n_estimators=100)  

  model.fit(X_train, y_train)


Weighted Kappa 二乗重み付け： 0.5174048390515701


In [41]:
# def train_by_randomForest(text_vectors, scores, n_estimators, random_state=42):
#     # データを訓練セットをテストセットに分割
#     X_train, X_test, y_train, y_test = train_test_split(text_vectors, scores, test_size=0.2, random_state=42)   

#     # ランダムフォレスト分類器を訓練
#     model = RandomForestClassifier(n_estimators=100, random_state=42)
#     model.fit(X_train, y_train)

#     # テストデータに対する予測
#     y_pred = model.predict(X_test)
#     kappa_quadratic = cohen_kappa_score(y_test, y_pred, weights="quadratic")
#     print("Weighted Kappa 二乗重み付け：", kappa_quadratic)

#     return model

# model = train_by_randomForest(text_vectors, scores, n_estimators=100)  

## 予測とSubmission

In [43]:
# Load the test data
test_df = pd.read_csv(dataPath + "/test.csv")

# Tokenize the essays in the DataFrame and embed them with the already trained Word2Vec model
test_text_vectors = get_text_vectors_with_model(test_df, word2vec_model)

# Run the prediction on the text vectors
test_pred = model.predict(test_text_vectors)

In [53]:
# Pair each essay id with its predicted score and write the submission file.
submission_df = test_df[["essay_id"]].assign(score=test_pred)
submission_df.to_csv("submission.csv", index=False)

In [34]:
# データを訓練セットをテストセットに分割
# X_train, X_test, y_train, y_test = train_test_split(text_vectors, scores, test_size=0.2, random_state=42)

In [35]:
# # ランダムフォレスト分類器を訓練
# classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# classifier.fit(X_train, y_train)

## 予測と精度確認

In [36]:
# # テストデータに対する予測
# y_pred = classifier.predict(X_test)

In [37]:
# # モデルの性能評価(Weighted Kappa)
# kappa_linear = cohen_kappa_score(y_test, y_pred, weights="linear")
# kappa_quadratic = cohen_kappa_score(y_test, y_pred, weights="quadratic")

In [38]:
# print("Weighted Kappa 線型重み付け：", kappa_linear)
# print("Weighted Kappa 二乗重み付け：", kappa_quadratic)

Weighted Kappa 線型重み付け： 0.34576085577220805
Weighted Kappa 二乗重み付け： 0.5068541564065192


In [39]:
# # (参考として、正解率での評価)モデルの性能評価
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.4431


In [87]:
# Sanity check because the results look wrong: score the model on the training data itself.
# NOTE(review): in the saved notebook the classifier cell ran as In [79] while the Word2Vec
# cell was last run as In [85]; presumably `model` was fitted on vectors from an earlier
# Word2Vec run than `word2vec_model`, which would explain a low kappa here — confirm with
# Restart Kernel -> Run All.
temp_df = pd.read_csv(dataPath + "/train.csv")

# Tokenize the essays in the DataFrame and embed them with the existing Word2Vec model
# (alternative that retrains the embedding: temp_text_vectors = get_text_vectors(temp_df))
temp_text_vectors = get_text_vectors_with_model(temp_df, word2vec_model)

# Run the prediction on the text vectors
temp_pred = model.predict(temp_text_vectors)

# Quadratic weighted kappa between the true and the predicted scores
temp_pred_df = temp_df[["essay_id"]].copy()
temp_pred_df['score'] = temp_pred
kappa = cohen_kappa_score(temp_df['score'], temp_pred_df['score'], weights="quadratic")

In [88]:
# kappa

0.11559028999374343

# 岡本編集

## 必要なライブラリインポート

In [11]:
import pandas as pd

## データ確認

In [12]:
# Load the training data and display it.
# NOTE(review): this reads from './data' while other cells in this notebook use '/data' — confirm which location is intended.
train_df = pd.read_csv('./data/train.csv')
train_df

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


In [13]:
# Load the test data and show its first rows.
test_df = pd.read_csv('./data/test.csv')
test_df.head()

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [14]:
train_df.isnull().sum()

essay_id     0
full_text    0
score        0
dtype: int64

In [15]:
train_df.dtypes.to_frame().T

Unnamed: 0,essay_id,full_text,score
0,object,object,int64


## 特徴量作成

In [16]:
def processing(df):
    """Add simple text-statistics features to ``df``.

    Features added:
        text_len: number of characters in ``full_text``.
        space_count: number of space characters in ``full_text``.
        word_len_avg: average length of the space-separated pieces, computed
            as ``(text_len - space_count) / (space_count + 1)``.
        I-cnt: how often the word "I" appears followed by a space, counted as
            a leading ``"I "`` plus every ``" I "`` that has a character
            before it.

    Args:
        df (pandas.DataFrame): frame with a string ``full_text`` column. The
            feature columns are written onto this frame in place.

    Returns:
        pandas.DataFrame: the same ``df`` object with the feature columns added.
    """
    full_text = df["full_text"]

    df["text_len"] = full_text.str.len()
    df["space_count"] = full_text.str.count(" ")
    df["word_len_avg"] = (df["text_len"] - df["space_count"]) / (df["space_count"] + 1)

    # Require the trailing space: a bare startswith('I') also matched texts
    # beginning with "If", "In", "It", ... and inflated the count.
    starts_with_i = full_text.str.startswith("I ")
    # str.count treats the pattern as a regular expression, so "." matches any
    # character: this counts every " I " that is preceded by some character.
    inner_i = full_text.str.count(". I ")
    df["I-cnt"] = starts_with_i + inner_i
    return df

In [17]:
# Add the feature columns to the training data and show the first rows
train_df = processing(train_df)
train_df.head()

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2


In [18]:
# Add the feature columns to the test data and show the first rows
test_df = processing(test_df)
test_df.head()

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0


## 前処理後データ出力

In [23]:
# Write the processed training data to CSV.
# NOTE(review): the input was read from './data' but this writes to '/data'; also, without
# index=False the row index is saved as an extra unnamed first column — confirm both are intended.
train_df.to_csv('/data/processed_train.csv')

In [25]:
# Write the processed test data to CSV.
# NOTE(review): the input was read from './data' but this writes to '/data'; also, without
# index=False the row index is saved as an extra unnamed first column — confirm both are intended.
test_df.to_csv('/data/processed_test.csv')