データを加工するノートブック

# 秋野編集

## Word2vec

### ライブラリのインポート

In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk


### データ読み込み

In [None]:
# データフォルダの場所を設定
dataPath = "/data"

# データの読み出し
df = pd.read_csv(dataPath + "/train.csv")
scores = df["score"]

### 前処理

In [None]:
# nltkデータ読み込み
nltk.download('punkt')

In [None]:
# 各テキストのベクトルを計算する関数
def text_to_vector(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [None]:
def get_text_vectors(train_df):
    # テキストとスコアを取得
    texts = train_df["full_text"]
    

    # テキストをトークン化
    tokenized_texts = [word_tokenize(text.lower()) for text in texts]

    # Word2Vecモデルの訓練
    word2vec_model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

    # 各テキストのベクトルを計算
    text_vectors = np.array([text_to_vector(text, word2vec_model) for text in tokenized_texts])

    return text_vectors, word2vec_model

# DataFrameに含まれたテキストデータから、トークン化されたtext_vectorを取得
text_vectors, word2vec_model = get_text_vectors(df)

In [None]:
text_vectors_df = pd.DataFrame(text_vectors)
text_vectors_df['score'] = df[['score']].copy()
# text_vectors_df

In [None]:
def get_text_vectors_with_model(train_df, word2vec_model):
    # テキストとスコアを取得
    texts = train_df["full_text"]
    
    # テキストをトークン化
    tokenized_texts = [word_tokenize(text.lower()) for text in texts]

    # 各テキストのベクトルを計算
    text_vectors = np.array([text_to_vector(text, word2vec_model) for text in tokenized_texts])

    return text_vectors

### 学習

In [None]:
def train_by_randomForest(text_vectors_df, n_estimators, random_state=42):
    feature_columns = [i for i in text_vectors_df.columns if i != "score"]
    train_df = text_vectors_df[feature_columns]
    target = text_vectors_df[["score"]]

    # データを訓練セットをテストセットに分割
    X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)   

    # ランダムフォレスト分類器を訓練
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # テストデータに対する予測
    y_pred = model.predict(X_test)
    kappa_quadratic = cohen_kappa_score(y_test, y_pred, weights="quadratic")
    print("Weighted Kappa 二乗重み付け：", kappa_quadratic)

    return model

model = train_by_randomForest(text_vectors_df, n_estimators=100)  

### 予測とSubmission

In [None]:
# テストデータの読み出し
test_df = pd.read_csv(dataPath + "/test.csv")

# DataFrameに含まれたテキストデータから、トークン化されたtext_vectorを取得
test_text_vectors = get_text_vectors_with_model(test_df, word2vec_model)

# text_vectorを使って、予測の実行
test_pred = model.predict(test_text_vectors)

In [None]:
submission_df = test_df[["essay_id"]].copy()
submission_df['score'] = test_pred
submission_df.to_csv('submission.csv',index=False)

## BERT

In [None]:
# データフォルダの場所を設定
dataPath = "/data"
# データの読み出し
df = pd.read_csv(dataPath + "/train.csv")

In [None]:
# データの数を確認
df.groupby('score').apply(lambda x:x['score'].count())

スコア6の数に合わせても、156 x 6 = 936個のデータが取れる

In [None]:
# スコア毎に同数のデータを取得する
min_samples = df.groupby('score').size().min()
balanced_data = df.groupby('score').apply(lambda x:x.sample(n=min_samples)).reset_index(drop=True)
balanced_data

In [None]:
# chatGPTサンプルコード
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# データフォルダの場所を設定
dataPath = "/data"
# データの読み出し
df = pd.read_csv(dataPath + "/train.csv")

# データセットの読み込み
# data = df.head(1000)

# スコア毎に同数のデータを取得する
min_samples = df.groupby('score').size().min()
data = df.groupby('score').apply(lambda x:x.sample(n=min_samples)).reset_index(drop=True)

data['score'] = data['score'] - 1  # スコアを0-5に変換

# データセットの分割
train_texts, val_texts, train_labels, val_labels = train_test_split(data['full_text'], data['score'], test_size=0.2, random_state=42, stratify=data['score'])

# トークナイザーの準備
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# カスタムデータセットの作成
class EssayDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = EssayDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, max_len=512)
val_dataset = EssayDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, max_len=512)

# モデルの準備
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# 評価指標の定義
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='weighted')
    acc = accuracy_score(labels, pred)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# トレーニング引数の設定
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    report_to="none"
)

# Trainerの作成
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# トレーニングの実行
trainer.train()

# 評価の実行
trainer.evaluate()


データ数:前半100個  
{'eval_loss': 1.7269268035888672,
 'eval_accuracy': 0.25,
 'eval_f1': 0.2375,
 'eval_precision': 0.48680555555555555,
 'eval_recall': 0.25,
 'eval_runtime': 3.1969,
 'eval_samples_per_second': 6.256,
 'eval_steps_per_second': 0.938,
 'epoch': 3.0}  
 データ数:前半1,000個 実行時間:23min  
 {'eval_loss': 1.074561357498169,
 'eval_accuracy': 0.52,
 'eval_f1': 0.48400593321739693,
 'eval_precision': 0.4655379723734051,
 'eval_recall': 0.52,
 'eval_runtime': 30.6655,
 'eval_samples_per_second': 6.522,
 'eval_steps_per_second': 0.815,
 'epoch': 3.0}  
 データ数:936個 各スコア156個ずつ 実行時間:21min  
 {'eval_loss': 1.1392934322357178,
 'eval_accuracy': 0.5,
 'eval_f1': 0.4120386813326839,
 'eval_precision': 0.5260695493022192,
 'eval_recall': 0.5,
 'eval_runtime': 29.1307,
 'eval_samples_per_second': 6.454,
 'eval_steps_per_second': 0.824,
 'epoch': 3.0}

In [None]:
# 分割しておいたテストデータを用いて、重み付きKappaの計算をする
# 評価モードに設定
model.eval()

# データをトークン化
encoded_input = tokenizer(val_texts.tolist(), padding=True, truncation=True, return_tensors='pt')

# モデルに入力を与えて推論を行う
with torch.no_grad():
    outputs = model(**encoded_input)
    logits = outputs.logits
    predictions = torch.argmax(logits, axis=1).tolist()

In [None]:
# 正解データからラベルを抽出
true_labels = val_labels.tolist()
# 予測データを取得
predicted_labels = predictions
# 重み付きKappaを計算
weighted_kappa = cohen_kappa_score(true_labels, predicted_labels, weights='quadratic')

In [None]:
weighted_kappa

In [None]:
# (kaggle提出用)テスト用データへの適用
# テストデータの読み出し
test_df = pd.read_csv(dataPath + "/test.csv")
test_texts = test_df['full_text'].copy()

# データをトークン化
test_encoded_input = tokenizer(test_texts.tolist(), padding=True, truncation=True, return_tensors='pt')

# モデルに入力を与えて推論を行う
with torch.no_grad():
    outputs = model(**test_encoded_input)
    logits = outputs.logits
    predictions = torch.argmax(logits, axis=1).tolist()

In [None]:
submission_df = test_df[["essay_id"]].copy()
submission_df['score'] = predictions
submission_df['score'] = submission_df['score'] + 1 # 予測結果は0-5で出力されるので +1して元データの1-6に合わせる
submission_df

# 岡本編集

## 必要なライブラリインポート

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

## データ確認

In [2]:
train_df = pd.read_csv('./data/train.csv')
train_df

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


In [3]:
train_df.shape

(17307, 3)

In [4]:
test_df = pd.read_csv('./data/test.csv')
test_df

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [5]:
test_df.shape

(3, 2)

In [6]:
train_df.isnull().sum()

essay_id     0
full_text    0
score        0
dtype: int64

In [7]:
train_df.dtypes.to_frame().T

Unnamed: 0,essay_id,full_text,score
0,object,object,int64


## 特徴量作成

In [8]:
def processing(df):
    """特徴量作成関数

    特徴量の説明
        text_len:テキストの長さ
        space_count:空白の数
        word_len_avg:一節の平均的な長さ
        I-cnt:”私”という単語の出現頻度

    Args:
        df(pandas.DataFrame):加工したいデータフレーム
    Return:
        pandas.DataFrame:加工後のデータフレーム

    """
    
    df['text_len'] = df.full_text.str.len()
    df['space_count'] = df.full_text.str.count(' ')
    df['word_len_avg'] = (df.text_len - df.space_count) / (df.space_count + 1)
    df['I-cnt'] = df.full_text.str.startswith('I') + df.full_text.str.count('. I ')
    return df

In [9]:
processed_train = processing(train_df)
processed_train.head()

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2


In [10]:
processed_test = processing(test_df)
processed_test.head()

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0


## テキスト翻訳

from googletrans import Translator

translator = Translator()

def translate_text(text):
    try:
        translated = translator.translate(text,src='en',dest='ja').text
        return translated
    except Exception as e:
            return None

tqdm.pandas()

train_df['text_ja'] = train_df.full_text.progress_apply(translate_text)

train_df.head()

#翻訳後データフレームをcsv出力
train_df.to_csv('./data/trandlated_df.csv',index=False)

## 単語特徴量作成

### 単語の出現頻度確認

In [11]:
def check_freq(df):
    #インスタンス生成
    vec_count = CountVectorizer()
    vec_count.fit(df.full_text)
    X = vec_count.transform(df.full_text)
    #単語をカラム化してデータフレームに追加
    word_df = pd.DataFrame(X.toarray(), columns=vec_count.get_feature_names_out())
    df = pd.concat([df, word_df], axis=1)
    #単語の出現頻度データフレーム作成
    word_df = pd.DataFrame(word_df.sum(axis=0).sort_values(ascending=False).reset_index())
    word_df.columns = ['word', 'count']
    return df,word_df

In [12]:
#訓練データのテキストを単語に分解
train_df, word_df_train = check_freq(processed_train)
display(train_df.head())
display(word_df_train.head())

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt,00,000,0000,...,zygomatiz,zygomstic,zygomtic,zygosmtic,²excerpt,¹excerpt,ät,årgument,ëvening,ö_o
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,word,count
0,the,369541
1,to,207130
2,and,146862
3,of,139419
4,that,115132


In [13]:
#テストデータのテキストを単語に分解
test_df, word_df_test = check_freq(processed_test)
display(test_df.head())
display(word_df_test.head())

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt,000,12,20,2006,...,worked,working,works,world,would,year,years,you,your,yourself
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1,1,1,1,1,...,0,0,0,1,0,1,1,6,1,0
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2,0,0,0,0,...,1,2,1,0,3,0,0,5,0,0
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


Unnamed: 0,word,count
0,the,57
1,to,45
2,that,43
3,of,35
4,and,29


### ストップワード削除

In [14]:
#英語のストップワードダウンロード
nltk.download('stopwords')
stop_words = stopwords.words('english')
#ストップワードを集合として格納（後で集合同士の比較演算を行うため）
stop_words_set = set(stop_words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def drop_stopword(df):
    
    #単語データフレーム内にどれだけストップワードが含まれるのか確認
    columns_set = set(df.columns)

    #共通単語抽出
    and_set = columns_set & stop_words_set

    #単語データフレームに存在しなかったストップワード数抽出
    before = len(df.columns)
    diff_list = list(columns_set - and_set)
    df = df[diff_list].copy()
    after = len(df.columns)

    #ストップワードの除去、除去前後で矛盾がないか確認
    print(f'処理前カラム数：{before}　処理後カラム数：{after}　差：{before-after}')    

    return df

: 

In [16]:
tmp_df = drop_stopword(train_df)

In [None]:
tmp_df = pd.concat([processed_train,tmp_df],axis=1)

In [None]:
tmp_df.head(1)

In [17]:
tmp_df.sum()

contunes        1
shafts          1
betrayel        1
coprrosive      1
disposed        4
               ..
barriers       15
mountrains      2
constructon     1
threapist       1
ansewr          1
Length: 64439, dtype: object

## 前処理後データ出力

In [None]:
train_df.to_csv('./data/processed_train.csv', index=False)

In [None]:
test_df.to_csv('./data/processed_test.csv', index=False)