データを加工するノートブック

# 秋野編集

## Word2vec

### ライブラリのインポート

In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk


### データ読み込み

In [None]:
# Location of the competition data folder
dataPath = "/data"

# Load the training set and keep a handle on the target column
df = pd.read_csv(f"{dataPath}/train.csv")
scores = df["score"]

### 前処理

In [None]:
# Download the tokenizer data required by nltk's word_tokenize.
# 'punkt' serves older NLTK releases; NLTK 3.8.2+ looks up 'punkt_tab'
# instead, so both resources are fetched to work on either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Collapse a token list into a single averaged embedding
def text_to_vector(text, model):
    """Return the mean embedding of the tokens in ``text``.

    Args:
        text: iterable of tokens.
        model: object exposing ``wv`` (token -> vector lookup supporting
            ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``; a zero vector of length ``model.vector_size`` when no
        token is found.
    """
    embeddings = model.wv
    known = [embeddings[token] for token in text if token in embeddings]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

In [None]:
def get_text_vectors(train_df):
    """Fit Word2Vec on the essays of ``train_df`` and embed every essay.

    Args:
        train_df: DataFrame with a ``full_text`` column.

    Returns:
        A tuple ``(text_vectors, word2vec_model)``: the array of per-essay
        vectors produced by ``text_to_vector`` and the fitted Word2Vec model.
    """
    # Lower-case and tokenize each essay
    token_lists = [word_tokenize(essay.lower()) for essay in train_df["full_text"]]

    # Fit the Word2Vec model on the tokenized corpus
    word2vec_model = Word2Vec(
        token_lists,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
    )

    # One averaged vector per essay
    text_vectors = np.array(
        [text_to_vector(tokens, word2vec_model) for tokens in token_lists]
    )
    return text_vectors, word2vec_model

# Embed every essay of the training DataFrame and keep the fitted Word2Vec model
text_vectors, word2vec_model = get_text_vectors(df)

In [None]:
# One row per essay: embedding dimensions as columns plus the target score
text_vectors_df = pd.DataFrame(text_vectors).assign(score=df['score'])

In [None]:
def get_text_vectors_with_model(train_df, word2vec_model):
    """Embed the essays of ``train_df`` with an already fitted Word2Vec model.

    Args:
        train_df: DataFrame with a ``full_text`` column.
        word2vec_model: fitted model passed on to ``text_to_vector``.

    Returns:
        Array holding one vector per essay.
    """
    vectors = []
    for essay in train_df["full_text"]:
        # Same preprocessing as at training time: lower-case, then tokenize
        tokens = word_tokenize(essay.lower())
        vectors.append(text_to_vector(tokens, word2vec_model))
    return np.array(vectors)

### 学習

In [None]:
def train_by_randomForest(text_vectors_df, n_estimators, random_state=42):
    """Train a random-forest classifier on essay vectors and report its QWK.

    Args:
        text_vectors_df: DataFrame whose ``score`` column is the target and
            whose remaining columns are the features.
        n_estimators: number of trees in the forest.
        random_state: seed used for both the train/test split and the forest.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    feature_columns = [col for col in text_vectors_df.columns if col != "score"]
    features = text_vectors_df[feature_columns]
    # Use a 1-D target; a single-column DataFrame makes scikit-learn emit a
    # DataConversionWarning when fitting.
    target = text_vectors_df["score"]

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state
    )

    # Fit the forest with the caller-supplied hyper-parameters
    # (previously hard-coded to 100 trees / seed 42 regardless of arguments)
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Quadratic weighted kappa on the held-out rows
    y_pred = model.predict(X_test)
    kappa_quadratic = cohen_kappa_score(y_test, y_pred, weights="quadratic")
    print("Weighted Kappa 二乗重み付け：", kappa_quadratic)

    return model

model = train_by_randomForest(text_vectors_df, n_estimators=100)  

### 予測とSubmission

In [None]:
# Load the test set
test_df = pd.read_csv(f"{dataPath}/test.csv")

# Embed the test essays with the Word2Vec model fitted on the training data
test_text_vectors = get_text_vectors_with_model(test_df, word2vec_model)

# Predict scores from the essay vectors
test_pred = model.predict(test_text_vectors)

In [None]:
# Pair each essay id with its predicted score and write the submission file
submission_df = test_df[["essay_id"]].assign(score=test_pred)
submission_df.to_csv('submission.csv', index=False)

## BERT

### ライブラリインポート

In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, cohen_kappa_score
from nltk.tokenize import word_tokenize
from sklearn.utils import resample
import nltk
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support


### データ読み込み

In [None]:
# Location of the competition data folder
dataPath = "/data"
# Load the training set
df = pd.read_csv(f"{dataPath}/train.csv")

In [None]:
# Number of essays per score, ordered by score.
# value_counts avoids groupby.apply over the grouping column, which newer
# pandas versions flag as deprecated.
df['score'].value_counts().sort_index()

最も件数の少ないスコア6（156件）に各スコアの件数を揃えても、156 × 6 = 936件のデータを確保できる

In [None]:
# Total number of rows we would like to sample
total_sample_size = 2000

# Per-score quota derived from the requested total
unique_scores = df['score'].unique()
min_count = min(df['score'].value_counts())
sample_per_score = total_sample_size // len(unique_scores)

# Down-sample every score to at most the quota
sampled_data = []

for score in unique_scores:
    score_data = df[df['score'] == score]
    if len(score_data) >= sample_per_score:
        # replace=False: sklearn's resample bootstraps (samples WITH
        # replacement) by default, which would put duplicated essays into the
        # sample and let the same essay land in both train and validation.
        sampled = resample(score_data, replace=False, n_samples=sample_per_score, random_state=42)
    else:
        # Fewer rows than the quota: keep all of them
        sampled = score_data
    sampled_data.append(sampled)

# Concatenate the per-score samples
final_sample = pd.concat(sampled_data)

In [None]:
# データ数確認
final_sample.groupby('score').count()

### 学習

In [None]:
# Training data: shift scores from 1-6 to 0-5 so they can be used as class ids
data = final_sample.assign(score=final_sample['score'] - 1)

# Stratified 80/20 train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['full_text'],
    data['score'],
    test_size=0.2,
    stratify=data['score'],
    random_state=42,
)

# Tokenizer matching the pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom dataset wrapping essays and their labels
class EssayDataset(Dataset):
    """Dataset that tokenizes one essay per item.

    Each item is a dict with ``input_ids``, ``attention_mask`` (both
    flattened tensors from ``tokenizer.encode_plus``) and ``labels``
    (a ``torch.long`` tensor built from ``labels[idx]``).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        essay = self.texts[idx]
        target = self.labels[idx]
        # Pad / truncate every essay to exactly max_len tokens
        encoded = self.tokenizer.encode_plus(
            essay,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap both splits as datasets padded/truncated to 512 tokens
train_dataset = EssayDataset(
    texts=train_texts.tolist(),
    labels=train_labels.tolist(),
    tokenizer=tokenizer,
    max_len=512,
)
val_dataset = EssayDataset(
    texts=val_texts.tolist(),
    labels=val_labels.tolist(),
    tokenizer=tokenizer,
    max_len=512,
)

# Pretrained BERT with a 6-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Metrics reported by the Trainer at evaluation time
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1.

    Args:
        p: pair ``(scores, labels)``; the predicted class is the argmax of
            ``scores`` along axis 1.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='weighted'
    )
    return dict(
        accuracy=accuracy_score(labels, predicted),
        f1=f1,
        precision=precision,
        recall=recall,
    )

# Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    report_to="none"
)

# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Run fine-tuning
trainer.train()

# Evaluate on the validation split. This call is not the last expression of
# the cell, so its return value would be silently dropped; capture and print it.
eval_results = trainer.evaluate()
print(eval_results)

# Persist the fine-tuned model together with its tokenizer, since the
# inference section loads both from saved folders.
model.save_pretrained('./bert-base-uncased-model-trained')
tokenizer.save_pretrained('./bert-base-uncased-tokenizer')

データ数:前半100個  
{'eval_loss': 1.7269268035888672,
 'eval_accuracy': 0.25,
 'eval_f1': 0.2375,
 'eval_precision': 0.48680555555555555,
 'eval_recall': 0.25,
 'eval_runtime': 3.1969,
 'eval_samples_per_second': 6.256,
 'eval_steps_per_second': 0.938,
 'epoch': 3.0}  
 データ数:前半1,000個 実行時間:23min  
 {'eval_loss': 1.074561357498169,
 'eval_accuracy': 0.52,
 'eval_f1': 0.48400593321739693,
 'eval_precision': 0.4655379723734051,
 'eval_recall': 0.52,
 'eval_runtime': 30.6655,
 'eval_samples_per_second': 6.522,
 'eval_steps_per_second': 0.815,
 'epoch': 3.0}  
 データ数:936個 各スコア156個ずつ 実行時間:21min  
 {'eval_loss': 1.1392934322357178,
 'eval_accuracy': 0.5,
 'eval_f1': 0.4120386813326839,
 'eval_precision': 0.5260695493022192,
 'eval_recall': 0.5,
 'eval_runtime': 29.1307,
 'eval_samples_per_second': 6.454,
 'eval_steps_per_second': 0.824,
 'epoch': 3.0}

In [None]:
# Score the held-out validation split so that the weighted kappa can be computed
# Switch to evaluation mode
model.eval()

# Run inference in small batches: encoding and forwarding the whole split at
# once needs memory proportional to the split size, and the inputs must live
# on the same device as the model (the Trainer may have moved it to a GPU).
val_text_list = val_texts.tolist()
inference_batch_size = 8
predictions = []

with torch.no_grad():
    for start in range(0, len(val_text_list), inference_batch_size):
        batch_texts = val_text_list[start:start + inference_batch_size]
        encoded_input = tokenizer(
            batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt'
        ).to(model.device)
        outputs = model(**encoded_input)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())

In [None]:
# Ground-truth labels of the validation split
true_labels = val_labels.tolist()
# Predicted labels from the inference cell
predicted_labels = predictions
# Quadratic weighted kappa between truth and prediction
weighted_kappa = cohen_kappa_score(true_labels, predicted_labels, weights='quadratic')
print('重み付きKappa:', weighted_kappa)

### kaggle提出用

In [None]:
# (For the Kaggle submission) apply the model to the test data
# Load the test set
test_df = pd.read_csv(dataPath + "/test.csv")
test_texts = test_df['full_text'].copy()

# Run inference in small batches: encoding and forwarding the whole test set
# at once needs memory proportional to its size, and the inputs must live on
# the same device as the model.
model.eval()
test_text_list = test_texts.tolist()
inference_batch_size = 8
predictions = []

with torch.no_grad():
    for start in range(0, len(test_text_list), inference_batch_size):
        batch_texts = test_text_list[start:start + inference_batch_size]
        test_encoded_input = tokenizer(
            batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt'
        ).to(model.device)
        outputs = model(**test_encoded_input)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())

In [None]:
# Predictions are class ids 0-5; shift them back to the original 1-6 scores
submission_df = test_df[["essay_id"]].copy()
submission_df['score'] = [label + 1 for label in predictions]
submission_df

### 学習済BERTモデルを使った推論

In [None]:
def predict_with_bert(bert_model, input_token) -> list:
    """Run inference with an already trained BERT classifier.

    Args:
        bert_model: callable model; it is invoked as ``bert_model(**input_token)``
            and must return an object with a ``logits`` attribute.
        input_token: mapping of tokenized inputs unpacked into the model call.

    Returns:
        List with the predicted class index (argmax over dimension 1 of the
        logits) for each input row.
    """
    # No gradients are needed for inference
    with torch.no_grad():
        logits = bert_model(**input_token).logits
        predictions = torch.argmax(logits, dim=1).tolist()

    return predictions


In [None]:
# Load the fine-tuned model if its folder exists
model_path = '/data/bert-base-uncased-model-trained'
if not os.path.isdir(model_path):
    print('モデルフォルダがない')
else:
    model_trained = BertForSequenceClassification.from_pretrained(model_path, num_labels=6)
    model_trained.eval()

# Load the saved tokenizer if its folder exists
tokenizer_path = '/data/bert-base-uncased-tokenizer'
if not os.path.isdir(tokenizer_path):
    print('トークナイザーのフォルダがない')
else:
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

In [None]:
# Evaluation data: 100 essays that are not part of the sampled training pool
not_in_sample = ~df['essay_id'].isin(final_sample['essay_id'])
eval_data = df[not_in_sample].sample(100, random_state=42)
eval_texts = eval_data['full_text'].tolist()
input_token = tokenizer(
    eval_texts,
    padding=True,
    truncation=True,
    return_tensors='pt',
    max_length=512,
)

In [None]:
predictions = predict_with_bert(model_trained, input_token)

# Kappa computation
# eval_data comes straight from df, whose scores are 1-6, while the classifier
# was trained on `score - 1` and therefore predicts 0-5. Shift the ground
# truth onto the same 0-5 scale before comparing.
true_labels = (eval_data['score'] - 1).tolist()
# Predicted labels
predicted_labels = predictions
# Quadratic weighted kappa
weighted_kappa = cohen_kappa_score(true_labels, predicted_labels, weights='quadratic')
print('重み付きKappa:', weighted_kappa)

### 学習済BERTを使った推論2 kaggleのメモリ不足対策

- メモリ効率の高い推論ループを使用する  
- 評価時バッチサイズを 8 -> 4に変更する  
BERTモデルは上のコードから流用

In [None]:
# カスタムデータセットの作成
from torch.utils.data import Dataset
import torch

class EssayDataset(Dataset):
    """Dataset that tokenizes one essay per item; labels are optional.

    Each item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors from ``tokenizer.encode_plus``. When ``labels`` is given, the
    item also carries ``labels`` as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad / truncate every essay to exactly max_len tokens
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [None]:
# Evaluation data: 100 essays that are not part of the sampled training pool
not_in_sample = ~df['essay_id'].isin(final_sample['essay_id'])
eval_data = df[not_in_sample].sample(100, random_state=42)
eval_dataset = EssayDataset(
    eval_data['full_text'].tolist(),
    tokenizer=tokenizer,
    max_len=512,
)

In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

# Batch size used during evaluation
eval_batch_size = 4

# Data loader over the evaluation dataset
eval_loader = DataLoader(eval_dataset, batch_size=eval_batch_size)

# Evaluation mode; tensors are placed on the CPU
model_trained.eval()
device = torch.device("cpu")

# Collected predictions across all batches
all_predictions = []

with torch.no_grad():
    for batch in tqdm(eval_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        logits = model_trained(input_ids, attention_mask=attention_mask).logits
        predictions = logits.argmax(dim=-1)
        all_predictions.extend(predictions.cpu().numpy())

# Inspect the predictions
print(all_predictions)


In [None]:
# Kappa computation
# eval_data comes straight from df, whose scores are 1-6, while the classifier
# was trained on `score - 1` and therefore predicts 0-5. Shift the ground
# truth onto the same 0-5 scale before comparing.
true_labels = (eval_data['score'] - 1).tolist()
# Predicted labels
predicted_labels = all_predictions
# Quadratic weighted kappa
weighted_kappa = cohen_kappa_score(true_labels, predicted_labels, weights='quadratic')
print('重み付きKappa:', weighted_kappa)

## DeBERTa

- BERTの改良型で、計算効率向上ができるかもしれない  
- 現状のBERTは学習自体に時間かかる & kaggle提出時の推論もかなり時間かかっているので、改善するか試してみる

### インポート

In [15]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.utils import resample

### データ読み出し

In [16]:
# Location of the competition data folder
dataPath = "/data"

# Load the training set
df = pd.read_csv(f"{dataPath}/train.csv")

In [17]:
# Total number of rows we would like to sample
total_sample_size = 2000

# Per-score quota derived from the requested total
unique_scores = df['score'].unique()
min_count = min(df['score'].value_counts())
sample_per_score = total_sample_size // len(unique_scores)

# Down-sample every score to at most the quota
sampled_data = []

for score in unique_scores:
    score_data = df[df['score'] == score]
    if len(score_data) >= sample_per_score:
        # replace=False: sklearn's resample bootstraps (samples WITH
        # replacement) by default, which would put duplicated essays into the
        # sample and let the same essay land in both train and validation.
        sampled = resample(score_data, replace=False, n_samples=sample_per_score, random_state=42)
    else:
        # Fewer rows than the quota: keep all of them
        sampled = score_data
    sampled_data.append(sampled)

# Concatenate the per-score samples
final_sample = pd.concat(sampled_data)

In [18]:
# Training data: shift scores from 1-6 to 0-5 so they can be used as class ids
data = final_sample.copy()
data['score'] = data['score'] - 1

# Train/validation split. Seeded and stratified like the BERT section so that
# the split is reproducible and every score is represented in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['full_text'],
    data['score'],
    test_size=0.2,
    random_state=42,
    stratify=data['score'],
)

# Tokenizer for DeBERTa
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

In [19]:
# Custom dataset wrapping essays and (optionally) their labels
class EssayDataset(Dataset):
    """Dataset that tokenizes one essay per item; labels are optional.

    Each item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors from ``tokenizer.encode_plus``. When ``labels`` is given, the
    item also carries ``labels`` as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad / truncate every essay to exactly max_len tokens
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [20]:
# Wrap both splits as datasets padded/truncated to 512 tokens
train_dataset = EssayDataset(
    texts=train_texts.tolist(),
    labels=train_labels.tolist(),
    tokenizer=tokenizer,
    max_len=512,
)
val_dataset = EssayDataset(
    texts=val_texts.tolist(),
    labels=val_labels.tolist(),
    tokenizer=tokenizer,
    max_len=512,
)

In [21]:
# Pretrained DeBERTa with a 6-class classification head
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=6)


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Metrics reported by the Trainer at evaluation time
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1.

    Args:
        p: pair ``(scores, labels)``; the predicted class is the argmax of
            ``scores`` along axis 1.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='weighted'
    )
    return dict(
        accuracy=accuracy_score(labels, predicted),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [23]:
# Training configuration: 3 epochs, train batch 16 / eval batch 4 per device,
# evaluation after every epoch, no external experiment reporting
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    report_to="none"
)



In [24]:
# Build the Trainer from the model, the configuration, both datasets and the metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [1]:
# The kernel crashes almost immediately here, so DeBERTa is trained locally and only the trained model is used in this notebook

# # Run training
# trainer.train()

# # Run evaluation
# trainer.evaluate()

### 学習済DeBERTaを使った推論

In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DebertaTokenizer, DebertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from sklearn.metrics import cohen_kappa_score
from sklearn.utils import resample

2024-06-24 02:30:44.800767: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-24 02:30:44.868506: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-24 02:30:45.166900: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-24 02:30:45.166944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-24 02:30:45.220081: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [5]:
# Location of the competition data folder
dataPath = "/data"

# Load the training set
df = pd.read_csv(f"{dataPath}/train.csv")

# Working copy with scores shifted from 1-6 to 0-5
data = df.assign(score=df['score'] - 1)

# Seeded 80/20 train/validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['full_text'],
    data['score'],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Load the tokenizer from its saved folder
tokenizer = DebertaTokenizer.from_pretrained('/data/deberta-tokenizer')

In [7]:
# Custom dataset wrapping essays and (optionally) their labels
class EssayDataset(Dataset):
    """Dataset that tokenizes one essay per item; labels are optional.

    Each item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors from ``tokenizer.encode_plus``. When ``labels`` is given, the
    item also carries ``labels`` as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad / truncate every essay to exactly max_len tokens
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [12]:
# Unlabeled dataset over the validation texts, used to check that inference works
val_dataset = EssayDataset(val_texts.tolist(), tokenizer=tokenizer, max_len=512)

In [10]:
# Load the trained model from its saved folder
model = DebertaForSequenceClassification.from_pretrained('/data/deberta-model-trained', num_labels=6)

In [13]:
# Data loader over the validation dataset, 4 essays per batch
val_loader = DataLoader(val_dataset, batch_size=4)

In [14]:
# Switch the model to evaluation mode
model.eval()

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

In [16]:
# Run inference batch by batch
predictions = []

with torch.no_grad():
    for batch in val_loader:
        # Move the inputs to whichever device the model lives on
        model_inputs = {
            name: batch[name].to(model.device)
            for name in ('input_ids', 'attention_mask')
        }
        logits = model(**model_inputs).logits
        predictions.extend(logits.argmax(dim=1).cpu().numpy())


In [18]:
# Kappa computation
# Ground-truth labels of the validation split
true_labels = val_labels.tolist()
# Predicted labels from the inference loop
predicted_labels = predictions
# Quadratic weighted kappa between truth and prediction
weighted_kappa = cohen_kappa_score(true_labels, predicted_labels, weights='quadratic')
print('重み付きKappa:', weighted_kappa)

重み付きKappa: 0.7520735832970541


# 岡本編集

## 必要なライブラリインポート

In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

## データ確認

In [None]:
train_df = pd.read_csv('./data/train.csv')
train_df

In [None]:
train_df.shape

In [None]:
test_df = pd.read_csv('./data/test.csv')
test_df

In [None]:
test_df.shape

In [None]:
train_df.isnull().sum()

In [None]:
train_df.dtypes.to_frame().T

## 特徴量作成

In [None]:
def processing(df):
    """Add simple text-statistics features to ``df``.

    Features:
        text_len: number of characters in ``full_text``.
        space_count: number of space characters in ``full_text``.
        word_len_avg: non-space characters divided by ``space_count + 1``,
            i.e. an approximate average word length.
        I-cnt: number of occurrences of the standalone word "I".

    Args:
        df (pandas.DataFrame): frame with a ``full_text`` column. The new
            columns are added to this frame in place.

    Returns:
        pandas.DataFrame: the same frame, with the feature columns added.
    """
    text = df.full_text
    df['text_len'] = text.str.len()
    df['space_count'] = text.str.count(' ')
    df['word_len_avg'] = (df.text_len - df.space_count) / (df.space_count + 1)
    # Count "I" as a whole word. The previous expression used
    # startswith('I'), which also fired for texts starting with "It"/"In",
    # and the pattern '. I ', whose unescaped '.' is a regex wildcard.
    df['I-cnt'] = text.str.count(r'\bI\b')
    return df

In [None]:
processed_train = processing(train_df)
processed_train.head()

In [None]:
processed_test = processing(test_df)
processed_test.head()

## 単語特徴量作成

### 単語の出現頻度確認

In [None]:
def check_freq(df):
    """Build a per-essay word-count table.

    Fits a ``CountVectorizer`` on ``df.full_text`` and returns the counts as
    a dense DataFrame with one column per vocabulary term.
    """
    vec_count = CountVectorizer()
    counts = vec_count.fit_transform(df.full_text)
    # One column per term, one row per essay
    word_df = pd.DataFrame(
        counts.toarray(),
        columns=vec_count.get_feature_names_out(),
    )
    return word_df

def split_data(df):
    """Split ``df`` row-wise into four consecutive parts.

    The first three parts each hold ``len(df) // 4`` rows; the last part
    takes all remaining rows.

    Returns:
        Tuple of four DataFrames, in row order.
    """
    step = len(df) // 4
    bounds = [0, step, step * 2, step * 3, None]
    return tuple(df.iloc[lo:hi, :] for lo, hi in zip(bounds, bounds[1:]))

In [None]:
word_df_train = check_freq(processed_train)

In [None]:
train_df = pd.concat([processed_train,word_df_train],axis=1)

In [None]:
word_df_test = check_freq(processed_test)
# check_freq fits a fresh vocabulary on whatever frame it receives, so the test
# table would get different columns than the training table. Align it to the
# training vocabulary: words unseen in training are dropped, training words
# absent from the test essays get a count of 0.
word_df_test = word_df_test.reindex(columns=word_df_train.columns, fill_value=0)
test_df = pd.concat([processed_test,word_df_test],axis=1)

In [None]:
train_df.to_csv('/data/add_word_train.csv',index=False)
test_df.to_csv('/data/add_word_test.csv',index=False)

### ストップワード削除

In [None]:
import pandas as pd

In [None]:
train_df = pd.read_csv('/data/add_word_train.csv')
train_df.head()

In [None]:
test_df = pd.read_csv('/data/add_word_test.csv')
test_df.head()

In [None]:
# Download the English stop-word list
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Keep the stop words as a set (set operations are applied to it later)
stop_words_set = set(stop_words)

In [None]:
def drop_stopword(df):
    """Drop every column of ``df`` whose name is in ``stop_words_set``.

    Prints the column counts before and after, plus a message telling
    whether the number of removed columns equals the number of distinct
    stop-word column names found.

    Returns:
        A new DataFrame without the stop-word columns.
    """
    # Column names that are also stop words
    stop_columns = set(df.columns) & stop_words_set

    n_before = len(df.columns)
    tmp_df = df.drop(columns=list(stop_columns))
    n_after = len(tmp_df.columns)

    # Sanity check: removed-column count must match the stop words found
    print(f'処理前カラム数：{n_before} 処理後カラム数：{n_after} 差：{n_before-n_after}')
    if n_before - n_after != len(stop_columns):
        print('処理に矛盾が発生しています')
    else:
        print('処理に問題はありません')

    return tmp_df

In [None]:
train_df = drop_stopword(train_df)

In [None]:
test_df = drop_stopword(test_df)

In [None]:
# `tmp_df` only exists inside drop_stopword; use the stop-word-filtered
# training frame, and sum numeric columns only so text columns are skipped.
train_df.sum(numeric_only=True).to_frame()

In [None]:
# Total per numeric column, most frequent first.
# `tmp_df` only exists inside drop_stopword, so the stop-word-filtered
# training frame is used here.
tmp_sum = train_df.sum(numeric_only=True).to_frame()
tmp_sum.columns = ['count']
tmp_sum = tmp_sum.sort_values('count',ascending=False).round(1)
tmp_sum

In [None]:
import matplotlib.pyplot as plt
import japanize_matplotlib
import numpy as np
%matplotlib inline

In [None]:
# Left panel: features dropped per frequency threshold; right panel: features kept
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=((15,8)))
x = np.arange(1,11)
total_columns = len(train_df.columns)

# Number of features whose total count is at or below each threshold 1..10
dropped = [int((tmp_sum['count'] <= threshold).sum()) for threshold in range(1, 11)]
remaining = total_columns - np.array(dropped)

panels = (
    (ax[0], dropped, '単語の出現頻度の閾値 VS 落とされる特徴量', '落とされる特徴量'),
    (ax[1], remaining, '単語の出現頻度の閾値 VS 残る特徴量数', '残る特徴量数'),
)
for axis, y, title, y_label in panels:
    axis.plot(x, y, marker='.', markersize=10)
    axis.set_title(title)
    axis.set_xlabel('単語の出現頻度の閾値')
    axis.set_ylabel(y_label)
    # Annotate every point with its value
    for x_pos, value in zip(x, y):
        axis.text(x_pos, value, value)
plt.show()

- 出現頻度が3以下の特徴量を削除すると、特徴量数をレコード数より少なく抑えられる
- ただし、テストデータも学習データと同じカラム構成であることが前提となる

## 前処理後データ出力

In [None]:
train_df.to_csv('./data/processed_train.csv', index=False)

In [None]:
test_df.to_csv('./data/processed_test.csv', index=False)