データを加工するノートブック

# 秋野編集

## ライブラリのインポート

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk


## データ読み込み

In [2]:
# Set the location of the data folder
dataPath = "/data"
# Bare expression: echoes the configured path as the cell output
dataPath

'/data'

In [3]:
# Load the training data from the data folder
df = pd.read_csv(dataPath + "/train.csv")
# Keep the target column as a separate Series
scores = df["score"]

## 前処理

In [4]:
# Download the NLTK "punkt" tokenizer data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Compute a single vector for one tokenized text
def text_to_vector(text, model):
    """Average the embeddings of the tokens in ``text`` that ``model`` knows.

    Args:
        text: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` checks and ``[]``
            lookup by token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is found.
    """
    embeddings = model.wv
    found = []
    for token in text:
        if token in embeddings:
            found.append(embeddings[token])

    # No known token: fall back to an all-zero vector
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

In [6]:
def get_text_vectors(train_df):
    """Train a Word2Vec model on ``train_df["full_text"]`` and embed each text.

    Args:
        train_df: DataFrame with a ``full_text`` column of strings.

    Returns:
        tuple: ``(text_vectors, word2vec_model)`` where ``text_vectors`` is a
        NumPy array holding one ``text_to_vector`` result per text and
        ``word2vec_model`` is the Word2Vec model trained on those texts.
    """
    # Lower-case and tokenize every text
    tokenized_texts = []
    for raw_text in train_df["full_text"]:
        tokenized_texts.append(word_tokenize(raw_text.lower()))

    # Train the Word2Vec model on the tokenized corpus
    word2vec_model = Word2Vec(
        tokenized_texts,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
    )

    # One averaged vector per text
    per_text = [text_to_vector(tokens, word2vec_model) for tokens in tokenized_texts]
    text_vectors = np.array(per_text)

    return text_vectors, word2vec_model

# Tokenize the text in the DataFrame, train Word2Vec on it, and get one vector per text
text_vectors, word2vec_model = get_text_vectors(df)

In [7]:
# Wrap the text vectors in a DataFrame
text_vectors_df = pd.DataFrame(text_vectors)
# Attach the target score taken from the source DataFrame
text_vectors_df['score'] = df[['score']].copy()
# text_vectors_df

In [8]:
def get_text_vectors_with_model(train_df, word2vec_model):
    """Embed each text of ``train_df["full_text"]`` with an existing model.

    Args:
        train_df: DataFrame with a ``full_text`` column of strings.
        word2vec_model: Already-trained model passed on to ``text_to_vector``.

    Returns:
        NumPy array holding one ``text_to_vector`` result per text.
    """
    per_text = []
    for raw_text in train_df["full_text"]:
        # Lower-case and tokenize, then average the known token vectors
        tokens = word_tokenize(raw_text.lower())
        per_text.append(text_to_vector(tokens, word2vec_model))

    text_vectors = np.array(per_text)
    return text_vectors

## 学習

In [9]:
def train_by_randomForest(text_vectors_df, n_estimators, random_state=42):
    """Train a random-forest classifier on text vectors and report held-out kappa.

    The data is split 80/20; the model is fitted on the 80% part and the
    quadratic weighted Cohen's kappa on the remaining 20% is printed.

    Args:
        text_vectors_df: DataFrame whose ``score`` column is the target; every
            other column is used as a feature.
        n_estimators: Number of trees in the forest.
        random_state: Seed used for both the train/test split and the forest.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    feature_columns = [col for col in text_vectors_df.columns if col != "score"]
    features = text_vectors_df[feature_columns]
    # Use a 1-D Series as the target: a one-column DataFrame makes
    # scikit-learn emit a column-vector conversion warning on fit().
    target = text_vectors_df["score"]

    # Split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state
    )

    # Train the random-forest classifier with the caller-supplied settings
    model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    kappa_quadratic = cohen_kappa_score(y_test, y_pred, weights="quadratic")
    print("Weighted Kappa 二乗重み付け：", kappa_quadratic)

    return model

# Train the classifier; the function prints the quadratic weighted kappa on its held-out split
model = train_by_randomForest(text_vectors_df, n_estimators=100)

  model.fit(X_train, y_train)


Weighted Kappa 二乗重み付け： 0.5101057705536902


## 予測とSubmission

In [10]:
# Load the test data
test_df = pd.read_csv(dataPath + "/test.csv")

# Tokenize the test text and embed it with the Word2Vec model trained above
test_text_vectors = get_text_vectors_with_model(test_df, word2vec_model)

# Run the prediction on the text vectors
test_pred = model.predict(test_text_vectors)

In [11]:
# Build the submission frame: essay_id plus the predicted score
submission_df = test_df[["essay_id"]].copy()
submission_df['score'] = test_pred
# Write the submission file without the index column
submission_df.to_csv('submission.csv',index=False)

# 岡本編集

## 必要なライブラリインポート

In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

## データ確認

In [2]:
# Load the training data; the bare expression displays the DataFrame
train_df = pd.read_csv('./data/train.csv')
train_df

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


In [3]:
train_df.shape

(17307, 3)

In [4]:
# Load the test data; the bare expression displays the DataFrame
test_df = pd.read_csv('./data/test.csv')
test_df

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [5]:
test_df.shape

(3, 2)

In [6]:
train_df.isnull().sum()

essay_id     0
full_text    0
score        0
dtype: int64

In [7]:
train_df.dtypes.to_frame().T

Unnamed: 0,essay_id,full_text,score
0,object,object,int64


## 特徴量作成

In [8]:
def processing(df):
    """Add simple text-statistics feature columns derived from ``full_text``.

    Features:
        text_len: Number of characters in the text.
        space_count: Number of space characters in the text.
        word_len_avg: ``(text_len - space_count) / (space_count + 1)``, an
            approximate average word length that treats each space as one
            word separator.
        I-cnt: Number of occurrences of the standalone word "I"
            (contractions such as "I'm" count; "In", "It", "If" do not).

    Args:
        df (pandas.DataFrame): Frame with a string column ``full_text``.
            The feature columns are added to this frame in place.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the feature columns
        added.
    """
    text = df['full_text']

    df['text_len'] = text.str.len()
    df['space_count'] = text.str.count(' ')
    df['word_len_avg'] = (df['text_len'] - df['space_count']) / (df['space_count'] + 1)
    # Word-boundary match: the previous startswith('I') also matched texts
    # beginning with "In"/"It", and the unescaped '.' in '. I ' was a regex
    # wildcard that missed "I" after a newline or before punctuation.
    df['I-cnt'] = text.str.count(r'\bI\b')
    return df

In [9]:
# Add the feature columns to the training data and preview the first rows
train_df = processing(train_df)
train_df.head()

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2


In [10]:
# Add the same feature columns to the test data and preview the first rows
test_df = processing(test_df)
test_df.head()

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0


## テキスト翻訳

In [11]:
from googletrans import Translator

In [12]:
# Shared googletrans client; translate_text reads this module-level name
translator = Translator()

In [30]:
def translate_text(text):
    """Translate English ``text`` to Japanese with the module-level ``translator``.

    This is deliberately best-effort so that one failing row does not abort a
    long batch run: any exception raised while translating is logged as a
    warning and ``None`` is returned for that text.

    Args:
        text: English text to translate.

    Returns:
        The translated text, or ``None`` if the translation failed.
    """
    # Function-scope import keeps this notebook cell self-contained.
    import logging

    try:
        # Keep the ``.text`` access inside the try so an unexpected response
        # object is also treated as a failed translation.
        return translator.translate(text, src='en', dest='ja').text
    except Exception as exc:
        # Report the failure instead of swallowing it silently.
        logging.getLogger(__name__).warning("translation failed: %r", exc)
        return None

In [28]:
# Enable the tqdm integration for pandas (provides progress_apply, used below)
tqdm.pandas()

In [32]:
# Translate every essay with a progress bar (the recorded run took about 5h44m for 17307 rows)
train_df['text_ja'] = train_df.full_text.progress_apply(translate_text)

100%|██████████| 17307/17307 [5:44:03<00:00,  1.19s/it]  


In [33]:
train_df.head()

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt,text_ja
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1,多くの人が住んでいる車を持っています。彼らが知らないことは、あなたが車を使うとき、あなたが誰...
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2,私は、火星の「顔」について議論しているNASAの科学者です。「顔」がどのように土地であるかを...
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0,人々は常に、映画で見たのと同じテクノロジー、またはソーシャルメディア全体にある最高の新しいテ...
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0,私たちは皆、地震を伴うほとんど酸素のない惑星である金星について聞いたことがあります。火山と温...
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2,親愛なる、上院議員\n\nこれは、選挙大学を維持することに賛成して主張する手紙です。「選挙大...


In [35]:
# Write the translated DataFrame to CSV
# NOTE(review): the file name is spelled "trandlated" (probably meant "translated");
# confirm nothing else reads this path before renaming it.
train_df.to_csv('./data/trandlated_df.csv',index=False)

## 単語特徴量作成

### 単語の出現頻度確認

In [11]:
def check_freq(df):
    """Count word occurrences in ``df.full_text`` with a ``CountVectorizer``.

    Args:
        df: DataFrame with a ``full_text`` column of strings.

    Returns:
        tuple: ``(df, word_df)`` where ``df`` is the input frame with one
        count column per vocabulary word appended, and ``word_df`` has the
        columns ``word`` and ``count`` (total occurrences over all texts),
        sorted by ``count`` in descending order.
    """
    vec_count = CountVectorizer()
    X = vec_count.fit_transform(df.full_text)

    # Reuse df's index so that concat(axis=1) pairs the i-th count row with
    # the i-th text even when df does not have a default 0..n-1 index.
    counts_df = pd.DataFrame(
        X.toarray(),
        columns=vec_count.get_feature_names_out(),
        index=df.index,
    )
    df = pd.concat([df, counts_df], axis=1)

    # Total count per word, most frequent first
    word_df = counts_df.sum(axis=0).sort_values(ascending=False).reset_index()
    word_df.columns = ['word', 'count']
    return df, word_df

In [12]:
# Break the training-data text into words and show the first rows of both results
train_df, word_df_train = check_freq(train_df)
display(train_df.head())
display(word_df_train.head())

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt,00,000,0000,...,zygomatiz,zygomstic,zygomtic,zygosmtic,²excerpt,¹excerpt,ät,årgument,ëvening,ö_o
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,word,count
0,the,369541
1,to,207130
2,and,146862
3,of,139419
4,that,115132


In [13]:
# Break the test-data text into words and show the first rows of both results
test_df, word_df_test = check_freq(test_df)
display(test_df.head())
display(word_df_test.head())

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt,000,12,20,2006,...,worked,working,works,world,would,year,years,you,your,yourself
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1,1,1,1,1,...,0,0,0,1,0,1,1,6,1,0
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2,0,0,0,0,...,1,2,1,0,3,0,0,5,0,0
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


Unnamed: 0,word,count
0,the,57
1,to,45
2,that,43
3,of,35
4,and,29


### ストップワード削除

In [14]:
# Download the English stop-word list
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Set form of the stop words, used for the set operations in drop_stopword
stop_words_set = set(stop_words)
def drop_stopword(df, columns_set=None, stop_words=None):
    """Drop the columns of ``df`` whose name is a stop word, in place.

    Args:
        df: DataFrame with one column per word (other columns are left
            untouched unless their name is a stop word). Modified in place.
        columns_set: Optional collection of column names eligible for
            removal; names not present in ``df`` are ignored. Defaults to
            all columns of ``df``.
        stop_words: Optional collection of stop words. Defaults to the
            module-level ``stop_words_set``.

    Returns:
        The same ``df`` object, with the stop-word columns removed.
    """
    if stop_words is None:
        stop_words = stop_words_set

    # Only names that really are columns can be dropped
    existing = set(df.columns)
    if columns_set is None:
        candidates = existing
    else:
        candidates = existing & set(columns_set)

    # Stop words that are present as columns
    and_set = candidates & set(stop_words)

    # Drop them and verify the column count shrank by exactly that many.
    # Columns (not rows) are removed, so the check must compare column counts.
    before_count = df.shape[1]
    df.drop(columns=list(and_set), inplace=True)
    after_count = df.shape[1]
    if before_count - after_count == len(and_set):
        print('処理に問題はありません')
    else:
        print('処理後のレコード数が想定しているレコード数と一致しません')

    return df

In [16]:
tmp_df = drop_stopword(train_df)

## 前処理後データ出力

In [12]:
# Save the preprocessed training data without the index column
train_df.to_csv('./data/processed_train.csv', index=False)

In [13]:
# Save the preprocessed test data without the index column
test_df.to_csv('./data/processed_test.csv', index=False)