データを加工するノートブック

# データ読み込み

In [17]:
import pandas as pd

In [18]:
# Read the training CSV into a DataFrame and display it as the cell output.
df_train = pd.read_csv(filepath_or_buffer='./data/train.csv')
df_train

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


In [19]:
# Read the test CSV into a DataFrame and preview its first rows.
df_test = pd.read_csv(filepath_or_buffer='./data/test.csv')
df_test.head(n=5)

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [20]:
# Count missing values per column of the training frame.
df_train.isna().sum()

essay_id     0
full_text    0
score        0
dtype: int64

In [22]:
# Show each column's dtype as a single-row table.
# Uses df_train explicitly: no variable named `df` is defined in any earlier
# cell, so the previous `df.dtypes...` failed with NameError on a fresh kernel.
df_train.dtypes.to_frame().T

Unnamed: 0,essay_id,full_text,score
0,object,object,int64


# 特徴量作成

In [25]:
def processing(df):
    """Build simple text-statistics features from the ``full_text`` column.

    The input frame is not modified; a new frame is returned with the feature
    columns appended (or overwritten, if columns of those names already exist).

    Features:
        text_len: Number of characters in the text.
        space_count: Number of ' ' (space) characters in the text.
        word_len_avg: Non-space character count divided by
            ``space_count + 1``, i.e. an approximate average word length.
        I-cnt: Approximate number of occurrences of the word "I": 1 if the
            text starts with 'I ' (the word itself, not "If"/"In"/"It"),
            plus the number of matches of the pattern '. I ' in the text.

    Args:
        df (pandas.DataFrame): Frame to process; must have a string column
            named ``full_text``.

    Returns:
        pandas.DataFrame: A new frame containing the original columns plus
        ``text_len``, ``space_count``, ``word_len_avg`` and ``I-cnt``.
    """
    text = df.full_text
    text_len = text.str.len()
    space_count = text.str.count(' ')

    # The "+ 1" turns the number of separators into the number of
    # space-delimited tokens and keeps the divisor non-zero for empty text.
    word_len_avg = (text_len - space_count) / (space_count + 1)

    # Require the trailing space so that only the standalone word "I" at the
    # very start of the text is counted.
    starts_with_i = text.str.startswith('I ')
    # Series.str.count interprets its pattern as a regular expression, so the
    # leading '.' matches any character (not only a literal period).
    # NOTE(review): confirm whether a literal period was intended here.
    inner_i = text.str.count('. I ')

    # DataFrame.assign returns a new frame, leaving the caller's frame as is.
    # The dict is unpacked because 'I-cnt' is not a valid keyword name.
    return df.assign(**{
        'text_len': text_len,
        'space_count': space_count,
        'word_len_avg': word_len_avg,
        'I-cnt': starts_with_i + inner_i,
    })

In [26]:
# Add the engineered feature columns to the training frame and preview it.
df_train = df_train.pipe(processing)
df_train.head(n=5)

Unnamed: 0,essay_id,full_text,score,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,3,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,3,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,4,3077,555,4.535971,0
3,001bdc0,"We all heard about Venus, the planet without a...",4,2701,446,5.044743,0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2208,380,4.7979,2


In [27]:
# Add the same engineered feature columns to the test frame and preview it.
df_test = df_test.pipe(processing)
df_test.head(n=5)

Unnamed: 0,essay_id,full_text,text_len,space_count,word_len_avg,I-cnt
0,000d118,Many people have car where they live. The thin...,2677,496,4.38833,1
1,000fe60,I am a scientist at NASA that is discussing th...,1669,327,4.091463,2
2,001ab80,People always wish they had the same technolog...,3077,555,4.535971,0
