In [28]:
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from pandas_profiling import ProfileReport # profile report を作る用
from matplotlib_venn import venn2 # venn図を作成する用

In [29]:
# Directory the input CSV files are read from, and directory for outputs.
INPUT_DIR = '../datasets/'
OUTPUT_DIR = '../outputs/'

# Make sure the output directory exists; no error if it is already present.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the train and test tables from INPUT_DIR (train first, then test).
train_df, test_df = (
    pd.read_csv(os.path.join(INPUT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [30]:
# Print a summary of train_df: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12026 entries, 0 to 12025
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   object_id                 12026 non-null  object 
 1   art_series_id             12026 non-null  object 
 2   title                     12026 non-null  object 
 3   description               8506 non-null   object 
 4   long_title                12026 non-null  object 
 5   principal_maker           12026 non-null  object 
 6   principal_or_first_maker  12025 non-null  object 
 7   sub_title                 11992 non-null  object 
 8   copyright_holder          713 non-null    object 
 9   more_title                11891 non-null  object 
 10  acquisition_method        11830 non-null  object 
 11  acquisition_date          11009 non-null  object 
 12  acquisition_credit_line   3525 non-null   object 
 13  dating_presenting_date    12016 non-null  object 
 14  dating

In [31]:
# Numeric columns are used as features as-is, without any transformation.
def create_numeric_feature(input_df):
    """Return a copy of the numeric dating columns of ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing ``dating_sorting_date``, ``dating_year_early`` and
        ``dating_year_late``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only those three columns, in that order. It is a
        copy, so modifying it does not touch ``input_df``.
    """
    numeric_columns = ['dating_sorting_date', 'dating_year_early', 'dating_year_late']
    selected = input_df.loc[:, numeric_columns]
    return selected.copy()

In [32]:
# Preview the numeric features for the training data.
# (The function is named `create_numeric_feature`; the former plural spelling
# raised NameError on a fresh kernel.)
create_numeric_feature(train_df)

Unnamed: 0,dating_sorting_date,dating_year_early,dating_year_late
0,1660.0,1660.0,1685.0
1,1900.0,1900.0,1930.0
2,1860.0,1860.0,1880.0
3,1850.0,1850.0,1879.0
4,1825.0,1825.0,1874.0
...,...,...,...
12021,1900.0,1900.0,1920.0
12022,1701.0,1701.0,1714.0
12023,1778.0,1778.0,1778.0
12024,1689.0,1689.0,1690.0


In [33]:
# Text columns are converted into numeric features.
# Title -> number of characters.
# NOTE: this expression is not the last statement of the cell and its value is
# not assigned, so the result is discarded.
train_df['title'].str.len()

def create_string_length_feature(input_df):
    """Build character-count features for the free-text columns.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing the text columns ``title``, ``description``,
        ``long_title``, ``sub_title`` and ``more_title``.

    Returns
    -------
    pandas.DataFrame
        One column per text column, named ``StringLength__<column>``, holding
        the string length of each value. Missing values stay missing.
    """
    # 'copyright_holder' is currently left out of the text columns.
    text_columns = ('title', 'description', 'long_title', 'sub_title', 'more_title')

    lengths = pd.DataFrame({name: input_df[name].str.len() for name in text_columns})
    return lengths.add_prefix('StringLength__')

# Preview the string-length features computed from the training data.
create_string_length_feature(train_df)

Unnamed: 0,StringLength__title,StringLength__description,StringLength__long_title,StringLength__sub_title,StringLength__more_title
0,21,,48,39.0,21.0
1,15,,64,17.0,15.0
2,21,46.0,52,15.0,21.0
3,16,71.0,53,25.0,16.0
4,27,28.0,51,27.0,27.0
...,...,...,...,...,...
12021,37,,65,16.0,48.0
12022,60,231.0,89,17.0,424.0
12023,54,101.0,85,17.0,141.0
12024,29,495.0,72,17.0,145.0


In [34]:
def create_count_encoding_feature(input_df, reference_df=None):
    """Count-encode selected categorical columns.

    Each value is replaced by the number of times it occurs in the same column
    of ``reference_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame to encode. Must contain ``principal_maker``,
        ``principal_or_first_maker`` and ``acquisition_credit_line``.
    reference_df : pandas.DataFrame, optional
        Frame the occurrence counts are computed from. Defaults to the
        notebook-level ``train_df`` so existing single-argument calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        One column per encoded column, named ``CE_<column>``. Values that do
        not occur in ``reference_df`` (including missing values) become NaN.
    """
    if reference_df is None:
        # Backward-compatible default: counts come from the training frame
        # defined at notebook level.
        reference_df = train_df

    # 'acquisition_method' and 'title' are currently not count-encoded.
    use_columns = [
        'principal_maker',
        'principal_or_first_maker',
        'acquisition_credit_line'
    ]

    encoded = {
        column: input_df[column].map(reference_df[column].value_counts())
        for column in use_columns
    }
    return pd.DataFrame(encoded).add_prefix('CE_')

In [35]:
def create_one_hot_encoding(input_df, reference_df=None, min_count=20):
    """One-hot encode the frequent categories of selected columns.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame to encode. Must contain ``acquisition_method`` and
        ``copyright_holder``.
    reference_df : pandas.DataFrame, optional
        Frame used to decide which categories get a column. Defaults to the
        notebook-level ``train_df`` so existing single-argument calls behave
        as before.
    min_count : int, optional
        A category gets a column only if it occurs strictly more than
        ``min_count`` times in ``reference_df`` (default 20). This keeps the
        resulting matrix from becoming too wide.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named ``<column>=<category>``, indexed like
        ``input_df``. Rows whose value is missing or is not a retained
        category are all zero for that column group.
    """
    if reference_df is None:
        # Backward-compatible default: categories come from the training
        # frame defined at notebook level.
        reference_df = train_df

    use_columns = [
        'acquisition_method',
        'copyright_holder'
    ]

    encoded_parts = []
    for column in use_columns:
        # Keep only categories that are frequent enough in the reference data.
        vc = reference_df[column].value_counts()
        vc = vc[vc > min_count]

        # Fix the category list explicitly so the output columns are the same
        # no matter which input_df is passed in.
        cat = pd.Categorical(input_df[column], categories=vc.index)

        # One-hot encode at this point.
        out_i = pd.get_dummies(cat)
        # The columns are a categorical index; convert them to a plain list
        # (otherwise concat raises an error).
        out_i.columns = out_i.columns.tolist()
        # get_dummies on a bare Categorical yields a fresh 0..n-1 index;
        # restore the caller's index so rows align in later axis=1 concats.
        out_i.index = input_df.index
        encoded_parts.append(out_i.add_prefix(f'{column}='))

    return pd.concat(encoded_parts, axis=1)

In [36]:
from tqdm import tqdm

def to_feature(input_df):
    """Convert ``input_df`` into a feature matrix, returned as a new DataFrame.

    Runs each feature-building function on ``input_df``, reports how long it
    took, and joins the resulting frames column-wise.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw frame handed unchanged to every processor.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of all processor outputs, in processor order.

    Raises
    ------
    AssertionError
        If a processor returns a frame whose row count differs from
        ``input_df``; the message is the processor's name.
    """

    processors = [
        create_numeric_feature,
        create_string_length_feature,
        create_count_encoding_feature,
        create_one_hot_encoding
    ]

    feature_blocks = []

    for func in tqdm(processors, total=len(processors)):
        # The function names already start with "create", so the name alone is
        # the timer label (previously logged as "createcreate_...").
        with Timer(prefix=func.__name__):
            _df = func(input_df)

        # The row counts must match; a mismatch means func is implemented wrongly.
        assert len(_df) == len(input_df), func.__name__
        feature_blocks.append(_df)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(feature_blocks, axis=1)

In [37]:
# https://github.com/nyk510/vivid/blob/master/vivid/utils.py
from contextlib import contextmanager
from time import time

class Timer:
    """Context manager that reports the time spent inside a ``with`` block.

    On exit the elapsed seconds are formatted with ``format_str`` and passed to
    ``logger.info`` when a logger was given, otherwise printed.
    """

    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' '):
        """Configure the report format.

        Parameters
        ----------
        logger : object, optional
            Any object with an ``info(str)`` method. If omitted, ``print`` is used.
        format_str : str
            Format string receiving the duration in seconds.
        prefix, suffix : optional
            Text placed before / after ``format_str``, joined with ``sep``.
            Falsy values are ignored.
        sep : str
            Separator between prefix/suffix and the formatted duration.
        """
        if prefix:
            format_str = str(prefix) + sep + format_str
        if suffix:
            format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds of the last finished block, or 0 if none has finished."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        """Record the start time and return the timer itself.

        Returning ``self`` lets callers write ``with Timer() as t`` and read
        ``t.duration`` afterwards.
        """
        self.start = time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Record the end time and emit the formatted duration.

        Returns None, so an exception raised inside the block still propagates.
        """
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)
In [38]:
# Feature preprocessing: build the feature matrices for train, then test.
train_feat_df, test_feat_df = map(to_feature, (train_df, test_df))

  0%|          | 0/4 [00:00<?, ?it/s]

createcreate_numeric_feature  0.001[s]
createcreate_string_length_feature  0.030[s]


100%|██████████| 4/4 [00:00<00:00, 58.84it/s]
100%|██████████| 4/4 [00:00<00:00, 76.73it/s]

createcreate_count_encoding_feature  0.017[s]
createcreate_one_hot_encoding  0.009[s]
createcreate_numeric_feature  0.001[s]
createcreate_string_length_feature  0.024[s]
createcreate_count_encoding_feature  0.012[s]
createcreate_one_hot_encoding  0.007[s]





In [42]:
# Show (rows, columns) of the test feature matrix.
test_feat_df.shape

(12008, 26)

In [43]:
# List the feature column names produced for the training data.
train_feat_df.columns

Index(['dating_sorting_date', 'dating_year_early', 'dating_year_late',
       'StringLength__title', 'StringLength__description',
       'StringLength__long_title', 'StringLength__sub_title',
       'StringLength__more_title', 'CE_principal_maker',
       'CE_principal_or_first_maker', 'CE_acquisition_credit_line',
       'acquisition_method=purchase', 'acquisition_method=transfer',
       'acquisition_method=gift', 'acquisition_method=unknown',
       'acquisition_method=bequest', 'acquisition_method=loan',
       'acquisition_method=nationalization 1795',
       'copyright_holder=erven Richard Tepe',
       'copyright_holder=Willem Diepraam', 'copyright_holder=Jacob R. Olie',
       'copyright_holder=Stichting Kessler-de Lange',
       'copyright_holder=erven Théodore van Lelyveld',
       'copyright_holder=Jozef van Ruyssevelt',
       'copyright_holder=erven Cor Jaring', 'copyright_holder=Jan Banning'],
      dtype='object')

In [40]:
# Sanity checks on the feature matrices (an assert does nothing when its
# condition is True).
assert len(train_feat_df) == len(train_df), 'train feature rows do not match train_df'
assert len(test_feat_df) == len(test_df), 'test feature rows do not match test_df'
# Train and test must expose exactly the same feature columns, in the same order.
assert list(train_feat_df.columns) == list(test_feat_df.columns), 'train/test feature columns differ'

In [41]:
# Save the preprocessed feature tables as CSV files (train first, then test).
for csv_path, feat_df in (
    ("../datasets/pre1_train.csv", train_feat_df),
    ("../datasets/pre1_test.csv", test_feat_df),
):
    feat_df.to_csv(csv_path)