atma cup2講座を参考に作成

In [1]:
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from matplotlib_venn import venn2

In [2]:
# Directory that holds the competition CSV files (joined with a file name in `read_csv`).
DATA = '~/art_compe/data/'
# Output directory for this notebook; created up front, no error if it already exists.
OUT_PUT = 'out_put/2/'
os.makedirs(OUT_PUT, exist_ok=True)

In [3]:
# https://github.com/nyk510/vivid/blob/master/vivid/utils.py
from time import time

def decorate(s: str, decoration=None):
    """Surround ``s`` with a decoration string on both sides.

    Args:
        s: Value to display; it is converted with ``str``.
        decoration: String placed before and after ``s``. When ``None``,
            a run of twenty ``★`` characters is used.

    Returns:
        ``"<decoration> <s> <decoration>"`` joined with single spaces.
    """
    border = '★' * 20 if decoration is None else decoration
    return ' '.join((border, str(s), border))

class Timer:
    """Context manager that measures how long its ``with`` body takes.

    On exit the elapsed time (from ``time()``) is formatted with
    ``format_str`` and sent to ``logger.info`` when a logger was given,
    otherwise printed.

    Args:
        logger: Optional object with an ``info(str)`` method.
        format_str: ``str.format`` template receiving the duration.
        prefix: Optional value prepended to ``format_str`` (joined with ``sep``).
        suffix: Optional value appended to ``format_str`` (joined with ``sep``).
        sep: Separator used between prefix / template / suffix.
        verbose: Reporting is skipped only when this is ``None``; any other
            value (including the default ``0``) reports.
    """

    def __init__(self, logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None, sep=' ', verbose=0):
        if prefix:
            format_str = str(prefix) + sep + format_str
        if suffix:
            format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None
        self.verbose = verbose

    @property
    def duration(self):
        """Elapsed time between enter and exit, or 0 while not yet finished."""
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()
        # Clear any previous end time so a reused timer does not report a
        # stale (possibly negative) duration while the new block is running.
        self.end = None
        # Return the timer so `with Timer() as t:` gives access to `t.duration`.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        if self.verbose is None:
            return
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

In [5]:
import inspect

def param_to_name(params: dict, key_sep='_', key_value_sep='=') -> str:
    """Convert a dict into a single string of joined ``key=value`` pairs.

    Items are sorted before joining, and both keys and values are
    converted with ``str``.

    Args:
        params: Mapping to convert.
        key_sep: String placed between consecutive key/value pairs.
        key_value_sep: String placed between each key and its value.
            With ``"="``, for example, ``{'foo': 10}`` becomes ``"foo=10"``.

    Returns:
        The string form of ``params``.
    """
    pairs = (
        key_value_sep.join((str(key), str(value)))
        for key, value in sorted(params.items())
    )
    return key_sep.join(pairs)


def cachable(function):
    """Decorator that memoizes ``function`` by the string form of its arguments.

    The call arguments are resolved with ``inspect.getcallargs`` and turned
    into a key by ``param_to_name``; the result is stored as an attribute on
    ``function`` itself under ``'__cachefile__' + key``. Because the key is
    built from ``str`` of each argument, two arguments with the same string
    form share a cache entry.

    The wrapper accepts an extra keyword ``force`` (default ``False``) that is
    removed before calling ``function``; when true the cached value is
    ignored and recomputed. ``'run'`` is printed whenever ``function`` is
    actually executed.

    Args:
        function: Callable to wrap.

    Returns:
        The caching wrapper, carrying the metadata of ``function``.
    """
    # Function-scope import so this block does not rely on a file-level import.
    from functools import wraps

    attr_name = '__cachefile__'

    # Keep __name__ / __doc__ of the wrapped function on the wrapper.
    @wraps(function)
    def wrapper(*args, **kwargs):
        force = kwargs.pop('force', False)
        call_args = inspect.getcallargs(function, *args, **kwargs)
        name = attr_name + param_to_name(call_args)

        if hasattr(function, name) and not force:
            return getattr(function, name)

        print('run')
        cache_object = function(*args, **kwargs)
        setattr(function, name, cache_object)
        return cache_object

    return wrapper

In [6]:
@cachable
def read_csv(name):
    """Read a CSV file from the ``DATA`` directory into a DataFrame.

    Args:
        name: File name; ``'.csv'`` is appended when the name does not
            already contain ``'.csv'``.

    Returns:
        The result of ``pd.read_csv`` for the resolved path.
    """
    file_name = name if '.csv' in name else name + '.csv'
    return pd.read_csv(os.path.join(DATA, file_name))

In [7]:
# Load the train / test tables through the cached reader
# ('run' is printed each time the underlying read actually executes).
train_df = read_csv('train')
test_df = read_csv('test')

run
run


特徴量作成の改善

In [11]:
class AbstractBaseBlock:
    """Base class for feature blocks.

    Subclasses must implement ``transform``; ``fit`` may be overridden when
    the block needs to learn something before transforming.
    """

    def fit(self, all_df: pd.DataFrame, input_df: pd.DataFrame, y=None):
        """Fit the block and return the features for ``input_df``.

        The default implementation learns nothing and simply delegates to
        ``transform(input_df)``; ``all_df`` and ``y`` are unused here.
        """
        return self.transform(input_df)

    def transform(self, input_df: pd.DataFrame):
        """Return the features for ``input_df``.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        # Raise (not return) so a missing override fails at the call site.
        raise NotImplementedError()

In [12]:
class NumericBlock(AbstractBaseBlock):
    """Block that passes a fixed set of columns through unchanged."""

    def transform(self, input_df):
        """Return a copy of the three ``dating_*`` columns of ``input_df``."""
        selected = ['dating_period', 'dating_year_early', 'dating_year_late']
        passthrough = input_df[selected]
        return passthrough.copy()

In [13]:
class StringLengthBlock(AbstractBaseBlock):
    """Block producing the string length of one column."""

    def __init__(self, column):
        # Name of the column whose values are measured.
        self.column = column

    def transform(self, input_df):
        """Return a one-column frame named ``StringLength_<column>``."""
        lengths = input_df[self.column].str.len()
        return pd.DataFrame({self.column: lengths}).add_prefix('StringLength_')

In [16]:
class CountEncodingBlock(AbstractBaseBlock):
    """Block that replaces each value of a column by its occurrence count."""

    def __init__(self, column: str):
        # Name of the column to encode.
        self.column = column

    def fit(self, all_df, input_df, y=None):
        """Learn value counts from ``all_df`` and return features for ``input_df``."""
        self.count_ = all_df[self.column].value_counts()
        return self.transform(input_df)

    def transform(self, input_df):
        """Map each value to its learned count, in a column named ``CE_<column>``.

        Values without a learned count become NaN (``Series.map`` behaviour).
        """
        counts = input_df[self.column].map(self.count_)
        return pd.DataFrame({self.column: counts}).add_prefix('CE_')

In [29]:
class OneHotEncoding(AbstractBaseBlock):
    """Block that one-hot encodes the frequent values of one column.

    Args:
        column: Name of the column to encode.
        min_count: A value becomes a category only when it occurs strictly
            more than ``min_count`` times in the frame passed to ``fit``.
    """

    def __init__(self, column, min_count=30):
        self.column = column
        self.min_count = min_count

    def fit(self, all_df, input_df, y=None):
        """Select the categories from ``all_df`` and return features for ``input_df``."""
        vc = all_df[self.column].value_counts()
        self.categories = vc[vc > self.min_count].index
        return self.transform(input_df)

    def transform(self, input_df):
        """Return one indicator column per learned category.

        Columns are named ``<column>=<category>``. Values outside the learned
        categories produce a row with no indicator set.
        """
        cat = pd.Categorical(input_df[self.column], categories=self.categories)
        out_df = pd.get_dummies(cat)
        # Replace the categorical column index with plain labels before prefixing.
        out_df.columns = out_df.columns.tolist()
        # get_dummies on a bare Categorical yields a fresh 0..n-1 index; restore
        # the input's index so column-wise concatenation with other blocks
        # stays row-aligned.
        out_df.index = input_df.index
        return out_df.add_prefix(f'{self.column}=')

In [30]:
# Feature blocks, in the order their outputs are concatenated:
# one-hot encoders, then count encoders, then string-length blocks.
feature_blocks = (
    [
        OneHotEncoding(col, min_count=20)
        for col in [
            'title', 'description', 'long_title',
            'principal_maker', 'principal_or_first_maker', 'sub_title',
            'copyright_holder', 'more_title', 'acquisition_method',
            'acquisition_date', 'acquisition_credit_line', 'dating_presenting_date',
            'dating_sorting_date', 'dating_period', 'dating_year_early',
            'dating_year_late',
        ]
    ]
    + [
        CountEncodingBlock(col)
        for col in [
            'art_series_id', 'title', 'description', 'long_title',
            'principal_maker', 'principal_or_first_maker', 'sub_title',
            'copyright_holder', 'more_title', 'acquisition_method',
            'acquisition_date', 'acquisition_credit_line', 'dating_presenting_date',
            'dating_sorting_date', 'dating_period', 'dating_year_early',
            'dating_year_late',
        ]
    ]
    + [
        StringLengthBlock(col)
        for col in [
            'title', 'description', 'long_title',
            'principal_maker', 'principal_or_first_maker', 'sub_title',
        ]
    ]
)

In [31]:
def run_blocks(all_df, input_df, blocks, y=None, test=False):
    """Run every block on ``input_df`` and concatenate the outputs column-wise.

    Args:
        all_df: Frame passed as the first argument to each block's ``fit``.
        input_df: Frame to create features for.
        blocks: Iterable of block objects exposing ``fit`` / ``transform``.
        y: Optional target forwarded to ``fit``.
        test: When false, ``block.fit(all_df, input_df, y=y)`` is called;
            when true, ``block.transform(input_df)`` is called instead.

    Returns:
        A DataFrame with the outputs of all blocks side by side; every column
        name gets the suffix ``@<BlockClassName>``. Empty when ``blocks`` is
        empty.

    Raises:
        AssertionError: If a block returns a different number of rows than
            ``input_df`` has.
    """
    print(decorate('start run blocks...'))

    outputs = []
    with Timer(prefix='run test={}'.format(test)):
        # Iterate the `blocks` argument (not a module-level list) so callers
        # can actually choose which blocks are run.
        for block in blocks:
            with Timer(prefix='\t={}'.format(str(block))):
                if not test:
                    out_i = block.fit(all_df, input_df, y=y)
                else:
                    out_i = block.transform(input_df)

            assert len(input_df) == len(out_i), block
            name = block.__class__.__name__
            outputs.append(out_i.add_suffix(f'@{name}'))

    if not outputs:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(outputs, axis=1)

In [32]:
# Stack train and test rows (with a fresh 0..n-1 index); this frame is what
# each block's fit() receives as `all_df`.
all_df = pd.concat([train_df, test_df], ignore_index=True)
# test=False (default): every block is fitted, producing the train features.
run_blocks(all_df, train_df, blocks=feature_blocks)
# test=True: only transform() is called, reusing what the blocks learned above.
run_blocks(all_df, test_df, blocks=feature_blocks, test=True)

★★★★★★★★★★★★★★★★★★★★ start run blocks... ★★★★★★★★★★★★★★★★★★★★
	=<__main__.OneHotEncoding object at 0x7f1beca806a0> 0.016[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80880> 0.020[s]
	=<__main__.OneHotEncoding object at 0x7f1beca808e0> 0.016[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80a90> 0.030[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80af0> 0.014[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80b50> 0.013[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80bb0> 0.004[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80c70> 0.016[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80cd0> 0.004[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80d30> 0.009[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80d90> 0.006[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80df0> 0.015[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80e50> 0.012[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80eb0> 0.003[s]
	=<__main__.OneHotEncoding object at 0x7f1beca80f10> 0.014[s]
	=<__mai

Unnamed: 0,title=Portret van een onbekende vrouw@OneHotEncoding,title=Portret van een onbekende man@OneHotEncoding,title=Portret van een onbekende jonge vrouw@OneHotEncoding,title=Portrait of a Man@OneHotEncoding,title=Portrait of a Woman@OneHotEncoding,title=Portret van een vrouw@OneHotEncoding,title=Portret van een man@OneHotEncoding,title=Portret van een onbekende jongen@OneHotEncoding,title=Self-Portrait@OneHotEncoding,description=Deze foto maakt deel uit van een album.@OneHotEncoding,...,CE_dating_sorting_date@CountEncodingBlock,CE_dating_period@CountEncodingBlock,CE_dating_year_early@CountEncodingBlock,CE_dating_year_late@CountEncodingBlock,StringLength_title@StringLengthBlock,StringLength_description@StringLengthBlock,StringLength_long_title@StringLengthBlock,StringLength_principal_maker@StringLengthBlock,StringLength_principal_or_first_maker@StringLengthBlock,StringLength_sub_title@StringLengthBlock
0,0,0,0,0,0,0,0,0,0,0,...,458.0,10135,458.0,765.0,51,,84,18,18,17.0
1,0,0,0,0,0,0,0,0,0,0,...,45.0,6022,45.0,128.0,59,230.0,120,26,26,19.0
2,0,0,0,0,0,0,0,0,0,0,...,32.0,3247,32.0,82.0,36,97.0,59,15,15,17.0
3,0,0,0,0,0,0,0,0,0,0,...,266.0,10135,266.0,380.0,64,309.0,100,21,21,17.0
4,0,0,0,0,0,0,0,0,0,0,...,203.0,6022,203.0,100.0,52,334.0,70,10,10,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12003,0,0,0,0,0,0,0,0,0,0,...,148.0,10135,148.0,148.0,30,,52,14,14,15.0
12004,0,0,0,0,0,0,0,0,0,0,...,298.0,10135,298.0,361.0,126,36.0,180,23,23,17.0
12005,0,0,0,0,0,0,0,0,0,0,...,107.0,10135,107.0,83.0,33,,61,13,13,11.0
12006,0,0,0,0,0,0,0,0,0,0,...,148.0,10135,148.0,148.0,35,,57,14,14,15.0
