# SIGNATE Cup 2024

## 概要

- 説明変数
    - 元データの各カラムをデータクレンジングしたもの
    - タイポフラグ
    - 欠損フラグ
- 説明変数の前処理
    - `Age` `DurationOfPitch` `MonthlyIncome` を数値変数、それ以外をカテゴリー変数とみなした。
    - 数値変数は正規化と欠損値補完 (`IterativeImputer`) を、カテゴリー変数はダミー変数化をそれぞれ施した。
    - L1正則化で絞り込んだ後、CVを見ながら手作業で変数を選択した。
- モデル
    - ロジスティック回帰モデル
- モデル選択
    - `StratifiedShuffleSplit(n_splits=100, test_size=0.5)`
- アンサンブル
    - なし
- スコア
    - CV: 0.8472193
    - Public LB: 0.8391560
    - Private LB: 0.8388465

## セットアップ

In [1]:
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
# Load the raw train and test CSV files (paths are relative to the working directory).
train_raw, test_raw = map(pd.read_csv, ('rawdata/train.csv', 'rawdata/test.csv'))

## 前処理

In [3]:
def cleanse(dirty):
    """Return a cleansed copy of the raw data.

    Free-text columns are parsed into numeric or normalised values, and
    companion flag columns record which notation or typo each raw value used.

    Args:
        dirty: DataFrame with the raw columns ``Age``, ``DurationOfPitch``,
            ``Gender``, ``NumberOfFollowups``, ``ProductPitched``,
            ``NumberOfTrips``, ``Designation``, ``MonthlyIncome`` and
            ``customer_info``. ``NumberOfFollowups`` is compared numerically;
            the other listed columns are accessed through ``.str`` / ``len``.

    Returns:
        A new DataFrame containing every input column (the parsed ones
        overwritten) plus the derived flag columns. ``dirty`` is not modified.
    """
    clean = dirty.copy()

    # --- Age -----------------------------------------------------------
    # Lookup table from kanji numerals 0-99 to digit strings. The chained
    # replace() calls drop the redundant parts of the naive "<tens>十<ones>"
    # spelling, e.g. 零十五 -> 五, 一十二 -> 十二, 二十零 -> 二十.
    kanji_digits = '零一二三四五六七八九'
    kanji_to_arabic = {
        f'{c2}十{c1}'.replace('零十', '').replace('一十', '十').replace('零', ''): f'{n2 * 10 + n1}'
        for n2, c2 in enumerate(kanji_digits)
        for n1, c1 in enumerate(kanji_digits)
    }
    clean['Age'] = (
        dirty['Age']
        .str.normalize('NFKC')  # full-width digits -> ASCII digits
        .str.replace(r'[歳才際代]', '', regex=True)  # strip the age suffix
        .replace(kanji_to_arabic)
        .astype(float)
    )
    # Which suffix character the raw value carried (NaN when none matched).
    clean['AgeTypo'] = dirty['Age'].str.extract(r'([歳才際代])')[0]
    # True when the raw value contains a kanji digit.
    clean['Age漢数字'] = dirty['Age'].str.contains(r'[一二三四五六七八九]', na=False)

    # --- DurationOfPitch: "<n>秒" or "<n>分", converted to seconds --------
    clean['DurationOfPitch'] = (
        dirty['DurationOfPitch']
        .str.extract(r'(\d+)([秒分])')
        .pipe(lambda df: df[0].astype(float) * np.where(df[1] == '分', 60, 1))
    )
    clean['DurationOfPitch分'] = dirty['DurationOfPitch'].str.contains('分', na=False)

    # --- Gender: fold width, case and embedded spaces --------------------
    clean['Gender'] = (
        dirty['Gender']
        .str.normalize('NFKC')
        .str.lower()
        .str.replace(' ', '')
    )
    # True when NFKC normalisation changed the raw value.
    clean['Gender全角'] = dirty['Gender'].str.normalize('NFKC') != dirty['Gender']
    # Raw string so that \s reaches the regex engine without an
    # invalid-escape warning from the Python parser.
    clean['GenderSpace'] = dirty['Gender'].str.contains(r'\s')

    # --- NumberOfFollowups: values >= 100 are treated as 100x typos ------
    clean['NumberOfFollowups'] = dirty['NumberOfFollowups'].mask(lambda s: s >= 100, lambda s: s / 100)
    clean['NumberOfFollowupsTypo'] = dirty['NumberOfFollowups'] >= 100

    # --- ProductPitched: encoded as the length of the raw string ---------
    clean['ProductPitched'] = dirty['ProductPitched'].apply(len)
    # True when the lower-cased value has a character outside a-z/whitespace.
    clean['ProductPitchedTypo'] = (
        dirty['ProductPitched']
        .str.lower()
        .str.contains(r'[^a-z\s]')
    )

    # --- NumberOfTrips: "<prefix><n>回", converted to trips per year ------
    # NOTE(review): only a single digit is captured, so a count of 10 or more
    # would be read as its first digit - confirm the data never contains one.
    clean['NumberOfTrips'] = (
        dirty['NumberOfTrips']
        .str.extract(r'([^\d]*)(\d)回?')
        .pipe(lambda df: np.select([df[0] == '年に', df[0] == '半年に', df[0] == '四半期に'], [1, 2, 4], 1) * df[1].astype(float))
    )
    # The non-digit prefix of the raw value; an empty prefix becomes NaN.
    clean['NumberOfTripsFreq'] = (
        dirty['NumberOfTrips']
        .str.extract(r'^([^\d]*)')[0]
        .mask(lambda s: s == '', np.nan)
    )

    # --- Designation: encoded as the length of the raw string ------------
    clean['Designation'] = dirty['Designation'].apply(len)
    clean['DesignationTypo'] = dirty['Designation'].str.lower().str.contains(r'[^a-z\s]')

    # --- MonthlyIncome: a number with any trailing text means x10000 -----
    clean['MonthlyIncome'] = (
        dirty['MonthlyIncome']
        .str.extract(r'([\d\.]+)([^\d\.]*)')
        .pipe(lambda df: df[0].astype(float) * np.where(df[1] == '', 1, 10000))
    )
    clean['MonthlyIncome万円'] = dirty['MonthlyIncome'].str.contains(r'[^\d\.]', na=False)

    # --- customer_info: "<marital status><sep><car><sep><children>" ------
    customer_info_split = (
        dirty['customer_info']
        .str.normalize('NFKC')
        .str.split(r'[、,/\s]', n=3)
    )
    clean['MaritalStatus'] = customer_info_split.apply(lambda x: x[0])
    # 0 when the car field contains "なし" or "未", otherwise 1.
    clean['OwnCar'] = np.where(customer_info_split.apply(lambda x: x[1]).str.contains('なし|未'), 0, 1)
    clean['NumberOfChildrenVisiting'] = (
        customer_info_split
        .apply(lambda x: x[2])
        .mask(lambda s: s == '子供有り', '1.4')  # Head-count unknown: store the average count.
        .mask(lambda s: s.str.contains('ゼロ|なし|無|非'), '0')
        .str.extract(r'([\d\.]+)')[0]
        .astype(float)
    )

    return clean

In [4]:
def trim(traintest):
    """Merge values that show a similar tendency by capping the upper tail.

    ``Age`` is capped at 40 and ``MonthlyIncome`` at 350000; missing values
    stay missing.

    Args:
        traintest: DataFrame with numeric ``Age`` and ``MonthlyIncome`` columns.

    Returns:
        A capped copy; the input frame is left untouched.
    """
    capped = traintest.copy()

    upper_bounds = {'Age': 40, 'MonthlyIncome': 350000}
    for column, bound in upper_bounds.items():
        capped[column] = capped[column].clip(upper=bound)

    return capped

In [5]:
def get_dummies(traintest):
    """One-hot encode the categorical columns.

    Each listed column is replaced by ``<column>_<level>`` indicator columns,
    including a ``<column>_nan`` indicator for missing values
    (``dummy_na=True``). Float-valued levels lose their trailing ``.0`` so
    that, for example, ``CityTier_1.0`` is named ``CityTier_1``.

    Args:
        traintest: DataFrame containing every column listed in
            ``categorical_columns`` below.

    Returns:
        A new DataFrame with the categorical columns replaced by indicators.
    """
    categorical_columns = [
        'AgeTypo',
        'TypeofContact',
        'CityTier',
        'Occupation',
        'Gender',
        'NumberOfPersonVisiting',
        'NumberOfFollowups',
        'ProductPitched',
        'PreferredPropertyStar',
        'NumberOfTrips',
        'NumberOfTripsFreq',
        'PitchSatisfactionScore',
        'Designation',
        'MaritalStatus',
        'NumberOfChildrenVisiting',
    ]
    encoded = pd.get_dummies(traintest, dummy_na=True, columns=categorical_columns)

    def strip_float_suffix(column):
        # Remove only a trailing ".0". A plain str.replace('.0', '') would
        # also rewrite names such as "x_1.05" into "x_15".
        return column[:-2] if column.endswith('.0') else column

    return encoded.rename(columns=strip_float_suffix)

In [6]:
def standardize(traintest):
    """Rescale the numeric variables.

    ``Age`` and ``MonthlyIncome`` use fixed offsets and widths;
    ``DurationOfPitch`` is centred on its own mean and divided by its own
    standard deviation.

    Args:
        traintest: DataFrame with numeric ``Age``, ``DurationOfPitch`` and
            ``MonthlyIncome`` columns.

    Returns:
        A rescaled copy; the input frame is left untouched.
    """
    result = traintest.copy()

    # column -> (offset subtracted, width divided by)
    fixed_scaling = {'Age': (30, 20), 'MonthlyIncome': (300000, 100000)}
    for column, (center, width) in fixed_scaling.items():
        result[column] = result[column].sub(center).div(width)

    pitch = result['DurationOfPitch']
    result['DurationOfPitch'] = pitch.sub(pitch.mean()).div(pitch.std())

    return result

In [7]:
def dummyna_fillna(traintest):
    """Add missing-value flags, then impute the missing values.

    For every column except ``ProdTaken`` that contains a missing value, a
    boolean ``<column>_nan`` column is added. Afterwards all columns except
    ``id``, ``customer_info``, ``ProdTaken`` and ``task`` are passed through
    ``IterativeImputer`` and overwritten with its output.

    Args:
        traintest: DataFrame containing ``id``, ``customer_info``,
            ``ProdTaken`` and ``task`` plus the feature columns.

    Returns:
        A flagged and imputed copy; the input frame is left untouched.
    """
    filled = traintest.copy()

    # The column list is taken before the loop, so the flag columns added
    # below are not themselves scanned.
    for name in filled.columns.drop(['ProdTaken']):
        missing = filled[name].isna()
        if missing.any():
            filled[f'{name}_nan'] = missing

    impute_columns = filled.columns.drop(['id', 'customer_info', 'ProdTaken', 'task'])
    imputer = IterativeImputer(max_iter=10, random_state=0)
    filled[impute_columns] = imputer.fit_transform(filled[impute_columns])

    return filled

In [8]:
# Stack train and test into one frame so that every preprocessing step is
# applied to both; the `task` column records where each row came from.
traintest = pd.concat(
    [train_raw.assign(task='train'), test_raw.assign(task='test')],
    ignore_index=True,
)

# Run the preprocessing steps in order.
traintest = (
    traintest
    .pipe(cleanse)
    .pipe(trim)
    .pipe(get_dummies)
    .pipe(standardize)
    .pipe(dummyna_fillna)
)

# Split back into train and test and drop the bookkeeping column.
train = traintest[traintest['task'] == 'train'].drop(columns=['task'])
test = traintest[traintest['task'] == 'test'].drop(columns=['task'])

## 学習

### 変数選択

In [9]:
# 説明変数
# Explanatory variables (model inputs).
# Names of the form `<column>_<level>` are dummy columns and `<column>_nan`
# marks a missing value. The list order is the feature order the models see.
feature_columns = [
    'Age',
    'AgeTypo_際',
    'AgeTypo_代',
    'TypeofContact_Company Invited',
    'CityTier_1',
    'Occupation_Large Business',
    'Gender_male',
    'NumberOfPersonVisiting_4',
    'NumberOfFollowups_1',
    'NumberOfFollowups_6',
    'NumberOfFollowupsTypo',
    'ProductPitched_6',
    'PreferredPropertyStar_4',
    'NumberOfTrips_2',
    'NumberOfTrips_5',
    'NumberOfTrips_7',
    'NumberOfTrips_nan',
    'Passport',
    'PitchSatisfactionScore_1',
    'Designation_9',
    'MonthlyIncome',
    'MaritalStatus_独身',
    'OwnCar',
    'NumberOfChildrenVisiting_2',
    'NumberOfChildrenVisiting_3',
    'NumberOfChildrenVisiting_nan'
]

# Target variable.
target_column = 'ProdTaken'

### クロスバリデーション

In [10]:
# Repeated stratified hold-out: 100 random splits, each using half of the
# training rows for fitting and the other half for scoring.
folds = StratifiedShuffleSplit(n_splits=100, test_size=0.5, random_state=0)
cv_models, cv_scores, cv_trains, cv_tests = [], [], [], []
for i, (cv_train_index, cv_test_index) in enumerate(folds.split(train[feature_columns], train[target_column])):
    cv_train = train.iloc[cv_train_index].copy()
    cv_test = train.iloc[cv_test_index].copy()

    # fit() returns the estimator itself, so construction and fitting chain.
    cv_model = LogisticRegression(solver='liblinear', penalty='l2', C=1.0, random_state=0).fit(
        cv_train[feature_columns],
        cv_train[target_column],
    )

    # Column 1 of predict_proba is kept as the prediction and scored by ROC AUC.
    cv_test[f'{target_column}_pred'] = cv_model.predict_proba(cv_test[feature_columns])[:, 1]
    cv_score = roc_auc_score(cv_test[target_column], cv_test[f'{target_column}_pred'])

    cv_models.append(cv_model)
    cv_scores.append(cv_score)
    cv_trains.append(cv_train)
    cv_tests.append(cv_test)

print(f'{np.mean(cv_scores)} ± {np.std(cv_scores)}')

0.8472193184502718 ± 0.008458509561811087


In [11]:
# アンサンブルした場合のスコア
cv_tests_all = pd.concat([cv_test.assign(k = k) for k, cv_test in enumerate(cv_tests)])
cv_tests_all['ProdTaken_pred_rank'] = cv_tests_all.groupby('k')['ProdTaken_pred'].rank()
roc_auc_score(
    cv_tests_all.groupby('id')['ProdTaken'].mean(),
    cv_tests_all.groupby('id')['ProdTaken_pred_rank'].mean()
)

0.8524458919291147

In [12]:
# 回帰係数
(
    pd.concat([
        pd.DataFrame({
            'feature': cv_model.feature_names_in_,
            'coef': cv_model.coef_[0]
        })
        for cv_model in cv_models
    ])
    .groupby('feature', as_index=False)['coef'].agg(['mean', 'std'])
    .sort_values('feature', ignore_index=True)
    .style.bar(vmin=-1.0, vmax=1.0)
)

Unnamed: 0,feature,mean,std
0,Age,-0.95676,0.184319
1,AgeTypo_代,-0.635645,0.203097
2,AgeTypo_際,-0.422092,0.278991
3,CityTier_1,-0.83115,0.122159
4,Designation_9,0.832685,0.159214
5,Gender_male,0.707231,0.123586
6,MaritalStatus_独身,0.802947,0.125527
7,MonthlyIncome,-1.002011,0.208837
8,NumberOfChildrenVisiting_2,-0.340799,0.142256
9,NumberOfChildrenVisiting_3,-0.374357,0.318478


### 予測用モデルの学習

In [13]:
# Fit the prediction model on all training rows with the same settings as the
# cross-validation models; fit() returns the fitted estimator itself.
model = LogisticRegression(solver='liblinear', penalty='l2', C=1.0, random_state=0).fit(
    train[feature_columns],
    train[target_column],
)

In [14]:
# 回帰係数
(
    pd.DataFrame({
        'feature': model.feature_names_in_,
        'coef': model.coef_[0]
    })
    .sort_values('feature', ignore_index=True)
    .style.bar(vmin=-1.0, vmax=1.0)
)

Unnamed: 0,feature,coef
0,Age,-0.950265
1,AgeTypo_代,-0.659465
2,AgeTypo_際,-0.415532
3,CityTier_1,-0.818671
4,Designation_9,0.852965
5,Gender_male,0.703024
6,MaritalStatus_独身,0.809665
7,MonthlyIncome,-0.997521
8,NumberOfChildrenVisiting_2,-0.340442
9,NumberOfChildrenVisiting_3,-0.484049


## 予測

In [15]:
# Store column 1 of the model's predict_proba output for the test rows in the target column.
test[target_column] = model.predict_proba(test[feature_columns])[:, 1]

## 提出

In [16]:
# submit = test[['id', target_column]].copy()
# submit.to_csv('submit/submit_20240901_04.csv', index=False, header=False)