In [1]:
import numpy as np
import pandas as pd

import warnings
# Install a blanket 'ignore' filter: every warning raised after this point is suppressed.
# NOTE(review): this also hides library deprecation warnings — consider narrowing the filter.
warnings.simplefilter('ignore')

In [3]:
# Load the data: read the train and test CSV files into DataFrames.
train_csv_path = './data/train.csv'
test_csv_path = './data/test.csv'

df_train_org = pd.read_csv(train_csv_path)
df_test_org = pd.read_csv(test_csv_path)

In [4]:
# Work on independent copies so that df_train_org / df_test_org remain the untouched
# reference data. A plain assignment would only bind a second name to the same object,
# and any in-place change to df_train would then also change df_train_org.
df_train = df_train_org.copy()
df_test = df_test_org.copy()

In [5]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,26,Travel_Rarely,450.941476,Research & Development,7.601074,3,Medical,1291,4,...,80,0,1,3,2,11,8,7,0,0
1,3,47,Travel_Rarely,730.235896,Research & Development,26.739489,3,Technical Degree,1587,2,...,80,1,20,1,1,5,9,4,4,0
2,7,26,Travel_Rarely,1082.560066,Human Resources,7.374739,3,Life Sciences,1591,2,...,80,1,4,3,3,3,2,0,2,0
3,10,46,Travel_Rarely,706.247579,Research & Development,14.791373,1,Medical,1572,1,...,80,1,6,0,1,5,4,7,7,1
4,11,25,Travel_Rarely,500.61086,Research & Development,2.146966,4,Medical,981,2,...,80,1,8,3,3,1,0,0,0,0


In [6]:
# List the column names.
print(df_train.columns)

# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line after the report.
df_train.info()

Index(['id', 'Age', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'NumCompaniesWorked', 'Over18', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'Attrition'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        1200 non-null   int64  
 1   Age                       1200 non-nul

## データ加工

In [7]:
# Approach adapted from the following notebook:
# https://www.kaggle.com/code/yairhadad1/kickstarter-predict-project-success-roc-0-78/notebook
from sklearn.base import BaseEstimator, TransformerMixin


class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects part of its input by indexing it with ``key``.

    Used as the first step of a Pipeline so that the following step only
    receives ``data[key]`` (for example a subset of DataFrame columns).

    Parameters
    ----------
    key
        Value used to index the input in ``transform`` (``data_dict[key]``).
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return this selector; ``x`` and ``y`` are not used."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected

In [8]:
# https://kakakakakku.hatenablog.com/entry/2021/11/29/084330
# A Pipeline lets preprocessing (and model training) be handled as a single object,
# which keeps the implementation readable.
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder

# Dummies: one-hot encode the categorical columns.
# handle_unknown='ignore' makes categories that were not present at fit time encode as
# all zeros instead of raising when another frame (e.g. the test set) is transformed.
# The encoder returns a sparse matrix by default, so no sparse flag is passed: the old
# `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4,
# and omitting it keeps this cell working on both older and newer releases.
one_hot_pipeline = Pipeline([
    ('hot_columns', ItemSelector(key=['BusinessTravel', 'Department',
                                      'EducationField', 'Gender', 'JobRole',
                                      'MaritalStatus'])),
    ('oneHowEncoder', OneHotEncoder(handle_unknown='ignore')),
])

# Binning: discretize the continuous features into 4 quantile-based bins per column,
# with each bin emitted as a one-hot column.
# NOTE(review): 'EmployeeNumber' looks like an identifier rather than a measurement —
# confirm that binning it is intended.
k_bins_pipeline = Pipeline([
    ('goal', ItemSelector(key=['Age', 'DailyRate', 'HourlyRate', 'MonthlyIncome',
                               'DistanceFromHome', 'EmployeeNumber', 'TotalWorkingYears',
                               'YearsAtCompany', 'YearsInCurrentRole',
                               'YearsSinceLastPromotion', 'YearsWithCurrManager'])),
    ('k_bins', KBinsDiscretizer(n_bins=4, encode='onehot', strategy='quantile')),
])

# FeatureUnion: concatenate the outputs of both sub-pipelines column-wise.
feature_pipeline = FeatureUnion([
    ('one_hot', one_hot_pipeline),
    ('k_bins', k_bins_pipeline),
])

# Fit on the training frame only, then apply the same fitted transform to both frames
# so that train and test end up with identical feature columns.
feature_pipeline.fit(df_train)

x_train = feature_pipeline.transform(df_train)
x_test = feature_pipeline.transform(df_test)


In [10]:
# Display the transformed training matrix (its repr reports shape, dtype and storage format).
x_train

<1200x69 sparse matrix of type '<class 'numpy.float64'>'
	with 20400 stored elements in Compressed Sparse Row format>

# 加工データの保存

In [27]:
# Write the train and test frames to CSV.
# NOTE(review): in the cells visible here df_train / df_test are not modified after loading,
# and the encoded features are held in x_train / x_test — confirm these frames are the ones
# that should be saved as "preprocessed".
# NOTE(review): the inputs are read from './data/' but these files go under '../data/' — confirm.
train_out_path = '../data/sig740/preprocessed_train.csv'
test_out_path = '../data/sig740/preprocessed_test.csv'

df_train.to_csv(train_out_path)
df_test.to_csv(test_out_path)

### input_01.csv
- 初期

### input_02.csv
- input_01
- main_category x category -> class

### input_03.csv
- input_01
- main_category x category -> class
- launched -> year/month/day

### input_04.csv・・・未作成（name除去だけなら3_学習でやればよい）
- input_03
- nameを除去