In [1]:
import pandas as pd
import numpy as np

import os

def load_hotel_reserve():
  customer_tb = pd.read_csv('../awesomebook-master/data/customer.csv')
  hotel_tb = pd.read_csv('../awesomebook-master/data/hotel.csv')
  reserve_tb = pd.read_csv('../awesomebook-master/data/reserve.csv')
  return customer_tb, hotel_tb, reserve_tb


def load_holiday_mst():
  holiday_tb = pd.read_csv('../awesomebook-master/data/holiday_mst.csv',
                           index_col=False)
  return holiday_tb


def load_production():
  production_tb = pd.read_csv('../awesomebook-master/data/production.csv')
  return production_tb


def load_production_missing_num():
  production_tb = pd.read_csv('../awesomebook-master/data/production_missing_num.csv')
  return production_tb


def load_production_missing_category():
  production_tb = pd.read_csv('../awesomebook-master/data/production_missing_category.csv')
  return production_tb


def load_monthly_index():
  monthly_index_tb = \
    pd.read_csv('../awesomebook-master/data/monthly_index.csv')
  return monthly_index_tb


def load_meros_txt():
  with open('../awesomebook-master/data/txt/meros.txt', 'r') as f:
    meros = f.read()
    f.close()
  return meros


In [2]:
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

In [None]:
customer_tb[['sex_is_man']] = (customer_tb[['sex']] == 'man').astype('bool')

customer_tb['sex_c'] = \
  pd.Categorical(customer_tb['sex'], categories=['man', 'woman'])

# customer_tb['sex_c'] = customer_tb['sex_c'].astype('category')

customer_tb['sex_c'].cat.codes

customer_tb['sex_c'].cat.categories


Index(['man', 'woman'], dtype='object')

In [None]:
customer_tb['sex'] = pd.Categorical(customer_tb['sex'])

dummy_vars = pd.get_dummies(customer_tb['sex'], drop_first=False)

dummy_vars

Unnamed: 0,man,woman
0,True,False
1,True,False
2,False,True
3,True,False
4,True,False
...,...,...
995,True,False
996,True,False
997,False,True
998,False,True


In [None]:
customer_tb['age_rank'] = \
  pd.Categorical(np.floor(customer_tb['age']/10)*10)

#customer_tb['age_rank'].cat.add_categories(['60 이상'], inplace=True)
customer_tb['age_rank'] = customer_tb['age_rank'].cat.add_categories(['60 이상'])


customer_tb.loc[customer_tb['age_rank'] \
           .isin([60.0, 70.0, 80.0]), 'age_rank'] = '60 이상'

customer_tb['age_rank'] = customer_tb['age_rank'].cat.remove_unused_categories()


In [None]:

customer_tb['sex_and_age'] = pd.Categorical(
  customer_tb[['sex', 'age']]

    .apply(lambda x: '{}_{}'.format(x[0], np.floor(x[1] / 10) * 10),
           axis=1)
)
customer_tb['sex_and_age']

  .apply(lambda x: '{}_{}'.format(x[0], np.floor(x[1] / 10) * 10),


0        man_40.0
1        man_30.0
2      woman_40.0
3        man_40.0
4        man_30.0
          ...    
995      man_40.0
996      man_30.0
997    woman_30.0
998    woman_40.0
999      man_30.0
Name: sex_and_age, Length: 1000, dtype: category
Categories (14, object): ['man_20.0', 'man_30.0', 'man_40.0', 'man_50.0', ..., 'woman_50.0', 'woman_60.0', 'woman_70.0', 'woman_80.0']

In [None]:

production = load_production()

fault_cnt_per_type = production \
  .query('fault_flg') \
  .groupby('type')['fault_flg'] \
  .count()

type_cnt = production.groupby('type')['fault_flg'].count()

production['type_fault_rate'] = production[['type', 'fault_flg']] \
  .apply(lambda x:
         (fault_cnt_per_type[x[0]] - int(x[1])) / (type_cnt[x[0]] - 1),
         axis=1)


  (fault_cnt_per_type[x[0]] - int(x[1])) / (type_cnt[x[0]] - 1),


In [None]:
production_missc_tb = load_production_missing_category()

from sklearn.neighbors import KNeighborsClassifier

production_missc_tb.replace('None', np.nan, inplace=True)

train = production_missc_tb.dropna(subset=['type'], inplace=False)

test = production_missc_tb \
  .loc[production_missc_tb.index.difference(train.index), :]

kn = KNeighborsClassifier(n_neighbors=3)

kn.fit(train[['length', 'thickness']], train['type'])

test['type'] = kn.predict(test[['length', 'thickness']])
