In [1]:
import pandas as pd
import numpy as np

import os

def load_hotel_reserve():
  customer_tb = pd.read_csv('../awesomebook-master/data/customer.csv')
  hotel_tb = pd.read_csv('../awesomebook-master/data/hotel.csv')
  reserve_tb = pd.read_csv('../awesomebook-master/data/reserve.csv')
  return customer_tb, hotel_tb, reserve_tb


def load_holiday_mst():
  holiday_tb = pd.read_csv('../awesomebook-master/data/holiday_mst.csv',
                           index_col=False)
  return holiday_tb


def load_production():
  production_tb = pd.read_csv('../awesomebook-master/data/production.csv')
  return production_tb


def load_production_missing_num():
  production_tb = pd.read_csv('../awesomebook-master/data/production_missing_num.csv')
  return production_tb


def load_production_missing_category():
  production_tb = pd.read_csv('../awesomebook-master/data/production_missing_category.csv')
  return production_tb


def load_monthly_index():
  monthly_index_tb = \
    pd.read_csv('../awesomebook-master/data/monthly_index.csv')
  return monthly_index_tb


def load_meros_txt():
  with open('../awesomebook-master/data/txt/meros.txt', 'r') as f:
    meros = f.read()
    f.close()
  return meros


In [2]:
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

In [14]:
import pandas as pd

print(type(40000 / 3))

print(int(40000 / 3))

print(float(40000 / 3))

df = pd.DataFrame({'value': [40000 / 3]})

print(df.dtypes)

print(df['value'].astype('int8'))
print(df['value'].astype('int16'))
print(df['value'].astype('int32'))
print(df['value'].astype('int64'))

print(df['value'].astype('float16'))
print(df['value'].astype('float32'))
print(df['value'].astype('float64'))
print(df['value'].astype('float128'))

print(df['value'].astype(int))
print(df['value'].astype(float))


<class 'float'>
13333
13333.333333333334
value    float64
dtype: object
0    21
Name: value, dtype: int8
0    13333
Name: value, dtype: int16
0    13333
Name: value, dtype: int32
0    13333
Name: value, dtype: int64
0    13336.0
Name: value, dtype: float16
0    13333.333008
Name: value, dtype: float32
0    13333.333333
Name: value, dtype: float64
0    13333.333333
Name: value, dtype: float128
0    13333
Name: value, dtype: int64
0    13333.333333
Name: value, dtype: float64




In [4]:
reserve_tb['total_price_log'] = \
  reserve_tb['total_price'].apply(lambda x: np.log10(x / 1000 + 1))

reserve_tb['total_price_log']

0       1.992111
1       1.334454
2       1.539076
3       2.290925
4       1.839478
          ...   
4025    1.230449
4026    1.631444
4027    1.879669
4028    2.733197
4029    1.654177
Name: total_price_log, Length: 4030, dtype: float64

In [5]:
customer_tb['age_rank'] = \
  (np.floor(customer_tb['age'] / 10) * 10).astype('category')

print(customer_tb['age_rank'])


0      40.0
1      30.0
2      40.0
3      40.0
4      30.0
       ... 
995    40.0
996    30.0
997    30.0
998    40.0
999    30.0
Name: age_rank, Length: 1000, dtype: category
Categories (7, float64): [20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0]


In [6]:
from sklearn.preprocessing import StandardScaler

reserve_tb['people_num'] = reserve_tb['people_num'].astype(float)

ss = StandardScaler()

result = ss.fit_transform(reserve_tb[['people_num', 'total_price']])

reserve_tb['people_num_normalized'] = [x[0] for x in result]
reserve_tb['total_price_normalized'] = [x[1] for x in result]

reserve_tb['people_num_normalized']

0       1.300709
1      -0.483753
2      -0.483753
3       1.300709
4       0.408478
          ...   
4025   -0.483753
4026   -0.483753
4027   -0.483753
4028    1.300709
4029   -1.375984
Name: people_num_normalized, Length: 4030, dtype: float64

In [7]:
reserve_tb = reserve_tb[
  (abs(reserve_tb['total_price'] - np.mean(reserve_tb['total_price'])) /
   np.std(reserve_tb['total_price']) <= 3)
].reset_index()


In [8]:
production_tb = load_production()

from sklearn.decomposition import PCA

pca = PCA(n_components=2)

pca_values = pca.fit_transform(production_tb[['length', 'thickness']])

print('누적 기여울: {0}'.format(sum(pca.explained_variance_ratio_)))
print('각 차원의 기여율: {0}'.format(pca.explained_variance_ratio_))

pca_newvalues = pca.transform(production_tb[['length', 'thickness']])

pca_newvalues

누적 기여울: 0.9999999999999999
각 차원의 기여율: [0.97897794 0.02102206]


array([[  76.96838157,   13.38906936],
       [-112.11469337,    8.24884796],
       [ -76.1994339 ,  -11.19027127],
       ...,
       [  31.12100559,  -15.48152593],
       [-117.87675543,   -2.4361334 ],
       [   4.80243541,   15.32174872]], shape=(1000, 2))

In [9]:
production_miss_num = load_production_missing_num()

production_miss_num.replace('None', np.nan, inplace=True)

production_miss_num.dropna(subset=['thickness'], inplace=True)

In [10]:
production_miss_num.replace('None', np.nan, inplace=True)

production_miss_num['thickness'].fillna(1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  production_miss_num['thickness'].fillna(1, inplace=True)


In [11]:
production_miss_num.replace('None', np.nan, inplace=True)

production_miss_num['thickness'] = \
  production_miss_num['thickness'].astype('float64')

thickness_mean = production_miss_num['thickness'].mean()

production_miss_num['thickness'].fillna(thickness_mean, inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  production_miss_num['thickness'].fillna(thickness_mean, inplace=True)
