...

In [2]:
# seaborn 라이브러리의 tips 데이터 집합 확인
import pandas as pd
import seaborn as sns

In [3]:
tips = sns.load_dataset('tips')

print(tips.shape)
print(tips.columns)

(244, 7)
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')


In [18]:
# 협연여부 데이터를 문자열로 변환하여 저장
tips['smoker_str'] = tips['smoker'].astype(str)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [5]:
# 전체금액 데이터를 문자열로 변환
tips['total_bill'] = tips['total_bill'].astype(str)
print(tips.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [6]:
# 전체금액 데이터를 실수로 변환
tips['total_bill'] = tips['total_bill'].astype(float)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [7]:
# 잘못입력한 데이터 처리하기

# 숫자형태의 데이터에 문자열을 입력하면 object 형태로 자동 변경됨
# astype()으로 숫자 변환 시에는 문자 데이터 처리가 불가능함

#ex_df.loc[1, 'age'] = 'empty'
#print(ex_df.dtypes)

In [8]:
# ex_df['age'] = ex_df['age'].astype(int)

In [9]:
tips.loc[[1,3,5,7],['total_bill']] = 'missing'

In [10]:
tips['total_bill'].astype

<bound method NDFrame.astype of 0        16.99
1      missing
2        21.01
3      missing
4        24.59
        ...   
239      29.03
240      27.18
241      22.67
242      17.82
243      18.78
Name: total_bill, Length: 244, dtype: object>

In [11]:
# float 데이터에 문자열을 입력하여 object 데이터로 자동 변환
tips_sub_miss = tips.head(10)
tips_sub_miss.loc[[1,3,5,7], 'total_bill'] = 'missing'

print(tips_sub_miss.dtypes)
print(tips_sub_miss)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object
  total_bill   tip     sex smoker  day    time  size smoker_str
0      16.99  1.01  Female     No  Sun  Dinner     2         No
1    missing  1.66    Male     No  Sun  Dinner     3         No
2      21.01  3.50    Male     No  Sun  Dinner     3         No
3    missing  3.31    Male     No  Sun  Dinner     2         No
4      24.59  3.61  Female     No  Sun  Dinner     4         No
5    missing  4.71    Male     No  Sun  Dinner     4         No
6       8.77  2.00    Male     No  Sun  Dinner     2         No
7    missing  3.12    Male     No  Sun  Dinner     4         No
8      15.04  1.96    Male     No  Sun  Dinner     2         No
9      14.78  3.23    Male     No  Sun  Dinner     2         No


In [12]:
tips = sns.load_dataset('tips')
tips['total_bill'] = tips['total_bill'].astype(str)
tips.dtypes

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [13]:
tips['total_bill'] = pd.to_numeric(tips['total_bill'])
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [14]:
# 잘못된 값을 누락값으로 변환
pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')

0    16.99
1      NaN
2    21.01
3      NaN
4    24.59
5      NaN
6     8.77
7      NaN
8    15.04
9    14.78
Name: total_bill, dtype: float64

In [15]:
# 잘못된 값 무시
pd.to_numeric(tips_sub_miss['total_bill'], errors='ignore')

0      16.99
1    missing
2      21.01
3    missing
4      24.59
5    missing
6       8.77
7    missing
8      15.04
9      14.78
Name: total_bill, dtype: object

#### 카테고리 자료형의 장점과 특징
 - 용량과 속도면에서 매우 효율적
 - 주로 동일한 문자열이 반복되어 데이터를 구성하는 경우에 사용

In [16]:
tips['smoker'] = tips['smoker'].astype('str')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    object  
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(3), float64(2), int64(1), object(1)
memory usage: 8.9+ KB


In [17]:
tips['smoker'] = tips['smoker'].astype('category')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
