In [19]:
import pandas as pd
import seaborn as sns

tips = sns.load_dataset("tips")

print(tips.shape)
print(tips.columns)

(244, 7)
Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')


In [20]:
tips['smoker_str'] = tips['smoker'].astype(str)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [22]:
tips['total_bill'] = tips['total_bill'].astype(str)
print(tips.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [23]:
tips['total_bill'] = tips['total_bill'].astype(float)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [27]:
#잘못 입력한 데이터 처리하기
#숫자형태의 데이터에 문자열을 입력하면 object형태로 자동 변경됨
#astype()으로 숫자 변환 시에는 문자 데이터 처리가 불가능함

#잘못 입력한 데이터 처리하기(to_numeric)
#to_numeric 메소드를 사용해도 비슷한 오류가 발생하지만
#errors옵션으로 'raise', 'coerce', 'ignore'를 지정하여 제어 가능
#raise: 숫자로 변활할 수 없는 값이 있으면 오류 발생(기본값)
#coerce: 숫자로 변환할 수 없는 값을 누락값으로 지정
#ignore: 아무 작업도 하지 않음

In [28]:
tips_sub_miss = tips.head(10)
tips_sub_miss.loc[[1,3,5,7],'total_bill'] = 'missing'

print(tips_sub_miss.dtypes)
print(tips_sub_miss)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object
  total_bill   tip     sex smoker  day    time  size smoker_str
0      16.99  1.01  Female     No  Sun  Dinner     2         No
1    missing  1.66    Male     No  Sun  Dinner     3         No
2      21.01  3.50    Male     No  Sun  Dinner     3         No
3    missing  3.31    Male     No  Sun  Dinner     2         No
4      24.59  3.61  Female     No  Sun  Dinner     4         No
5    missing  4.71    Male     No  Sun  Dinner     4         No
6       8.77  2.00    Male     No  Sun  Dinner     2         No
7    missing  3.12    Male     No  Sun  Dinner     4         No
8      15.04  1.96    Male     No  Sun  Dinner     2         No
9      14.78  3.23    Male     No  Sun  Dinner     2         No


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tips_sub_miss.loc[[1,3,5,7],'total_bill'] = 'missing'


In [30]:
#잘못된 값을 누락값으로 변환
pd.to_numeric(tips_sub_miss['total_bill'], errors = 'coerce')

0    16.99
1      NaN
2    21.01
3      NaN
4    24.59
5      NaN
6     8.77
7      NaN
8    15.04
9    14.78
Name: total_bill, dtype: float64

In [31]:
#잘못된 값 무시
pd.to_numeric(tips_sub_miss['total_bill'], errors = 'ignore')

0      16.99
1    missing
2      21.01
3    missing
4      24.59
5    missing
6       8.77
7    missing
8      15.04
9      14.78
Name: total_bill, dtype: object

In [32]:
#카테고리 자료형의 장점과 특징
# 용량과 속도 면에서 매우 효율적
# 주로 동일한 문자열이 반복되어 데이터를 구성하는 경우에 사용
tips['smoker'] = tips['smoker'].astype('str')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    object  
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   smoker_str  244 non-null    object  
dtypes: category(3), float64(2), int64(1), object(2)
memory usage: 10.8+ KB


In [33]:
tips['smoker'] = tips['smoker'].astype('category')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   smoker_str  244 non-null    object  
dtypes: category(4), float64(2), int64(1), object(1)
memory usage: 9.3+ KB
