In [2]:
# 누락데이터 처리
import pandas as pd
import seaborn as sns

In [5]:
# Load the Titanic passenger dataset through seaborn and display it.
df = sns.load_dataset(name='titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [13]:
# Tally the values of the 'deck' column with value_counts().
# First with NaN excluded (the default behaviour, spelled out here) ...
print(df['deck'].value_counts(dropna=True), '\n')

# ... then with NaN included as its own bucket.
deck_cnt = df.loc[:, 'deck'].value_counts(dropna=False)
print(deck_cnt)

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: deck, dtype: int64 

NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64


In [15]:
# Locate missing data with isnull(): True where the value is missing,
# False where a value is present. Only the first five rows are shown.
print(df['deck'].isnull().head(), '\n')

0       True
1      False
2       True
3      False
4       True
       ...  
886     True
887    False
888     True
889    False
890     True
Name: deck, Length: 891, dtype: bool 



In [17]:
# Count missing values by combining isnull() with sum():
# first for the 'deck' column alone, then per column for the whole frame.
print(df['deck'].isnull().sum(), '\n')
df.isnull().sum(axis='index')

688 



survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [20]:
# Count the NaN entries of every column with an explicit loop.

# Boolean frame: True where the original value is missing.
nan_data = df.isnull()

for i in nan_data.columns:
    # value_counts() on a boolean column is indexed by True/False; the True
    # label is absent when the column has no missing values.
    nan_cnt = nan_data[i].value_counts()

    # Series.get returns the fallback 0 when there is no True label, so no
    # bare `except:` is needed (which would also hide unrelated errors).
    print(i, ':', nan_cnt.get(True, 0))

survived : 0
pclass : 0
sex : 0
age : 177
sibsp : 0
parch : 0
fare : 0
embarked : 2
class : 0
who : 0
adult_male : 0
deck : 688
embark_town : 2
alive : 0
alone : 0


In [56]:
# NaN values have been located -> decide how to handle them:
# drop the columns that contain NaN, or drop the rows that contain NaN?

# dropna(axis=1, thresh=500): `thresh` is the minimum number of non-NaN
# values a column needs in order to be KEPT. Columns with fewer than 500
# valid values are dropped (e.g. 'deck', with 688 missing out of 891).
df_thresh = df.dropna(axis=1, thresh=500)

# Drop the rows whose 'age' is NaN.
df_age = df.dropna(subset=['age'], how='any', axis=0)

# Boolean mask over the columns: True where the column has at least one NaN.
# It must stay a plain boolean Series; wrapping it in a list makes
# df.columns[...] perform multi-dimensional indexing, which warns on older
# pandas and fails on newer releases.
nan_col = df.isnull().sum() > 0

# Names of the columns that contain NaN.
nan_col_names = list(df.columns[nan_col])
nan_col_names

# Drop every row that has a NaN in any of those columns.
df_nan = df.dropna(subset=nan_col_names, how='any', axis=0)
df_nan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 1 to 889
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     201 non-null    int64   
 1   pclass       201 non-null    int64   
 2   sex          201 non-null    object  
 3   age          201 non-null    float64 
 4   sibsp        201 non-null    int64   
 5   parch        201 non-null    int64   
 6   fare         201 non-null    float64 
 7   embarked     201 non-null    object  
 8   class        201 non-null    category
 9   who          201 non-null    object  
 10  adult_male   201 non-null    bool    
 11  deck         201 non-null    category
 12  embark_town  201 non-null    object  
 13  alive        201 non-null    object  
 14  alone        201 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 20.1+ KB


  result = getitem(key)


In [57]:
# Replace missing data: df.fillna(value), or forward/backward fill (ffill / bfill).
df[df.columns].isnull().sum()

# Replace the NaN ages with the mean age, working on a copy so that `df`
# itself is left untouched.
df_age = df.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form acts on a temporary object and
# is a silent no-op under pandas Copy-on-Write.
df_age['age'] = df_age['age'].fillna(df_age['age'].mean())
df_age.isnull().sum()
print(df['age'].head(10), df_age['age'].head(10))

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64 0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64


In [65]:
# Replace missing data with the most frequent value.
# Inspect embark_town rows 825-831.
df.embark_town[825:832]
df['embark_town'].value_counts().idxmax()  # label of the most frequent value

df_em = df.copy()
# Assign the filled column back rather than using fillna(inplace=True) on the
# column selection; the chained in-place form does not reliably update df_em
# (it is a no-op under pandas Copy-on-Write).
df_em['embark_town'] = df_em['embark_town'].fillna(
    df_em['embark_town'].value_counts().idxmax()
)
df_em['embark_town'].iloc[825:832]

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
830      Cherbourg
831    Southampton
Name: embark_town, dtype: object

In [70]:
# Replace missing data with the previous row's value (forward fill).
df_me = df.copy()
# Series.ffill() replaces the deprecated fillna(method='ffill'); the result is
# assigned back because a chained inplace=True call on a column selection
# does not reliably modify df_me.
df_me['embark_town'] = df_me['embark_town'].ffill()

# Compare rows 828 and 829 after and before the fill.
print(df_me['embark_town'].loc[[828, 829]], '\n', df['embark_town'].loc[[828, 829]])

828    Queenstown
829    Queenstown
Name: embark_town, dtype: object 
 828    Queenstown
829           NaN
Name: embark_town, dtype: object


In [82]:
# Duplicate handling: duplicated() flags each row (or value) that repeats an
# earlier one; drop_duplicates() removes those rows.
df1 = pd.DataFrame(dict(
    c1=['a', 'a', 'b', 'a', 'b'],
    c2=[1, 1, 1, 2, 2],
    c3=[1, 1, 2, 2, 2],
))

# Row-wise flags over all columns, and flags for the 'c1' column alone.
df_dup = df1.duplicated(keep='first')
col_dup = df1['c1'].duplicated(keep='first')

# Drop rows that are identical to an earlier row in every column.
df2 = df1.drop_duplicates(keep='first')

# Drop rows whose ('c2', 'c3') pair already appeared earlier; all columns are
# kept, only rows are removed.
df3 = df1.drop_duplicates(subset=['c2', 'c3'], keep='first')

print(df1)
df2

  c1  c2  c3
0  a   1   1
1  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


Unnamed: 0,c1,c2,c3
0,a,1,1
2,b,1,2
3,a,2,2
4,b,2,2


In [90]:
# Exercise: take the 'age', 'fare', 'class' and 'alive' columns of titanic as
# df_titanic, replace NaN in numeric columns with the mean and in string
# columns with the previous value, then drop duplicated rows and columns.

df = sns.load_dataset('titanic')
col = ['age', 'fare', 'class', 'alive']
# .copy() makes df_titanic an independent frame, so later modifications do not
# raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
df_titanic = df[col].copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   age     714 non-null    float64 
 1   fare    891 non-null    float64 
 2   class   891 non-null    category
 3   alive   891 non-null    object  
dtypes: category(1), float64(2), object(1)
memory usage: 22.0+ KB


In [91]:
# Drop the rows that are exact duplicates of an earlier row.
# The result is rebound instead of using inplace=True, which raises
# SettingWithCopyWarning when df_titanic was taken as a slice of another frame.
df_titanic = df_titanic.drop_duplicates()
len(df_titanic)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


746

In [93]:
# Keep only the first row for each distinct 'age' value.
# NOTE(review): this discards every passenger who shares an age with an
# earlier row - confirm that this is really the intended de-duplication.
# Rebinding avoids inplace=True on a frame that may be a slice of another.
df_titanic = df_titanic.drop_duplicates(subset=['age'])
len(df_titanic)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


89

In [95]:
# Data standardization.
# The CSV is read with header=None, so its first line is treated as data and
# the columns initially get integer labels.
df = pd.read_csv('./dataset/auto-mpg.csv', header=None)

# Attach descriptive column names, in file order.
df.columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'name',
]

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [98]:
# Unit conversion: miles per gallon (mpg) -> kilometres per litre (kpl).
# 1 mile = 1.60934 km and 1 US gallon = 3.78541 L, so the factor is km/L per
# mpg; it has nothing to do with kilograms.
mpg_to_kpl = 1.60934 / 3.78541
mpg_to_kg = mpg_to_kpl  # legacy (misnamed) alias kept for backward compatibility
df['kpl'] = (df['mpg'] * mpg_to_kpl).round(2)

In [103]:
import numpy as np

# Type conversion: 'horsepower' was read as object and must become float.
df['horsepower'].unique()  # inspecting the values reveals a '?' placeholder

# Turn the '?' placeholder into NaN and cast to float in one step. The result
# is assigned back to the column: replace(..., inplace=True) on a column
# selection acts on a temporary object and is a no-op under Copy-on-Write.
df['horsepower'] = df['horsepower'].replace('?', np.nan).astype('float')

array([130., 165., 150., 140., 198., 220., 215., 225., 190., 170., 160.,
        95.,  97.,  85.,  88.,  46.,  87.,  90., 113., 200., 210., 193.,
        nan, 100., 105., 175., 153., 180., 110.,  72.,  86.,  70.,  76.,
        65.,  69.,  60.,  80.,  54., 208., 155., 112.,  92., 145., 137.,
       158., 167.,  94., 107., 230.,  49.,  75.,  91., 122.,  67.,  83.,
        78.,  52.,  61.,  93., 148., 129.,  96.,  71.,  98., 115.,  53.,
        81.,  79., 120., 152., 102., 108.,  68.,  58., 149.,  89.,  63.,
        48.,  66., 139., 103., 125., 133., 138., 135., 142.,  77.,  62.,
       132.,  84.,  64.,  74., 116.,  82.])

In [107]:
# Convert the numeric origin codes to country names, then to a category dtype.
df['origin'].unique()

# Map the codes to names. The result is assigned back with bracket notation:
# replace(..., inplace=True) through attribute access acts on a temporary
# object and does not reliably update the frame.
df['origin'] = df['origin'].replace({1: 'usa', 2: 'eu', 3: 'jp'})
df['origin'].unique()

# Store the country names as a categorical column.
df['origin'] = df['origin'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float64 
 1   cylinders     398 non-null    int64   
 2   displacement  398 non-null    float64 
 3   horsepower    392 non-null    float64 
 4   weight        398 non-null    float64 
 5   acceleration  398 non-null    float64 
 6   model year    398 non-null    int64   
 7   origin        398 non-null    category
 8   name          398 non-null    object  
 9   kpl           398 non-null    float64 
dtypes: category(1), float64(6), int64(2), object(1)
memory usage: 28.6+ KB


In [117]:
# Model year: treat it as a categorical label rather than a number.
df['model year'].unique()

# Convert each year to its string form, then store the column as a category.
df['model year'] = df['model year'].apply(str).astype('category')

df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   mpg           398 non-null    float64 
 1   cylinders     398 non-null    int64   
 2   displacement  398 non-null    float64 
 3   horsepower    392 non-null    float64 
 4   weight        398 non-null    float64 
 5   acceleration  398 non-null    float64 
 6   model year    398 non-null    category
 7   origin        398 non-null    category
 8   name          398 non-null    object  
 9   kpl           398 non-null    float64 
dtypes: category(2), float64(6), int64(1), object(1)
memory usage: 26.6+ KB


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,kpl
0,18.0,8,307.0,130.0,3504.0,12.0,70,usa,chevrolet chevelle malibu,7.65
1,15.0,8,350.0,165.0,3693.0,11.5,70,usa,buick skylark 320,6.38
2,18.0,8,318.0,150.0,3436.0,11.0,70,usa,plymouth satellite,7.65


7