# Missing Data 처리 유형 1 : Missing Data를 포함하는 행은 제거
### Missing Data 비율 56%인 `depart`
### Missing Data 비율 47%인 `sunrise & sunset` 모두 제외


In [1]:
import pandas as pd
import numpy as np

## Weather Data : Identify Missing Data

In [2]:
# Load the weather observations; the path is relative to the notebook's working directory.
weather = pd.read_csv("../data/weather.csv")

In [3]:
# Quick look at the loaded frame: column labels, then (rows, columns), then the last rows.
print(weather.columns, weather.shape, sep="\n")
weather.tail()

Index(['station_nbr', 'date', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'sunrise', 'sunset', 'codesum', 'snowfall',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
       'avgspeed'],
      dtype='object')
(20517, 20)


Unnamed: 0,station_nbr,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
20512,16,2014-10-31,53,34,44,M,35,41,21,0,-,-,,M,0.0,29.9,29.99,4.5,3,5.3
20513,17,2014-10-31,59,34,47,M,32,41,18,0,-,-,RA,0.0,0.0,29.72,30.39,9.2,1,9.3
20514,18,2014-10-31,67,49,58,-4,40,50,7,0,0644,1738,,0.0,0.0,29.78,30.28,10.6,36,11.2
20515,19,2014-10-31,45,33,39,-6,24,32,26,0,0624,1646,RA SN,0.1,0.02,29.51,30.24,20.4,34,20.9
20516,20,2014-10-31,68,50,59,M,39,50,6,0,-,-,,0.0,0.0,29.57,30.27,10.5,36,11.2


`depart, sunrise, sunset` 제외 (아래 코드의 컬럼 목록에는 `date`도 없으므로 `date` 역시 함께 제외됨)

In [4]:
# Keep only the listed columns. Besides `depart`, `sunrise` and `sunset`, the
# `date` column is also missing from this list, so it is dropped as well.
# NOTE(review): confirm that dropping `date` is intended.
weather = pd.DataFrame(weather, columns=['station_nbr', 'tmax', 'tmin', 'tavg', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'codesum', 'snowfall',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
       'avgspeed'])

In [5]:
# Preview the first rows after the column selection.
weather.head()

Unnamed: 0,station_nbr,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
0,1,52,31,42,36,40,23,0,RA FZFG BR,M,0.05,29.78,29.92,3.6,20,4.6
1,2,48,33,41,37,39,24,0,RA,0.0,0.07,28.82,29.91,9.1,23,11.3
2,3,55,34,45,24,36,20,0,,0.0,0.0,29.77,30.47,9.9,31,10.0
3,4,63,47,55,28,43,10,0,,0.0,0.0,29.79,30.48,8.0,35,8.2
4,6,63,34,49,31,43,16,0,,0.0,0.0,29.95,30.47,14.0,36,13.8


### Missing Data 유형 (`M`, `T`, `-`, `공백`, `공백 T`)

In [6]:
# Print the distinct values of every column to spot the placeholder strings used for missing data.
for label, values in weather.items():
    print(label, "\n", values.unique())

station_nbr 
 [ 1  2  3  4  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20  5]
tmax 
 ['52' '48' '55' '63' '50' '66' '34' '73' '72' '38' '56' '59' '46' '45' '60'
 '44' '21' '65' '43' '53' '25' '32' '28' '62' '61' '47' '30' '26' '71' '58'
 '39' '54' '23' '27' '31' '68' '70' '67' '36' '69' '74' '75' '33' '51' '57'
 '41' '49' '76' '35' '77' '64' '40' '16' '37' '42' '14' '22' '13' '10' '20'
 '24' '80' '4' '11' '78' '17' '18' '79' '29' '81' '82' '83' '88' '87' '92'
 '89' '84' '85' '86' '90' '91' '94' '93' '96' '95' 'M' '97' '98' '101' '99'
 '100' '105' '103' '102' '106' '104' '109' '107' '108' '110' '111' '112'
 '114' '113' '19' '15' '12' '-1' '5' '8' '9' '1' '6' '2' '3' '7' '-2' '-11'
 '-4' '-6' '0']
tmin 
 ['31' '33' '34' '47' '45' '19' '53' '48' '18' '26' '30' '38' '40' '25' '41'
 '28' '24' '37' '36' '11' '16' '29' '22' '35' '15' '9' '21' '27' '10' '5'
 '32' '20' '3' '8' '44' '46' '17' '55' '43' '57' '50' '49' '54' '42' '39'
 '52' '23' '56' '14' '59' '-1' '12' '6' '4' '2' '60' '51' '63' '-5

### codesum의 모든 빈칸은 moderate 날씨

In [7]:
def codesum_character(codesum):
    """Map a blank weather code to the label "moderate".

    Args:
        codesum: One value taken from the ``codesum`` column.

    Returns:
        ``"moderate"`` when ``codesum`` equals a single space (``' '``);
        otherwise ``codesum`` itself, unchanged.
    """
    # Only the exact one-space string counts as blank; anything else passes through.
    return "moderate" if codesum == ' ' else codesum

In [8]:
# Relabel blank `codesum` entries as "moderate" first, so the blanket
# replacement of ' ' with NaN further down does not mark them as missing.
weather['codesum'] = weather['codesum'].apply(codesum_character)

In [9]:
# Placeholder strings that this notebook treats as missing data.
# NOTE(review): 'T' presumably marks trace amounts rather than absent readings —
# confirm that discarding it as missing is intended.
missing_markers = ['-', ' ', 'M', 'T', '  T']

# One DataFrame-wide replace swaps every cell equal to a placeholder for NaN,
# instead of running five separate passes over each column.
weather = weather.replace(missing_markers, np.nan)

In [10]:
# Print the distinct values again to check that the placeholder strings are now NaN.
for label, values in weather.items():
    print(label, "\n", values.unique())

station_nbr 
 [ 1  2  3  4  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20  5]
tmax 
 ['52' '48' '55' '63' '50' '66' '34' '73' '72' '38' '56' '59' '46' '45' '60'
 '44' '21' '65' '43' '53' '25' '32' '28' '62' '61' '47' '30' '26' '71' '58'
 '39' '54' '23' '27' '31' '68' '70' '67' '36' '69' '74' '75' '33' '51' '57'
 '41' '49' '76' '35' '77' '64' '40' '16' '37' '42' '14' '22' '13' '10' '20'
 '24' '80' '4' '11' '78' '17' '18' '79' '29' '81' '82' '83' '88' '87' '92'
 '89' '84' '85' '86' '90' '91' '94' '93' '96' '95' nan '97' '98' '101' '99'
 '100' '105' '103' '102' '106' '104' '109' '107' '108' '110' '111' '112'
 '114' '113' '19' '15' '12' '-1' '5' '8' '9' '1' '6' '2' '3' '7' '-2' '-11'
 '-4' '-6' '0']
tmin 
 ['31' '33' '34' '47' '45' '19' '53' '48' '18' '26' '30' '38' '40' '25' '41'
 '28' '24' '37' '36' '11' '16' '29' '22' '35' '15' '9' '21' '27' '10' '5'
 '32' '20' '3' '8' '44' '46' '17' '55' '43' '57' '50' '49' '54' '42' '39'
 '52' '23' '56' '14' '59' '-1' '12' '6' '4' '2' '60' '51' '63' '-5

### 수치형 컬럼을 float64로 변환 (`station_nbr`, `codesum` 제외)

In [11]:
# Measurement columns to convert; `station_nbr` and `codesum` are deliberately not listed.
float_columns = ['tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat', 'cool',
                 'snowfall', 'preciptotal', 'stnpressure', 'sealevel',
                 'resultspeed', 'resultdir', 'avgspeed']

# Cast every listed column to float64 in one call; unlisted columns keep their dtype.
weather = weather.astype(dict.fromkeys(float_columns, np.float64))

In [12]:
# Check the result of the conversion: column labels, then (rows, columns), then the first rows.
print(weather.columns, weather.shape, sep="\n")
weather.head()

Index(['station_nbr', 'tmax', 'tmin', 'tavg', 'dewpoint', 'wetbulb', 'heat',
       'cool', 'codesum', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed'],
      dtype='object')
(20517, 16)


Unnamed: 0,station_nbr,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
0,1,52.0,31.0,42.0,36.0,40.0,23.0,0.0,RA FZFG BR,,0.05,29.78,29.92,3.6,20.0,4.6
1,2,48.0,33.0,41.0,37.0,39.0,24.0,0.0,RA,0.0,0.07,28.82,29.91,9.1,23.0,11.3
2,3,55.0,34.0,45.0,24.0,36.0,20.0,0.0,moderate,0.0,0.0,29.77,30.47,9.9,31.0,10.0
3,4,63.0,47.0,55.0,28.0,43.0,10.0,0.0,moderate,0.0,0.0,29.79,30.48,8.0,35.0,8.2
4,6,63.0,34.0,49.0,31.0,43.0,16.0,0.0,moderate,0.0,0.0,29.95,30.47,14.0,36.0,13.8


### np.nan 제거

In [13]:
# Work on an independent copy: a plain assignment would only alias `weather`,
# which is still needed unchanged for the row-count comparison later on.
weather_2 = weather.copy()

In [14]:
# Size and first rows of the working frame before any rows are removed.
print(weather_2.shape)
weather_2.head()

(20517, 16)


Unnamed: 0,station_nbr,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
0,1,52.0,31.0,42.0,36.0,40.0,23.0,0.0,RA FZFG BR,,0.05,29.78,29.92,3.6,20.0,4.6
1,2,48.0,33.0,41.0,37.0,39.0,24.0,0.0,RA,0.0,0.07,28.82,29.91,9.1,23.0,11.3
2,3,55.0,34.0,45.0,24.0,36.0,20.0,0.0,moderate,0.0,0.0,29.77,30.47,9.9,31.0,10.0
3,4,63.0,47.0,55.0,28.0,43.0,10.0,0.0,moderate,0.0,0.0,29.79,30.48,8.0,35.0,8.2
4,6,63.0,34.0,49.0,31.0,43.0,16.0,0.0,moderate,0.0,0.0,29.95,30.47,14.0,36.0,13.8


In [15]:
# Drop every row that contains at least one NaN in any column, then preview the result.
weather_2 = weather_2.dropna(how='any')
weather_2.head()

Unnamed: 0,station_nbr,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
1,2,48.0,33.0,41.0,37.0,39.0,24.0,0.0,RA,0.0,0.07,28.82,29.91,9.1,23.0,11.3
2,3,55.0,34.0,45.0,24.0,36.0,20.0,0.0,moderate,0.0,0.0,29.77,30.47,9.9,31.0,10.0
3,4,63.0,47.0,55.0,28.0,43.0,10.0,0.0,moderate,0.0,0.0,29.79,30.48,8.0,35.0,8.2
4,6,63.0,34.0,49.0,31.0,43.0,16.0,0.0,moderate,0.0,0.0,29.95,30.47,14.0,36.0,13.8
5,7,50.0,33.0,42.0,26.0,35.0,23.0,0.0,moderate,0.0,0.0,29.15,30.54,10.3,32.0,10.2


# 결과

행 9173개 제거

In [16]:
# Number of rows removed: row count before minus row count after the NaN filter.
weather.shape[0] - weather_2.shape[0]

9173

In [17]:
# Final size and first rows of the cleaned frame.
print(weather_2.shape)
weather_2.head()

(11344, 16)


Unnamed: 0,station_nbr,tmax,tmin,tavg,dewpoint,wetbulb,heat,cool,codesum,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
1,2,48.0,33.0,41.0,37.0,39.0,24.0,0.0,RA,0.0,0.07,28.82,29.91,9.1,23.0,11.3
2,3,55.0,34.0,45.0,24.0,36.0,20.0,0.0,moderate,0.0,0.0,29.77,30.47,9.9,31.0,10.0
3,4,63.0,47.0,55.0,28.0,43.0,10.0,0.0,moderate,0.0,0.0,29.79,30.48,8.0,35.0,8.2
4,6,63.0,34.0,49.0,31.0,43.0,16.0,0.0,moderate,0.0,0.0,29.95,30.47,14.0,36.0,13.8
5,7,50.0,33.0,42.0,26.0,35.0,23.0,0.0,moderate,0.0,0.0,29.15,30.54,10.3,32.0,10.2


In [18]:
# Write the cleaned rows to the notebook's working directory.
# NOTE(review): the file name has no ".csv" extension, and to_csv writes the row
# index as the first column by default — confirm downstream readers expect both.
weather_2.to_csv("weather_ver_4")