# 1. Load Data

In [1]:
import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/a10.csv', parse_dates=['date'])
df

Unnamed: 0,date,value
0,1991-07-01,3.526591
1,1991-08-01,3.180891
2,1991-09-01,3.252221
3,1991-10-01,3.611003
4,1991-11-01,3.565869
...,...,...
199,2008-02-01,21.654285
200,2008-03-01,18.264945
201,2008-04-01,23.107677
202,2008-05-01,22.912510


In [2]:
df.describe()


Unnamed: 0,value
count,204.0
mean,10.69443
std,5.956998
min,2.81452
25%,5.844095
50%,9.319345
75%,14.289964
max,29.665356


# 2. Remove arbitrary data

In [3]:
df_missing = df.copy()


In [4]:
n_del = 30

In [5]:
import numpy as np

np.random.seed(123)

In [6]:
del_set = np.random.randint(low = 0, high=np.shape(df_missing)[0], size=n_del)


In [7]:
del_set = np.unique(del_set)
del_set

array([  2,  17,  32,  39,  47,  49,  55,  57,  66,  68,  73,  78,  83,
        84,  96,  98, 106, 109, 111, 113, 123, 126, 153, 164, 174, 195])

In [8]:
for i in del_set:
  print(i,"- before:",df_missing['value'][i])
  df_missing['value'][i] = np.nan
  print(i,"- after:",df_missing['value'][i])

2 - before: 3.252221
2 - after: nan
17 - before: 5.81054917
17 - after: nan
32 - before: 4.39407557
32 - after: nan
39 - before: 5.3016513
39 - after: nan
47 - before: 5.17078711
47 - after: nan
49 - before: 5.85527729
49 - after: nan
55 - before: 5.06979585
55 - after: nan
57 - before: 5.59712628
57 - after: nan
66 - before: 8.52447101
66 - after: nan
68 - before: 5.71430345
68 - after: nan
73 - before: 6.70491861
73 - after: nan
78 - before: 8.79851303
78 - after: nan
83 - before: 7.38338118
83 - after: nan
84 - before: 7.81349587
84 - after: nan
96 - before: 8.71742046
96 - after: nan
98 - before: 9.17711337
98 - after: nan
106 - before: 9.3868026
106 - after: nan
109 - before: 10.64375083
109 - after: nan
111 - before: 11.7100413
111 - after: nan
113 - before: 12.07913184
113 - after: nan
123 - before: 12.65213444
123 - after: nan
126 - before: 16.30026927
126 - after: nan
153 - before: 12.88264507
153 - after: nan
164 - before: 13.402392
164 - after: nan
174 - before: 23.486694
17

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
n_nan = np.isnan(df_missing['value']).sum()
print('The number of Nan is ', n_nan)

The number of Nan is  26


In [10]:
len(del_set)

26

# 3. LOCF: Last Observation Carried Forward

In [12]:
df_locf = df_missing.copy()

In [13]:
for i in range(np.shape(df_locf)[0]):
  if np.isnan(df_locf['value'][i]):
    try:
      df_locf['value'][i] = df_locf['value'][i-1]
      print('NaN Cell: ',df_locf['value'][i], '-- Previous Cell',df_locf['value'][i-1])
    except: #예외규칙: 예외는 'i=0'일 때를 의미
      df_locf['value'][i] = 0
      print('NaN Cell: ',df_locf['value'][i], '-- No Previous Cell')

NaN Cell:  3.180891 -- Previous Cell 3.180891
NaN Cell:  4.38653092 -- Previous Cell 4.38653092
NaN Cell:  3.84127758 -- Previous Cell 3.84127758
NaN Cell:  5.20445484 -- Previous Cell 5.20445484
NaN Cell:  5.19475419 -- Previous Cell 5.19475419
NaN Cell:  5.25674157 -- Previous Cell 5.25674157
NaN Cell:  8.32945212 -- Previous Cell 8.32945212
NaN Cell:  5.26255667 -- Previous Cell 5.26255667
NaN Cell:  8.60693721 -- Previous Cell 8.60693721
NaN Cell:  5.27791837 -- Previous Cell 5.27791837
NaN Cell:  7.05083102 -- Previous Cell 7.05083102
NaN Cell:  10.09623339 -- Previous Cell 10.09623339
NaN Cell:  7.06420058 -- Previous Cell 7.06420058
NaN Cell:  7.06420058 -- Previous Cell 7.06420058
NaN Cell:  8.16532298 -- Previous Cell 8.16532298
NaN Cell:  9.07096378 -- Previous Cell 9.07096378
NaN Cell:  8.47400037 -- Previous Cell 8.47400037
NaN Cell:  10.8342948 -- Previous Cell 10.8342948
NaN Cell:  9.90816186 -- Previous Cell 9.90816186
NaN Cell:  11.34015074 -- Previous Cell 11.34015074


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# 4. NOCB: Next Observation Carried Backward

In [14]:
df_nocb = df_missing.copy()

In [15]:
for i in range(np.shape(df_nocb)[0]):
  if np.isnan(df_nocb['value'][i]):
    try:
      df_nocb['value'][i] = df_nocb['value'][i+1]
      print('NaN cell: ', df_nocb['value'][i], ' -- Next Cell: ',df_nocb['value'][i+1])
    except:
      df_nocb['value'][i] = 0 # 예외규칙
      print('NaN cell: ', df_nocb['value'][i],'is last cell')

NaN cell:  3.611003  -- Next Cell:  3.611003
NaN cell:  6.19206769  -- Next Cell:  6.19206769
NaN cell:  4.07534073  -- Next Cell:  4.07534073
NaN cell:  5.77374216  -- Next Cell:  5.77374216
NaN cell:  5.25674157  -- Next Cell:  5.25674157
NaN cell:  5.49072901  -- Next Cell:  5.49072901
NaN cell:  5.26255667  -- Next Cell:  5.26255667
NaN cell:  6.110296  -- Next Cell:  6.110296
NaN cell:  5.27791837  -- Next Cell:  5.27791837
NaN cell:  6.21452908  -- Next Cell:  6.21452908
NaN cell:  7.25098761  -- Next Cell:  7.25098761
NaN cell:  5.91826076  -- Next Cell:  5.91826076
NaN cell:  nan  -- Next Cell:  nan
NaN cell:  7.43189221  -- Next Cell:  7.43189221
NaN cell:  9.07096378  -- Next Cell:  9.07096378
NaN cell:  9.25188674  -- Next Cell:  9.25188674
NaN cell:  9.56039945  -- Next Cell:  9.56039945
NaN cell:  9.90816186  -- Next Cell:  9.90816186
NaN cell:  11.34015074  -- Next Cell:  11.34015074
NaN cell:  14.49758109  -- Next Cell:  14.49758109
NaN cell:  13.67446631  -- Next Cell: 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# 5. Linear Interpolation

In [20]:
df_linear = df_missing.copy()

Unnamed: 0,date,value
0,1991-07-01,3.526591
1,1991-08-01,3.180891
2,1991-09-01,
3,1991-10-01,3.611003
4,1991-11-01,3.565869
...,...,...
199,2008-02-01,21.654285
200,2008-03-01,18.264945
201,2008-04-01,23.107677
202,2008-05-01,22.912510


In [22]:
for i in range(np.shape(df_linear)[0]):
  if np.isnan(df_linear['value'][i]):

    try: #첫번째와 마지막에 위치하지 않은 i
      if (np.isnan(df_linear['value'][i-1])==False) & (np.isnan(df_linear['value'][i+1])==False):
        df_linear['value'][i] = (df_linear['value'][i-1] + df_linear['value'][i+1])/2
        
      else: 
        if np.isnan(df_linear['value'][i-1]):
          df_linear['value'][i] = df_linear['value'][i+1]
        else: # np.isnan(df_locf['value'][i+1])
          df_linear['value'][i] = df_linear['value'][i-1]

    except: #첫번째와 마지막에 위치한 i
      if i-1 < 0:
        df_linear['value'][i] = df_linear['value'][i+1] # 첫번째일 경우
      else: # i-1 > 0
        df_linear['value'][i] = df_linear['value'][i-1] # 마지막일 경우

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [23]:
df_linear

Unnamed: 0,date,value
0,1991-07-01,3.526591
1,1991-08-01,3.180891
2,1991-09-01,3.395947
3,1991-10-01,3.611003
4,1991-11-01,3.565869
...,...,...
199,2008-02-01,21.654285
200,2008-03-01,18.264945
201,2008-04-01,23.107677
202,2008-05-01,22.912510
