## Preparing and Cleaning Data

In [3]:
# copy and paste this each time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
# Raw inputs: two ordinary Python lists, each containing a missing-value marker (np.nan)
nums = [1, 2, 3, 4, 5, np.nan]
values = ['Alice', 'Biggles', np.nan, 'Deidre']

# Wrap each list in a pandas Series
x = pd.Series(nums)
y = pd.Series(values)

# Individual members can be overwritten directly.
# None is Python's own marker for "nothing here"; pandas treats it as missing too.
y[0] = None

# Ways to test for missing members (only the final expression of a cell is displayed)
y.notnull()  # True where a value is present
y.notna()    # same check under a different name
y.isnull()   # True where a value is missing

# Compare the dtypes of the two series
y
x  # the NaN forces the integers to be stored as floats (dtype float64)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64

### Filtering

In [28]:
# common practice is to create a new object to contain the results of an operation
y1 = y.dropna() # this will drop all missing values: NaN and None
y1 # not displayed: only the last expression of a cell is shown
# by default, the original data is unchanged. but we CAN use inplace=True
y.dropna(inplace=True) # CAREFUL - the original data is lost
y # the NaN and None members are IRRETRIEVABLY removed

1    Biggles
3     Deidre
dtype: object

In [81]:
# Remember we can fill missing values
# we start with a new DataFrame
np.random.seed(0)  # fixed seed, so the same "random" values are produced on every run
values = np.random.randn(10, 3)  # 10 rows x 3 columns of standard-normal samples
# make a DataFrame from these values
df = pd.DataFrame(values)
# inject some missing values
df.iloc[:4, 1] = np.nan   # [start:stop-before] -> rows 0..3 of column 1
df.iloc[0:2, 2] = np.nan  # rows 0..1 of column 2
# REMEMBER: mean is calculated EXCLUDING NaN cells.
# sum() skips NaN and count() counts only the non-NaN cells, so this ratio
# equals df[1].mean() and is derived from the data rather than a pasted number.
df[1].sum() / df[1].count()

0.2840846814946514

In [86]:
# now we can see ways to deal with the missing values
# (none of these calls modify df; only the last expression of the cell is displayed)
df.fillna(0)  # simplest option: replace every NaN with a constant

# forward fill: copy the previous valid value into the gap
# axis=0 (the default) fills downwards within each column;
# axis=1 fills left-to-right within each row
# limit=1 fills at most one consecutive NaN per gap
# NOTE: fillna(method='ffill') is deprecated in recent pandas; ffill() is the replacement
cleaned = df.ffill(axis=1, limit=1)
cleaned

df.dropna()  # drops the entire row if it contains ANY NaN
# thresh counts the NON-missing cells: keep rows with at least TWO non-NaN values
# (with three columns, that drops the rows that have two or more NaN)
df.dropna(thresh=2)
df.fillna(df.mean())  # replace NaN with the mean FOR THAT COLUMN
# alternatively we can replace with actual values, chosen per column label
df.fillna({1: 0.5, 2: 0.25})  # remember we are NOT using inplace=True


Unnamed: 0,0,1,2
0,1.764052,0.5,0.25
1,2.240893,0.5,0.25
2,0.950088,0.5,-0.103219
3,0.410599,0.5,1.454274
4,0.761038,0.121675,0.443863
5,0.333674,1.494079,-0.205158
6,0.313068,-0.854096,-2.55299
7,0.653619,0.864436,-0.742165
8,2.269755,-1.454366,0.045759
9,-0.187184,1.532779,1.469359


In [91]:
# duplicated() marks each value that has already appeared earlier in the column
# (not displayed: only the last expression of a cell is shown)
df[2].duplicated()
# drop_duplicates() keeps the first occurrence of each value; the repeated NaN in
# column 1 count as duplicates of each other, so a single NaN remains in the result
df[1].drop_duplicates() # remember - this does NOT alter the original dataframe

0         NaN
4    0.121675
5    1.494079
6   -0.854096
7    0.864436
8   -1.454366
9    1.532779
Name: 1, dtype: float64

In [99]:
# replacing values
# plant a known marker value in one specific cell: row 0, column 0
df.iloc[0, 0] = 9.99
# replace() is not limited to one cell: it scans the ENTIRE DataFrame and swaps
# EVERY cell equal to 9.99 for 10 (inplace=True modifies df itself)
df.replace(to_replace=9.99, value=10, inplace=True)
df

Unnamed: 0,0,1,2
0,10.0,,
1,2.240893,,
2,0.950088,,-0.103219
3,0.410599,,1.454274
4,0.761038,0.121675,0.443863
5,0.333674,1.494079,-0.205158
6,0.313068,-0.854096,-2.55299
7,0.653619,0.864436,-0.742165
8,2.269755,-1.454366,0.045759
9,-0.187184,1.532779,1.469359
