# Data cleaning examples

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw comma-separated table and report its (rows, columns) dimensions.
d = pd.read_csv("data/ecoli.data", sep=",")
d.shape

In [None]:
# Preview the first rows of the loaded table.
d.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the loaded table.
d.describe()

Observe:
- columns `gvh` and `aac` are not treated as numeric
- there are missing values in `alm1`


# 1. Fix NULL values in `alm1`

Check that the number of null values, together with the non-null count reported by `describe()`, adds up to the total number of rows.

In [None]:
# Number of missing entries in the `alm1` column.
d["alm1"].isnull().sum()

In [None]:
# Display the rows whose `alm1` entry is missing.
alm1_missing = d["alm1"].isnull()
d.loc[alm1_missing]

## Option 1: remove rows

In [None]:
# Option 1: reload the raw file and discard every row that contains a NaN
# in any column, then summarise what is left.
d = pd.read_csv("data/ecoli.data", delimiter=",").dropna()
d.describe()

In [None]:
# Mean of `alm1` over the rows currently held in `d`.
d["alm1"].mean()

## Option 2: set NaNs to a specific value
- zero
- the mean of that column
- etc.

In [None]:
# Option 2: reload the raw file and replace each missing `alm1` entry with
# the mean of that column.
d = pd.read_csv("data/ecoli.data", delimiter=",")
# Series.mean skips NaN entries by default, so the average is taken over the
# observed values only.
m = d["alm1"].mean()
# A single pre-formatted argument prints identically under the Python 2
# print statement and the Python 3 print function.
print("mean is %s" % m)
d.loc[d["alm1"].isnull(), "alm1"] = m

In [None]:
# Show the rows labelled 13, 37, 58 and 78 after the fill.
# NOTE(review): presumably these are the rows that had a missing `alm1` — confirm
# against the null-row listing earlier in the notebook.
d.loc[[13,37,58,78]]

## Option 3: draw samples from a normal distribution with mean and std of that column

In [None]:
# Option 3: reload the raw file and fill each missing `alm1` entry with a
# draw from a normal distribution parameterised by the column's own mean and
# standard deviation.
d = pd.read_csv("data/ecoli.data", delimiter=",")
m, s = np.mean(d["alm1"]), np.std(d["alm1"])
# A single pre-formatted argument prints identically under the Python 2
# print statement and the Python 3 print function.
print("mean is %s stdev is %s" % (m, s))

# Compute the mask once and reuse it for both the count and the assignment.
alm1_missing = d["alm1"].isnull()
nb_nulls = int(alm1_missing.sum())

# A dedicated, seeded generator makes the imputed values identical on every
# run without touching NumPy's global random state.
rng = np.random.RandomState(0)
d.loc[alm1_missing, "alm1"] = rng.normal(loc=m, scale=s, size=nb_nulls)


In [None]:
# Show the rows labelled 13, 37, 58 and 78 after the fill.
# NOTE(review): presumably these are the rows that had a missing `alm1` — confirm
# against the null-row listing earlier in the notebook.
d.loc[[13,37,58,78]]

# 2. Understand why `gvh` is not interpreted as numbers

     

In [None]:
# Reload the raw file and print the index label of every row whose `gvh`
# entry cannot be converted with float().
d = pd.read_csv("data/ecoli.data", delimiter=",")
for index, value in zip(d.index, d["gvh"]):
    try:
        float(value)
    except (TypeError, ValueError):
        # float() raises ValueError for unparsable text and TypeError for
        # unsupported types; anything else (e.g. KeyboardInterrupt) should
        # propagate instead of being silently swallowed.
        print(index)

In [None]:
# Show the rows labelled 27 and 78.
# NOTE(review): presumably the labels printed by the `gvh` scan above — confirm.
d.loc[[27,78]]

We decide to drop row 27 and repair row 78.

In [None]:
# Reload the raw file, drop the row labelled 27, then rebuild `gvh` as floats.
d = pd.read_csv("data/ecoli.data", delimiter=",")
d=d.drop(27)
# Each entry is passed through eval() before float(), so an entry that is a
# Python expression rather than a plain number literal is evaluated first.
# NOTE(review): eval() executes arbitrary code read from the data file; this is
# only acceptable for a trusted file. Replace with an explicit repair of the
# offending row once its exact content is confirmed.
d["gvh"] = [float(eval(i)) for i in d["gvh"]]

In [None]:
# Summary statistics after the `gvh` conversion.
d.describe()

# 3. Similarly with `chg`


In [None]:
# Reload the raw file and print the index label of every row whose `chg`
# entry cannot be converted with float().
d = pd.read_csv("data/ecoli.data", delimiter=",")
for index, value in zip(d.index, d["chg"]):
    try:
        float(value)
    except (TypeError, ValueError):
        # float() raises ValueError for unparsable text and TypeError for
        # unsupported types; anything else (e.g. KeyboardInterrupt) should
        # propagate instead of being silently swallowed.
        print(index)

In [None]:
# Show the row labelled 87 (kept as a one-row frame by the list selector).
# NOTE(review): presumably the label printed by the `chg` scan above — confirm.
d.loc[[87]]

In [None]:
# Remove the row labelled 87 from the working frame.
d = d.drop([87])

# 4. Putting it all together so far

In [None]:
# Full cleaning pipeline, starting again from the raw file:
#   1. fill missing `alm1` entries with draws from a normal distribution that
#      uses the column's own mean and standard deviation,
#   2. drop the rows labelled 27 and 87,
#   3. convert `gvh` and `chg` to floats.
d = pd.read_csv("data/ecoli.data", delimiter=",")

m, s = np.mean(d["alm1"]), np.std(d["alm1"])
# Compute the mask once and reuse it for both the count and the assignment.
alm1_missing = d["alm1"].isnull()
nb_nulls = int(alm1_missing.sum())
# A dedicated, seeded generator makes the imputed values identical on every
# run without touching NumPy's global random state.
rng = np.random.RandomState(0)
d.loc[alm1_missing, "alm1"] = rng.normal(loc=m, scale=s, size=nb_nulls)

d = d.drop([27, 87])

# Each entry is passed through eval() before float(), so an entry that is a
# Python expression rather than a plain number literal is evaluated first.
# NOTE(review): eval() executes arbitrary code read from the data file; this is
# only acceptable for a trusted file. Replace with explicit repairs once the
# exact content of the offending rows is confirmed.
for column in ("gvh", "chg"):
    d[column] = [float(eval(raw)) for raw in d[column]]

In [None]:
# Summary statistics of the cleaned table.
d.describe()

# 5. Inspecting data

In [None]:
# scatter_matrix lives in `pandas.plotting` since pandas 0.20; the legacy
# `pandas.tools.plotting` path is kept as a fallback for older installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the columns of `d`; the trailing semicolon
# suppresses the textual repr of the returned axes in the notebook output.
scatter_matrix(d, figsize=(15,15));

`chg` and `lib` seem to have outliers. Let's inspect their distributions:

In [None]:
# For each distinct value of `chg`, print how many rows carry it.
for value in np.unique(d["chg"]):
    n_points = (d["chg"] == value).sum()
    # A single pre-formatted argument reproduces the original output layout
    # and runs under both the Python 2 print statement and Python 3 print().
    print("value: %5.1f , data points: %s" % (value, n_points))

In [None]:
# For each distinct value of `lib`, print how many rows carry it.
for value in np.unique(d["lib"]):
    n_points = (d["lib"] == value).sum()
    # A single pre-formatted argument reproduces the original output layout
    # and runs under both the Python 2 print statement and Python 3 print().
    print("value: %5.1f , data points: %s" % (value, n_points))

We could now judge whether the distributions of these values are OK, or whether they might signal some pathology in the data.