In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
# Load the Boston house-prices dataset once and bind it to both the
# descriptive name and the short alias used by the cells below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the environment pins an older scikit-learn.
boston_house_prices_data = bhpd = load_boston()

In [2]:
# Print the dataset's DESCR attribute; print() is used so the multi-line text
# is shown with real line breaks rather than as an escaped string repr.
print(bhpd.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

# CLEANING DATA

1) REMOVING ROWS WITH MISSING DATA
2) FILLING IN DATA WITH AVERAGES
3) OTHER DATA IMPUTATION STRATEGIES

In [3]:
# Toy series of room counts with two missing entries (one np.nan, one None).
num_rooms = pd.Series(
    data=[1, np.nan, 2, 3, 1, None],
)
# Boolean mask: True wherever a value is missing.
num_rooms.isna()

0    False
1     True
2    False
3    False
4    False
5     True
dtype: bool

In [4]:
# Keep only the entries that are present, using a boolean mask.
num_rooms.loc[num_rooms.notna()]

0    1.0
2    2.0
3    3.0
4    1.0
dtype: float64

In [5]:
# Small 3x3 frame with missing values in columns 1 and 2.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 300, 5],
        [1, np.nan, np.nan],
    ]
)
df

Unnamed: 0,0,1,2
0,1,,2.0
1,2,300.0,5.0
2,1,,


In [6]:
# Cell-by-cell mask of missing values in the frame.
df.isna()

Unnamed: 0,0,1,2
0,False,True,False
1,False,False,False
2,False,True,True


In [7]:
# Strategy 1: drop every row that has at least one missing value.
df.dropna(axis=0, how="any")

Unnamed: 0,0,1,2
1,2,300.0,5.0


In [8]:
# Per-column averages; missing entries are skipped when averaging.
means = df.mean(axis="index")
means

0      1.333333
1    300.000000
2      3.500000
dtype: float64

In [9]:
# Strategy 2: replace each missing entry with the mean of its own column.
df.fillna(value=means)

Unnamed: 0,0,1,2
0,1,300.0,2.0
1,2,300.0,5.0
2,1,300.0,3.5


# Standardization and Normalization to Deal with Variables with Different Scales

In [10]:
# Four two-feature samples: two rows of zeros followed by two rows of ones.
data = [[value, value] for value in (0, 0, 1, 1)]
data

[[0, 0], [0, 0], [1, 1], [1, 1]]

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Standardize each column: learn its mean and standard deviation, then rescale.
StandardScaler().fit(data).transform(data)

array([[-1., -1.],
       [-1., -1.],
       [ 1.,  1.],
       [ 1.,  1.]])

In [13]:
from sklearn import preprocessing
# Scale every sample (row) to unit L2 norm; all-zero rows stay zero.
preprocessing.normalize(data, norm="l2", axis=1)

array([[0.        , 0.        ],
       [0.        , 0.        ],
       [0.70710678, 0.70710678],
       [0.70710678, 0.70710678]])

# ELIMINATING DUPLICATE ENTRIES

In [14]:
# Wrap the raw feature matrix in a DataFrame and list any rows that repeat an
# earlier row.
x = pd.DataFrame(data=bhpd.data)
x.loc[x.duplicated()]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12


In [17]:
# Append a copy of the first row so the frame contains one known duplicate.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# x.iloc[[0]] (list indexer) keeps the row as a one-row DataFrame, and
# ignore_index=True renumbers the result so the new row gets the next label.
# NOTE: this cell is not idempotent -- each re-run appends another copy.
x = pd.concat([x, x.iloc[[0]]], ignore_index=True)

In [19]:
# Dimensions of x as a (rows, columns) tuple.
x.shape

(507, 13)

In [20]:
# Rows that repeat an earlier row (the first occurrence is not flagged).
x.loc[x.duplicated(keep="first")]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
506,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98


In [21]:
# Remove repeated rows, keeping the first occurrence of each.
x = x.drop_duplicates(keep="first")

In [22]:
# Re-check for repeated rows after de-duplication.
x.loc[x.duplicated(keep="first")]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12


# Section 2: How do we learn rules to classify objects?

1) What is the Iris dataset?
2) How to get the Iris dataset
3) Classifying Irises by hand