# Project 1
## B-IT Pattern Recognition



Presented on 10-Dec-2015 by:

- Abdullah Abdullah

- Can Güney Aksakallı

- Kang Cifong

- Umut Hatipoğlu

***

## Task 1.1
### Remove the outliers

### Load the data first

- We went with approach **1** for reading multi-typed data, as in `whExample.py`
    
    + We like it because it is more explicit

    + Or use `pandas`
    
- We are using **Python 3.4**, so some modifications were made for compatibility

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook



In [2]:
# Structured record type: weight and height as floats, gender as a 1-byte
# string. The builtin `float` is used instead of the `np.float` alias, which
# was deprecated in NumPy 1.20 and removed in 1.24 (it was only ever an alias
# for the builtin, so the resulting dtype is unchanged).
dt = np.dtype([('w', float), ('h', float), ('g', 'S1')])  # g is byte-string

data = np.loadtxt('data/whData.dat', dtype=dt, comments='#', delimiter=None)

# Pull every field out of the structured array by name; np.array(...) makes
# an independent, contiguous copy of each column.
ws = np.array(data['w'])
hs = np.array(data['h'])
# The gender flags are read as bytes; decode them to text labels.
gs = np.array([g.decode('utf-8') for g in data['g']])

# Stack as rows (height, weight, gender): every column is one sample.
X = np.vstack((hs, ws, gs))  # data is going to be column-wise
# (X.transpose() would give the row-wise layout instead.)
X.shape

(3, 24)

### Raw Data
- Now, let's just plot it without modifications

    - We split the data based on gender

In [3]:
import pattrex.plotting_mpl as plt_rex
import pattrex.preprocessing as pre_rex

In [4]:
# Partition the samples by gender label and report the size of each group.
# NOTE(review): the positional arguments (True, 2, ['m', 'f']) presumably mean
# "column-wise data, labels in row 2, one group per listed label" -- confirm
# against pattrex.preprocessing.split_data.
X_male, X_female = pre_rex.split_data(X, True, 2, ['m', 'f'])
print("male :", X_male.shape[1], "; female :", X_female.shape[1])

# Numeric measurements only (heights in row 0, weights in row 1); column-wise.
X_ = np.vstack((hs, ws))

# Row-wise extrema: x follows the first row, y the second.
xmin, ymin = X_.min(axis=1)
xmax, ymax = X_.max(axis=1)

# Padded axis limits -- the offsets are purely cosmetic.
xlim = [xmin - 5, xmax + 15]
ylim = [-2, ymax + 10]

# A single axes shared by both groups.
fig = plt.figure()
axs = fig.add_subplot(111)

# Keyword arguments common to both scatter calls.
_raw_opts = dict(colwise_data=True, x_lim=xlim, y_lim=ylim, show=False,
                 axs=axs, set_aspect_equal=False)

plt_rex.plot2d(X_male, hatch='bo', plotlabel="male", **_raw_opts)
plt_rex.plot2d(X_female, hatch='ro', plotlabel="female", title="raw",
               **_raw_opts)

male : 17 ; female : 7


<IPython.core.display.Javascript object>

### Outliers!!

Outliers may give important insights

### Dealing with the outliers
- Here, we just ignore them

    - We keep ...
    > only those data for which both measurements are positive.
    

- We find the *unique* columns/rows in which any of the measurements is negative

    - Then we delete the entire columns/rows that contain such measurements

        + using `numpy.delete(...)`

In [5]:
# Keep only the two measurement rows of each group and convert them to
# floats before filtering out the outliers.
# The builtin `float` replaces `np.float`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
# NOTE(review): the second positional argument (True) presumably flags
# column-wise data -- confirm against pattrex.preprocessing.only_all_positive.
X_male_new = pre_rex.only_all_positive(X_male[0:2, :].astype(float), True)
X_female_new = pre_rex.only_all_positive(X_female[0:2, :].astype(float), True)

# plotting: raw data on top, filtered data below
fig = plt.figure(figsize=(6, 8))
axs1 = fig.add_subplot(211)
axs2 = fig.add_subplot(212)

# Both panels reuse the xlim / ylim computed for the raw scatter plot so the
# two views stay directly comparable.

plt_rex.plot2d(X_male, colwise_data=True, hatch='bo', x_lim=xlim, y_lim=ylim,
               show=False, axs=axs1, plotlabel="male")
plt_rex.plot2d(X_female, colwise_data=True, hatch='ro',
               x_lim=xlim, y_lim=ylim, show=False, axs=axs1,
               plotlabel="female", title="raw")

plt_rex.plot2d(X_male_new, colwise_data=True, hatch='bo', x_lim=xlim,
               y_lim=ylim, show=False, axs=axs2, plotlabel="male")
plt_rex.plot2d(X_female_new, colwise_data=True, hatch='ro', x_lim=xlim,
               y_lim=ylim, show=False, axs=axs2, plotlabel="female",
               title="without outliers")

<IPython.core.display.Javascript object>

## Task 1.2
### Fit normal distribution to data

### Find the mean and standard deviation of the height/weight data

- We used `numpy.mean(...)` and `numpy.std(...)`

- Then we used `scipy.stats.norm.pdf` to generate the normal distribution

In [6]:
import pattrex.fitting as fit_rex

### Plot

In [7]:
# Fit a normal distribution to the raw heights and to the raw weights.
# NOTE(review): the four returned values are presumably (mean, std, x grid,
# pdf values) -- confirm against pattrex.fitting.fit_normal_distribution.
h_mean, h_std, h_x, h_y = fit_rex.fit_normal_distribution(hs)
w_mean, w_std, w_x, w_y = fit_rex.fit_normal_distribution(ws)

# Shortcut: take the extrema from X_ (row 0 = heights, row 1 = weights)
# instead of computing them from hs and ws separately.
_row_min = X_.min(axis=1)
_row_max = X_.max(axis=1)
hmin, wmin = _row_min
hmax, wmax = _row_max

# Padded axis limits -- the offsets are purely cosmetic.
hlim = [hmin - 5, hmax + 15]
wlim = [wmin - 5, wmax + 15]

In [19]:
# Two panels side by side: height on the left, weight on the right.
fig = plt.figure(figsize=(12, 4))
axs1 = fig.add_subplot(121)
axs2 = fig.add_subplot(122)

# Keyword arguments shared by every call that draws on the same panel.
_h_opts = dict(colwise_data=True, x_lim=hlim, show=False, axs=axs1)
_w_opts = dict(colwise_data=True, x_lim=wlim, show=False, axs=axs2)

# Height: samples laid out along y = 0, fitted curve, then the mean marker.
_h_points = np.vstack((hs, np.zeros(hs.shape)))
_h_curve = np.vstack((h_x, h_y))
_h_title = "Height (m: %.3f ; s: %.3f)" % (h_mean, h_std)
plt_rex.plot2d(_h_points, hatch='ko', plotlabel="height", **_h_opts)
plt_rex.plot2d(_h_curve, hatch='y', plotlabel="normal", title=_h_title,
               **_h_opts)
axs1.axvline(x=h_mean)

# Weight: same layout on the right-hand panel.
_w_points = np.vstack((ws, np.zeros(ws.shape)))
_w_curve = np.vstack((w_x, w_y))
_w_title = "Weight (m: %.3f ; s: %.3f)" % (w_mean, w_std)
plt_rex.plot2d(_w_points, hatch='ro', plotlabel="weight", **_w_opts)
plt_rex.plot2d(_w_curve, hatch='g', plotlabel="normal", title=_w_title,
               **_w_opts)
axs2.axvline(x=w_mean)

<IPython.core.display.Javascript object>

<matplotlib.lines.Line2D at 0x10b8dac88>

### Outliers!!

### Plot (without outliers)

In [9]:
# Filter the measurement matrix through only_all_positive, then split it
# back into its height row and its weight row.
# NOTE(review): the second positional argument (True) presumably flags
# column-wise data -- confirm against pattrex.preprocessing.only_all_positive.
X_new = pre_rex.only_all_positive(X_, True)
h_new, w_new = X_new[0, :], X_new[1, :]

# Refit the normal distributions on the filtered measurements.
(h_mean_new, h_std_new,
 h_x_new, h_y_new) = fit_rex.fit_normal_distribution(h_new)
(w_mean_new, w_std_new,
 w_x_new, w_y_new) = fit_rex.fit_normal_distribution(w_new)

In [21]:
# 2x2 grid: raw fits on the top row, outlier-free fits on the bottom row;
# height in the left column, weight in the right column.
fig = plt.figure(figsize=(12, 8))
axs1 = fig.add_subplot(221)
axs2 = fig.add_subplot(222)
axs3 = fig.add_subplot(223)
axs4 = fig.add_subplot(224)

# Axis limits derived from the filtered measurements in X_new.
_new_min = X_new.min(axis=1)
_new_max = X_new.max(axis=1)
hmin_new, wmin_new = _new_min
hmax_new, wmax_new = _new_max

hlim_new = [hmin_new - 5, hmax_new + 15]  # padding is purely cosmetic
wlim_new = [wmin_new - 5, wmax_new + 15]

# NOTE(review): wlim_new is computed but never used -- both weight panels are
# drawn with the earlier `wlim`, while both height panels use hlim_new.
# Confirm whether that asymmetry is intentional.
_opts1 = dict(colwise_data=True, x_lim=hlim_new, show=False, axs=axs1)
_opts2 = dict(colwise_data=True, x_lim=wlim, show=False, axs=axs2)
_opts3 = dict(colwise_data=True, x_lim=hlim_new, show=False, axs=axs3)
_opts4 = dict(colwise_data=True, x_lim=wlim, show=False, axs=axs4)

# Top left: raw heights with their fitted curve and mean.
_title = "Height raw (m: %.3f ; s: %.3f)" % (h_mean, h_std)
plt_rex.plot2d(np.vstack((hs, np.zeros(hs.shape))), hatch='ko',
               plotlabel="height", **_opts1)
plt_rex.plot2d(np.vstack((h_x, h_y)), hatch='y', plotlabel="normal",
               title=_title, **_opts1)
axs1.axvline(x=h_mean)

# Top right: raw weights with their fitted curve and mean.
_title = "Weight raw (m: %.3f ; s: %.3f)" % (w_mean, w_std)
plt_rex.plot2d(np.vstack((ws, np.zeros(ws.shape))), hatch='ro',
               plotlabel="weight", **_opts2)
plt_rex.plot2d(np.vstack((w_x, w_y)), hatch='g', plotlabel="normal",
               title=_title, **_opts2)
axs2.axvline(x=w_mean)

# Bottom left: filtered heights with their fitted curve and mean.
_title = "Height (m: %.3f ; s: %.3f)" % (h_mean_new, h_std_new)
plt_rex.plot2d(np.vstack((h_new, np.zeros(h_new.shape))), hatch='ko',
               plotlabel="height", **_opts3)
plt_rex.plot2d(np.vstack((h_x_new, h_y_new)), hatch='y', plotlabel="normal",
               title=_title, **_opts3)
axs3.axvline(x=h_mean_new)

# Bottom right: filtered weights with their fitted curve and mean.
_title = "Weight (m: %.3f ; s: %.3f)" % (w_mean_new, w_std_new)
plt_rex.plot2d(np.vstack((w_new, np.zeros(w_new.shape))), hatch='ro',
               plotlabel="weight", **_opts4)
plt_rex.plot2d(np.vstack((w_x_new, w_y_new)), hatch='g', plotlabel="normal",
               title=_title, **_opts4)
axs4.axvline(x=w_mean_new)

<IPython.core.display.Javascript object>

<matplotlib.lines.Line2D at 0x10c7d9978>

### Outliers


We see how the presence of outliers can mess things up for this simple model

Central moments such as the variance (and the mean they are computed around) are not robust to outliers