In [1]:

# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for every figure: axis labels slightly larger than tick labels.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "unsupervised_learning"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``.  The target
    directory is created when it does not exist yet, so the first call on a
    fresh checkout does not fail.

    Parameters
    ----------
    fig_id : str
        Base name of the output file, without the ``.png`` extension.
    tight_layout : bool, optional
        When true (the default), ``plt.tight_layout()`` is applied before the
        figure is written.
    """
    image_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # savefig does not create missing directories; do it here.  The explicit
    # isdir check (instead of exist_ok=True) keeps this working on Python 2.
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    path = os.path.join(image_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
# Build a small noisy 3-D dataset whose points lie close to a 2-D plane.
np.random.seed(4)  # fixed seed so the same points are generated on every run
m = 60  # number of samples
w1, w2 = 0.1, 0.3  # weights of the first two coordinates in the third one
noise = 0.1  # scale factor applied to the Gaussian noise terms

# Angles drawn uniformly from [-0.5, 3*pi/2 - 0.5).
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
# NOTE: every randn call consumes the global random stream, so the order of
# the three assignments below determines the generated values.
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
# Third coordinate: linear combination of the first two, plus noise.
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)

In [3]:
# PCA by hand: subtract the per-feature mean, then take the SVD.
X_centered = X - np.mean(X, axis=0)
U, s, Vt = np.linalg.svd(X_centered)
# The rows of Vt hold the principal directions; keep the first two.
c1, c2 = Vt[0], Vt[1]

In [4]:
# Expand the 1-D vector of singular values into the rectangular matrix S
# (same shape as X_centered) so that X_centered can be rebuilt as U . S . Vt.
m, n = X.shape

S = np.zeros(X_centered.shape)
np.fill_diagonal(S, s)  # writes s[i] at S[i, i]; all other entries stay zero

In [5]:

# Sanity check: multiplying the three SVD factors back together recovers X_centered.
np.allclose(X_centered, np.dot(np.dot(U, S), Vt))

True

In [6]:
# Projection matrix whose two columns are the first two principal directions.
W2 = Vt[:2].T
# Project the centered data onto the plane spanned by those directions.
X2D = np.dot(X_centered, W2)

In [7]:
# Keep the SVD-based projection under its own name; X2D is rebound when
# scikit-learn's PCA is fitted.
X2D_using_svd = X2D

In [8]:

from sklearn.decomposition import PCA

# Same 2-D projection via scikit-learn.  The uncentered X is passed here;
# scikit-learn's PCA is documented to center the data itself.
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)  # rebinds X2D (the SVD result is kept in X2D_using_svd)

In [9]:

# Principal axes found by the fitted PCA, one row per component.
pca.components_


array([[-0.93636116, -0.29854881, -0.18465208],
       [ 0.34027485, -0.90119108, -0.2684542 ]])

In [28]:
import numpy as np
import pandas as pd
import os
def load_dataset(filename, **read_csv_kwargs):
    """Load a CSV file into a DataFrame, reading placeholder strings as NaN.

    Parameters
    ----------
    filename : str or file-like
        Path (or open buffer) handed straight to ``pandas.read_csv``.
    **read_csv_kwargs
        Optional extra keyword arguments forwarded to ``pandas.read_csv``
        (for example ``index_col=0`` or ``header=None``).  When none are
        given the call is the original
        ``read_csv(filename, na_values=missing_values)``.  An explicit
        ``na_values`` argument replaces the placeholder list below.

    Returns
    -------
    pandas.DataFrame
        The parsed table; cells equal to one of the placeholder strings are
        read as missing values.
    """
    # Each placeholder is listed once (the empty string used to appear twice).
    missing_values = ["n/a", "na", "--", "?", ""]
    read_csv_kwargs.setdefault("na_values", missing_values)
    return pd.read_csv(filename, **read_csv_kwargs)

# Read the training and testing splits (training file first).
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column and numeric
# column labels such as '39' and '164'.  That suggests the CSV carries a saved
# index column and possibly no real header row -- confirm, because
# 'Unnamed: 0' is otherwise passed to the models as a feature.
traindf, testdf = (
    load_dataset(csv_name)
    for csv_name in ('Arrhythmia_TrainingData.csv', 'Arrhythmia_TestingData.csv')
)
traindf

Unnamed: 0,39,1,164,62,79,155,367,153,95,50,...,-0.1.1,0.0.31,9.7,-0.7,0.0.32,0.8.4,1.3.2,24.1.1,33.7,1.1.1
0,38,1,160,63,79,0,376,165,0,34,...,-0.4,0.0,10.1,0.0,0.0,0.0,1.5,26.2,37.0,1
1,44,0,178,89,106,183,380,147,94,-2,...,-0.4,-0.4,8.2,-1.3,0.0,0.5,0.8,17.9,24.1,1
2,35,1,162,68,80,156,364,134,116,60,...,-0.1,-0.6,10.2,-1.6,0.0,1.3,1.0,17.0,23.6,1
3,36,0,171,76,90,156,364,170,86,69,...,0.4,-0.6,9.8,-1.3,0.0,0.6,2.7,17.4,46.0,0
4,36,1,158,58,76,136,373,152,80,74,...,0.0,0.0,6.6,-1.0,0.0,0.8,2.5,12.7,32.7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,33,1,160,69,83,174,381,125,83,69,...,-0.2,0.0,12.3,0.0,0.0,0.4,1.1,34.4,41.2,1
231,50,0,168,80,95,159,358,166,96,-50,...,-0.2,0.0,8.1,-6.5,0.0,0.6,0.8,-5.9,0.1,1
232,35,1,158,60,87,138,383,178,68,61,...,-0.2,0.0,11.2,0.0,0.0,0.4,3.5,29.1,60.6,1
233,25,1,160,56,86,162,376,184,106,25,...,-0.4,0.0,17.2,0.0,0.0,1.3,2.6,48.1,73.0,1


In [16]:
# Preview the first rows of the training frame.
traindf.head()

Unnamed: 0,39,1,164,62,79,155,367,153,95,50,...,-0.1.1,0.0.31,9.7,-0.7,0.0.32,0.8.4,1.3.2,24.1.1,33.7,1.1.1
0,38,1,160,63,79,0,376,165,0,34,...,-0.4,0.0,10.1,0.0,0.0,0.0,1.5,26.2,37.0,1
1,44,0,178,89,106,183,380,147,94,-2,...,-0.4,-0.4,8.2,-1.3,0.0,0.5,0.8,17.9,24.1,1
2,35,1,162,68,80,156,364,134,116,60,...,-0.1,-0.6,10.2,-1.6,0.0,1.3,1.0,17.0,23.6,1
3,36,0,171,76,90,156,364,170,86,69,...,0.4,-0.6,9.8,-1.3,0.0,0.6,2.7,17.4,46.0,0
4,36,1,158,58,76,136,373,152,80,74,...,0.0,0.0,6.6,-1.0,0.0,0.8,2.5,12.7,32.7,1


In [29]:
# Feature matrices: every column except the last one of each frame.
X, Xnew = (frame.iloc[:, :-1] for frame in (traindf, testdf))
X

Unnamed: 0,39,1,164,62,79,155,367,153,95,50,...,37.5,-0.1.1,0.0.31,9.7,-0.7,0.0.32,0.8.4,1.3.2,24.1.1,33.7
0,38,1,160,63,79,0,376,165,0,34,...,38.2,-0.4,0.0,10.1,0.0,0.0,0.0,1.5,26.2,37.0
1,44,0,178,89,106,183,380,147,94,-2,...,14.9,-0.4,-0.4,8.2,-1.3,0.0,0.5,0.8,17.9,24.1
2,35,1,162,68,80,156,364,134,116,60,...,27.2,-0.1,-0.6,10.2,-1.6,0.0,1.3,1.0,17.0,23.6
3,36,0,171,76,90,156,364,170,86,69,...,71.1,0.4,-0.6,9.8,-1.3,0.0,0.6,2.7,17.4,46.0
4,36,1,158,58,76,136,373,152,80,74,...,38.3,0.0,0.0,6.6,-1.0,0.0,0.8,2.5,12.7,32.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,33,1,160,69,83,174,381,125,83,69,...,39.9,-0.2,0.0,12.3,0.0,0.0,0.4,1.1,34.4,41.2
231,50,0,168,80,95,159,358,166,96,-50,...,0.7,-0.2,0.0,8.1,-6.5,0.0,0.6,0.8,-5.9,0.1
232,35,1,158,60,87,138,383,178,68,61,...,88.6,-0.2,0.0,11.2,0.0,0.0,0.4,3.5,29.1,60.6
233,25,1,160,56,86,162,376,184,106,25,...,79.5,-0.4,0.0,17.2,0.0,0.0,1.3,2.6,48.1,73.0


In [30]:
# Labels: the last column of each frame.
y = traindf.iloc[:, -1]
ynew = testdf.iloc[:, -1]
# Only the last expression of a cell is rendered, so the former bare `y`
# line displayed nothing and has been dropped; the test labels are shown.
ynew

0     1
1     1
2     1
3     1
4     1
5     1
6     0
7     1
8     1
9     1
10    1
11    1
12    0
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    0
21    1
22    1
23    1
24    1
25    0
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    0
37    1
38    1
39    1
40    0
41    1
42    1
43    1
44    1
45    0
46    1
47    1
48    1
49    0
50    1
51    1
52    1
53    1
54    1
55    1
56    0
57    1
Name: 1.2, dtype: int64

<h1> PCA on Arrhythmia Dataset </h1>

In [20]:

# Fit a PCA that keeps every component, then accumulate the explained-variance ratios.
pca = PCA().fit(X)
cumsum = pca.explained_variance_ratio_.cumsum()
# argmax gives the first index where the cumulative ratio reaches 0.95;
# adding one turns that index into a number of components.
d = 1 + np.argmax(cumsum >= 0.95)

In [21]:
# Principal axes of the PCA fitted on the arrhythmia features.
pca.components_

array([[ 4.95754607e-02, -4.30253828e-04,  5.06343573e-03, ...,
        -5.66114789e-03, -1.04118232e-01, -1.60071464e-01],
       [-2.59513774e-02, -2.82899241e-04, -1.49145998e-02, ...,
        -3.44814146e-03, -6.29489392e-02, -9.69498355e-02],
       [-4.61999546e-02, -1.75536520e-03,  1.09602443e-02, ...,
         4.95782868e-03, -5.10892631e-02,  8.48350999e-03],
       ...,
       [-0.00000000e+00, -4.58075418e-16, -1.09124429e-17, ...,
        -4.54870245e-16,  1.46773869e-16,  4.33680869e-18],
       [ 0.00000000e+00,  9.62229428e-17, -8.48238735e-19, ...,
         1.73435078e-16, -6.24635977e-17,  4.77048956e-18],
       [-0.00000000e+00,  2.41587349e-16,  4.48590366e-18, ...,
        -2.90059657e-16, -6.86299975e-17,  5.63785130e-18]])

In [24]:
# Shape of the component matrix: (n_components, n_features).
pca.components_.shape

(235, 235)

<h1> Logistic Regression on Arrhythmia Dataset </h1>

In [26]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the training features and labels.
# fit() returns the estimator itself, so the chained call binds the fitted model.
log_reg = LogisticRegression(random_state=42, solver="liblinear").fit(X, y)
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:

# Predicted class labels for the test features.
log_reg.predict(Xnew)

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1])

In [32]:
# True test labels, shown for comparison with the predictions.
ynew

0     1
1     1
2     1
3     1
4     1
5     1
6     0
7     1
8     1
9     1
10    1
11    1
12    0
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    0
21    1
22    1
23    1
24    1
25    0
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    1
34    1
35    1
36    0
37    1
38    1
39    1
40    0
41    1
42    1
43    1
44    1
45    0
46    1
47    1
48    1
49    0
50    1
51    1
52    1
53    1
54    1
55    1
56    0
57    1
Name: 1.2, dtype: int64

In [35]:
from sklearn.metrics import accuracy_score
# accuracy_score expects (y_true, y_pred): pass the true test labels first.
# Accuracy is symmetric, so the reported value does not change.
accuracy_score(ynew, log_reg.predict(Xnew))

0.9482758620689655