## Principal Component Identification

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
sns.set_style()

In [None]:
# Seeded legacy NumPy generator so the synthetic data below is reproducible.
rnd_num = np.random.RandomState(seed=42)

In [None]:
# 2x2 matrix drawn from the seeded generator; used below to mix the raw draws.
a = rnd_num.rand(2,2)

In [None]:
# 2 x 500 array of normal draws from the same seeded generator.
b=rnd_num.randn(2,500)

In [None]:
# Display the raw draws.
b

In [None]:
# Mix the draws with `a`, then transpose so that each row is one sample.
X = (a @ b).T

In [None]:
# Flip the sign of the first column in place.
# Caution: this cell is not idempotent; running it twice undoes the flip.
X[:, 0] *= -1

In [None]:
# Scatter the two columns of X against each other, with equal axis scaling.
plt.scatter(*X.T);
plt.axis('equal');

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a two-component PCA on X; the trailing expression shows the fitted estimator.
pca = PCA(n_components=2).fit(X)
pca

In [None]:
# Principal axes of the fitted PCA.
print(pca.components_)

In [None]:
# Share of the total variance attributed to each component.
pca.explained_variance_ratio_

In [None]:
# Overlay the principal axes on the data cloud.
plt.scatter(X[:,0], X[:,1], alpha=0.3)

# Loop-invariant pieces are built once rather than once per component.
ax = plt.gca()
arrowprops = dict(arrowstyle='<-', linewidth=1, shrinkA=0, shrinkB=0)

for k, v in zip(pca.explained_variance_, pca.components_):
    # Scale each axis direction by three times the square root of its variance.
    vec = v * 3 * np.sqrt(k)
    ax.annotate('', pca.mean_, pca.mean_ + vec, arrowprops=arrowprops)

# Hand-placed labels for this particular dataset; drawn once, outside the loop.
ax.text(-0.90, 1.2,'PC1',ha='center',va='center', rotation=-42, size=12)
ax.text(-0.1, -0.6,'PC2',ha='center',va='center', rotation=50, size=12)

plt.axis('equal');

## Dimensionality Reduction with PCA

In [None]:
# Refit PCA keeping a single component, then project X onto it.
pca = PCA(n_components=1).fit(X)
X_pca = pca.transform(X)

In [None]:
# Shape before reduction.
X.shape

In [None]:
# Shape after projecting onto the single retained component.
X_pca.shape

In [None]:
# Map the 1-D projection back into the original space and draw it over the
# original points (faint) for comparison.
X_new = pca.inverse_transform(X_pca)
for points, opacity in ((X, 0.2), (X_new, 0.8)):
    plt.scatter(points[:, 0], points[:, 1], alpha=opacity);
plt.axis('equal');

In [None]:
# Display the projected coordinates.
X_pca

# Project Wine

In [None]:
# Load the wine data; header=None because the file is read without a header row.
df = pd.read_csv("../wine/wine.data.csv", header=None)

In [None]:
# Class, Alcohol, Malic Acid, Ash, Alcalinity of ash, Magnesium, Total phenols, Flavanoids, Nonflavanoid phenols, Proanthocyanins, Color intensity, Hue, OD280/OD315 of diluted wines, Proline  


In [None]:
# Peek at the first rows.
df.head()

In [None]:
# Name the columns: the class label first, then the measurements.
df.columns = [
    'Class',
    'Alcohol',
    'Malic Acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [None]:
# Features: every column except the first.
X = df.iloc[:,1:]

In [None]:
# Target: the 'Class' column.
y = df['Class']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted in a later cell and reused further down the notebook.
sc = StandardScaler()

In [None]:
# Learn the scaling from the training split only, then apply it to both splits.
sc.fit(X_train)
X_train_sc, X_test_sc = (sc.transform(part) for part in (X_train, X_test))

In [None]:
# Keep every component (n_components is left at its default of None).
pca = PCA()

In [None]:
# Fit the PCA on the scaled TRAINING split. The test split is held out and
# must not be used to learn the components.
pca.fit_transform(X_train_sc)

In [None]:
# Share of the total variance attributed to each component.
pca.explained_variance_ratio_

In [None]:
# Component weights rounded to 3 decimals, with columns labelled by feature name.
pd.DataFrame(pca.components_.round(3), columns=X.columns)

In [None]:
# Running total of the explained-variance ratios.
pca.explained_variance_ratio_.cumsum()

In [None]:
# Cumulative explained variance, expressed as a percentage.
cum_var_pct = pca.explained_variance_ratio_.cumsum() * 100
plt.plot(cum_var_pct)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

# Abalone

In [None]:
# Load the abalone data; header=None because the file is read without a header row.
# Note: this rebinds `df`, which previously held the wine frame.
df = pd.read_csv("../abalone/abalone.data", header=None)

In [None]:
# Peek at the first rows.
df.head()

In [None]:
# Name the columns: 'Sex' first, the measurements, and 'Rings' last.
df.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]

In [None]:
# Inspect the raw values of the 'Sex' column before encoding.
df['Sex'].values

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Step 1: map the 'Sex' labels to integer codes. LabelEncoder orders its
# classes by sorted label, and that order is kept in `le.classes_`.
lencoded = df['Sex'].values
le = LabelEncoder()
iencoded = le.fit_transform(lencoded)
iencoded = iencoded.reshape(len(iencoded), 1)  # the one-hot encoder expects a 2-D column

# Step 2: one-hot encode the integer codes. The encoder's default sparse
# output is densified with .toarray(), which avoids the `sparse` /
# `sparse_output` keyword that differs between scikit-learn versions.
ohe = OneHotEncoder(handle_unknown='ignore')
oencoded = ohe.fit_transform(iencoded).toarray()

# Step 3: wrap the result in a DataFrame whose columns are named after the
# original labels, in the same order as the integer codes.
edf = pd.DataFrame(oencoded, columns=list(le.classes_))


In [None]:
# Place the one-hot columns side by side with (and in front of) the original columns.
tmp = pd.concat([edf, df],axis=1)

In [None]:
# Drop the raw 'Sex' column now that its one-hot columns are present, then display.
tmp = tmp.drop(columns=['Sex'])
tmp

In [None]:
# Build the feature frame from `tmp`, the one-hot encoded frame assembled
# above. (`df1` is only created much further down, so it is not available
# here on a fresh top-to-bottom run.)
# NOTE(review): iloc[:, 1:] skips the first column of `tmp`, i.e. the first
# one-hot indicator - confirm that dropping it is intended.
X = tmp.iloc[:, 1:]

In [None]:
# Standardize the abalone features. This re-fits the shared `sc` scaler,
# replacing what it learned from the wine training split.
X_sc = sc.fit_transform(X)

In [None]:
# Keep every component (n_components is left at its default of None).
pca = PCA()

In [None]:
# Fit the PCA on the standardized features and display the projected data.
pca.fit_transform(X_sc)

In [None]:
# Share of the total variance attributed to each component.
pca.explained_variance_ratio_

In [None]:
# Component weights rounded to 3 decimals, with columns labelled by feature name.
pd.DataFrame(pca.components_.round(3), columns=X.columns)

In [None]:
# Running total of the explained-variance ratios.
pca.explained_variance_ratio_.cumsum()

In [None]:
# Cumulative explained variance, expressed as a percentage.
cum_var_pct = pca.explained_variance_ratio_.cumsum() * 100
plt.plot(cum_var_pct)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

In [None]:
# Take the component matrix of the PCA fitted above (one row per component,
# one column per input feature). The cells below label the rows 'PCA_k' and
# the columns with X.columns, which is exactly this layout. The wine matrix
# `X_train_sc` belongs to a different dataset and cannot be passed through
# this PCA.
res = pca.components_

In [None]:
# One row label per row of `res`: 'PCA_0', 'PCA_1', ...
index_name = ['PCA_' + str(k) for k in range(len(res))]

In [None]:
# Label rows and columns, then keep only the first four rows.
df1 = pd.DataFrame(res, columns=X.columns, index=index_name).iloc[:4]

In [None]:
# Transpose so each feature is a row, then order the features by their 'PCA_0' value.
df1.T.sort_values(by='PCA_0')

## Abalone 

In [None]:
# Reload the abalone data from scratch for a second pass.
df = pd.read_csv("../abalone/abalone.data", header=None)

In [None]:
# Name the columns: 'Sex' first, the measurements, and 'Rings' last.
df.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]

In [None]:
# Features: every column except the first ('Sex'), so no encoding is needed here.
X = df.iloc[:, 1:]

In [None]:
# Standardize the features; this re-fits the shared `sc` scaler on this frame.
X_sc = sc.fit_transform(X)

In [None]:
# Keep every component (n_components is left at its default of None).
pca = PCA()

In [None]:
# Fit the PCA on the standardized features and display the projected data.
pca.fit_transform(X_sc)

In [None]:
# Share of the total variance attributed to each component.
pca.explained_variance_ratio_


In [None]:
# Component weights rounded to 3 decimals, with columns labelled by feature name.
pd.DataFrame(pca.components_.round(3), columns=X.columns)

In [None]:
# Cumulative explained variance, expressed as a percentage.
cum_var_pct = pca.explained_variance_ratio_.cumsum() * 100
plt.plot(cum_var_pct)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

# Kernel PCA

In [None]:
from sklearn.datasets import make_circles

In [None]:
from sklearn.decomposition import PCA, KernelPCA

In [None]:
# Seed NumPy's global generator so the sampled data below is reproducible.
np.random.seed(seed=0)

In [None]:
# 400 noisy points from make_circles, with their 0/1 labels in y.
X, y = make_circles(
    n_samples=400,
    factor=0.3,
    noise=0.05,
)

In [None]:
# Display the generated coordinates.
X

In [None]:
# Quick look at the raw points, without class colouring.
plt.scatter(X[:,0], X[:,1])

In [None]:
# RBF-kernel PCA; fit_inverse_transform=True makes inverse_transform available.
kpca = KernelPCA(
    kernel="rbf",
    gamma=10,
    fit_inverse_transform=True,
)

In [None]:
# Kernel-PCA projection, and its mapping back to the input space.
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
# Plain linear PCA on the same data, for comparison.
pca=PCA()
X_pca=pca.fit_transform(X)

In [None]:
# Display the class labels.
y

In [None]:
# (Interactive help lookup for plt.scatter removed; it was development
# scaffolding and is not part of the analysis.)

In [None]:
# Boolean masks for the two label values; the projection plots below reuse them.
reds = y == 0
blues = y == 1

# Label through the Axes methods. Assigning a string to plt.xlabel /
# plt.ylabel would only rebind those pyplot functions and set no label.
fig, ax = plt.subplots()
ax.set_title("Original Space")
ax.set_xlabel("$x_1$")
ax.set_ylabel("$x_2$")
ax.scatter(X[reds,0], X[reds,1], c="red", s=20, edgecolors="k")
ax.scatter(X[blues,0], X[blues,1], c="blue", s=20, edgecolors="k");


In [None]:
# Linear-PCA projection, coloured by class.
# Label through the Axes methods. Assigning a string to plt.xlabel /
# plt.ylabel would only rebind those pyplot functions and set no label.
fig, ax = plt.subplots()
ax.set_title("Projection by PCA")
ax.set_xlabel("1st Component")
ax.set_ylabel("2nd Component")
ax.scatter(X_pca[reds,0], X_pca[reds,1], c="red", s=20, edgecolors="k")
ax.scatter(X_pca[blues,0], X_pca[blues,1], c="blue", s=20, edgecolors="k");

In [None]:
# Kernel-PCA projection, coloured by class.
# Label through the Axes methods. Assigning a string to plt.xlabel /
# plt.ylabel would only rebind those pyplot functions and set no label.
fig, ax = plt.subplots()
ax.set_title("Projection by KPCA")
ax.set_xlabel("1st Component")
ax.set_ylabel("2nd Component")
ax.scatter(X_kpca[reds,0], X_kpca[reds,1], c="red", s=20, edgecolors="k")
ax.scatter(X_kpca[blues,0], X_kpca[blues,1], c="blue", s=20, edgecolors="k");

# Kernel PCA IRIS

In [None]:
# Load iris and keep only the rows whose species is not 'setosa'.
iris = sns.load_dataset('iris')
not_setosa = iris['species'] != 'setosa'
df = iris[not_setosa]

In [None]:
# Two petal measurements as features.
col = ['petal_length', 'petal_width']
X = df.loc[:, col]

# Numeric class label. `df` is a filtered subset of the iris frame, so the
# new column is added with `assign` (which returns a new frame) instead of
# writing into the subset, which raises pandas' SettingWithCopyWarning.
species_to_num = {'versicolor': 0, 'virginica': 1}
df = df.assign(tmp=df['species'].map(species_to_num))
y = df['tmp']

In [None]:
# Two-component RBF-kernel PCA of the petal features.
kpca = KernelPCA(kernel='rbf', n_components=2)
X_kpca = kpca.fit_transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on the raw petal features; the trailing expression
# shows the fitted estimator.
clf = LogisticRegression().fit(X, y)
clf

In [None]:
# Build the evaluation grid for the decision-region plot.
# The horizontal axis spans the first feature and the vertical axis spans the
# second feature (not the 0/1 labels), each padded by 0.5 so that every
# plotted point lies inside the grid.
Xv = X.values
h = 0.02  # grid step
x_min, x_max = Xv[:, 0].min() - 0.5, Xv[:, 0].max() + 0.5
y_min, y_max = Xv[:, 1].min() - 0.5, Xv[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

In [None]:
# Classify every grid node, then draw the predicted regions under the data.
grid = np.column_stack((xx.ravel(), yy.ravel()))
z = clf.predict(grid).reshape(xx.shape)
fig = plt.figure(figsize=(8,8))
ax = plt.contourf(xx, yy, z, cmap='afmhot', alpha=0.3)
plt.scatter(X.values[:,0], X.values[:,1], c=y, s=90, alpha=0.9, edgecolors='g')

In [None]:
# Refit the same classifier, this time on the kernel-PCA coordinates.
clf.fit(X_kpca, y)

In [None]:
# Build the evaluation grid in kernel-PCA space.
# Each axis spans its own component (the vertical axis follows the second
# component, not the 0/1 labels), padded by 0.5 on both sides.
Xv = X_kpca
h = 0.02  # grid step
x_min, x_max = Xv[:, 0].min() - 0.5, Xv[:, 0].max() + 0.5
y_min, y_max = Xv[:, 1].min() - 0.5, Xv[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

In [None]:
# Classify every grid node, then draw the predicted regions under the
# kernel-PCA coordinates.
grid = np.column_stack((xx.ravel(), yy.ravel()))
z = clf.predict(grid).reshape(xx.shape)
fig = plt.figure(figsize=(10,8))
ax = plt.contourf(xx, yy, z, cmap='afmhot', alpha=0.3)
plt.scatter(X_kpca[:,0], X_kpca[:,1], c=y, s=80, alpha=0.9, edgecolors='g')

# Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Linear discriminant analysis on the raw petal features; the trailing
# expression shows the fitted estimator.
clf = LinearDiscriminantAnalysis().fit(X, y)
clf

In [None]:
# Build the evaluation grid for the decision-region plot.
# The horizontal axis spans the first feature and the vertical axis spans the
# second feature (not the 0/1 labels), each padded by 0.5 so that every
# plotted point lies inside the grid.
Xv = X.values
h = 0.02  # grid step
x_min, x_max = Xv[:, 0].min() - 0.5, Xv[:, 0].max() + 0.5
y_min, y_max = Xv[:, 1].min() - 0.5, Xv[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

In [None]:
# Classify every grid node, then draw the predicted regions under the data.
grid = np.column_stack((xx.ravel(), yy.ravel()))
z = clf.predict(grid).reshape(xx.shape)
fig = plt.figure(figsize=(8,8))
ax = plt.contourf(xx, yy, z, cmap='afmhot', alpha=0.3)
plt.scatter(X.values[:,0], X.values[:,1], c=y, s=90, alpha=0.9, edgecolors='g')