In [None]:
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import mnist

In [None]:
# Pristine copies of the MNIST arrays as returned by the `mnist` package.
# These names are never reassigned later, so they stay available for
# displaying raw images and for the raw-pixel baseline model.
train_image, train_labels = mnist.train_images(), mnist.train_labels()
test_image, test_labels = mnist.test_images(), mnist.test_labels()

In [None]:
# Working copies of the same data: these names are flattened, standardized
# and PCA-projected in the cells that follow.
train_img, train_lbl = mnist.train_images(), mnist.train_labels()
test_img, test_lbl = mnist.test_images(), mnist.test_labels()

In [None]:
# Flatten every training image into one row of 784 values (one sample per row).
train_img = train_img.reshape(-1, 784)

In [None]:
# Check the flattened training array: expected to be 2-D with 784 columns.
train_img.shape

In [None]:
# Shape of the training labels loaded from mnist.train_labels().
print(train_lbl.shape)

In [None]:
# Shape of the test images before they are flattened in the next cell.
print(test_img.shape)

In [None]:
# Flatten every test image into one row of 784 values, matching the training layout.
test_img = test_img.reshape(-1, 784)

In [None]:
# Check the flattened test array: expected to be 2-D with 784 columns.
test_img.shape

In [None]:
# Shape of the test labels loaded from mnist.test_labels().
print(test_lbl.shape)



Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data, especially if the features were measured on different scales.

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (zero mean and unit variance).

A notebook illustrating the importance of feature scaling: http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only, so that nothing
# about the test set leaks into preprocessing. `fit` returns the scaler itself.
scaler = StandardScaler().fit(train_img)

# Standardize both splits with the statistics learned on the training split.
# Both names are rebound to the scaled arrays.
train_img, test_img = scaler.transform(train_img), scaler.transform(test_img)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# A float in (0, 1) asks PCA to keep the smallest number of components whose
# cumulative explained variance reaches that fraction (here 75%).
pca = PCA(n_components=0.75)

In [None]:
# Fit PCA on the standardized training images only; the test set is projected
# later with this same fitted transform.
pca.fit(train_img)

In [None]:
# Number of principal components PCA actually kept for the requested variance fraction.
pca.n_components_

In [None]:
# Project both splits onto the principal components learned from the training set.
# NOTE: this rebinds train_img/test_img to the reduced arrays, so the cell is
# not re-runnable without re-executing the preprocessing cells above first.
train_img, test_img = pca.transform(train_img), pca.transform(test_img)

In [None]:
# First training sample from the untouched raw copy (`train_image`), for comparison
# with its processed counterpart in `train_img`.
train_image[0]

In [None]:
# The same sample after flattening, standardization and PCA projection.
train_img[0]

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Classifier trained on the PCA-reduced features.
# Every later cell (fit / predict / score) refers to this estimator as
# `logisticRegr`, so bind that name here; previously only `logisticRegr_pca`
# was defined and the following cells raised NameError on a fresh kernel.
logisticRegr = LogisticRegression(solver='lbfgs')
# Keep the original name as an alias for anything that still references it.
logisticRegr_pca = logisticRegr

In [None]:
# Train the classifier on the PCA-projected training images and their labels.
logisticRegr.fit(train_img, train_lbl)

In [None]:
# test_img[0].shape

In [None]:
# test_img[0].reshape(1,-1).shape

In [None]:
# Predict the label of a single observation (the first test image).
# `predict` expects a 2-D input, so a length-1 slice keeps the sample as one row.
# The result is a NumPy array.
logisticRegr.predict(test_img[:1])

In [None]:
# Predict labels for the first ten test observations in one call.
logisticRegr.predict(test_img[:10])

In [None]:
# Shape of a single raw (unflattened) training image, index 11.
train_image[11].shape

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Render one raw training image as a grayscale bitmap.
# NOTE: despite its name, `first_image` holds the sample at index 11, not index 0.
first_image = train_image[11]
plt.imshow(first_image, cmap='gray')
plt.show()

In [None]:
# train_img[11]

In [None]:
# import numpy as np
# np.pad(train_img[11], (28, 28), 'constant')

In [None]:
# Shape of the PCA-projected test set that is scored in the next cell.
test_img.shape

In [None]:
# Mean accuracy of the PCA-based classifier on the projected test set.
score = logisticRegr.score(test_img, test_lbl)
print(score)

In [None]:
# Baseline: the same kind of classifier trained on the raw, flattened 784-pixel
# images (no scaling, no PCA) for comparison with the PCA-based model.
# A fresh estimator is used on purpose: `fit` returns the estimator itself, so
# calling `logisticRegr.fit(...)` here would overwrite the PCA-trained model
# and leave both names pointing at the same object.
logistic_original_dataset = LogisticRegression(solver='lbfgs').fit(
    train_image.reshape(-1, 784), train_labels
)

In [None]:
# Mean accuracy of the raw-pixel model on the raw, flattened test images.
logistic_original_dataset.score(test_image.reshape(-1,784), test_labels)

In [None]:
# reference : 
# https://github.com/mGalarnyk/Python_Tutorials/blob/master/Sklearn/PCA/PCA_to_Speed-up_Machine_Learning_Algorithms.ipynb
# https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [None]:
# Location of the UCI Iris data file that is read into a DataFrame in the next cell.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

In [None]:
# Read the Iris data into a pandas DataFrame; column names are supplied
# explicitly through `names`.
df = pd.read_csv(
    url,
    names=['sepal length', 'sepal width', 'petal length', 'petal width', 'target'],
)

In [None]:
# Preview the first rows of the loaded Iris frame.
df.head()

In [None]:
# The four measurement columns that feed PCA (everything except the label).
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Feature matrix as a plain array, one row per flower.
x = df[features].values

In [None]:
# Label column as a 2-D array (a single column), kept separate from the features.
y = df[['target']].values

In [None]:
# Standardize the four feature columns; this rebinds `x` to the scaled array.
x = StandardScaler().fit_transform(x)

In [None]:
# Preview the standardized features with their column names restored.
pd.DataFrame(x, columns=features).head()

In [None]:
# Two components so the Iris data can be drawn on a 2-D scatter plot.
# NOTE: this rebinds `pca`, replacing the PCA object fitted on MNIST earlier in the notebook.
pca = PCA(n_components=2)

In [None]:
# Fit PCA on the standardized Iris features and project them in one step.
principalComponents = pca.fit_transform(x)

In [None]:
# Wrap the two projected coordinates in a DataFrame with descriptive column names.
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Preview the first five projected rows (five is the default for `head`).
principalDf.head()

In [None]:
# Preview the label column that is attached to the projected coordinates next.
df[['target']].head()

In [None]:
# Place the species label next to the two principal components, aligned on the row index.
finalDf = pd.concat((principalDf, df[['target']]), axis='columns')
finalDf.head()

In [None]:
from collections import Counter

# `target` is otherwise only bound by the plotting loop further down, so on a
# fresh kernel this cell raised NameError. Bind it explicitly here; the value
# chosen is the one that loop leaves behind when it finishes.
target = 'Iris-virginica'

# Boolean mask selecting the rows of the chosen species, then a tally of how
# many rows match (True) and how many do not (False).
indicesToKeep = finalDf['target'] == target
Counter(indicesToKeep)

In [None]:
# Display the species name currently bound to `target` (the one used for the mask above).
target

In [None]:
# Scatter plot of the Iris samples in the plane of the first two principal
# components, one colour per species.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 Component PCA', fontsize=20)

targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for target, color in zip(targets, colors):
    # Rows belonging to the current species.
    indicesToKeep = finalDf['target'] == target
    points = finalDf.loc[indicesToKeep]
    ax.scatter(
        points['principal component 1'],
        points['principal component 2'],
        c=color,
        s=50,
    )
# Legend entries follow the order in which the species were drawn.
ax.legend(targets)
ax.grid()