### Principal Component Analysis 
German: Hauptkomponentenanalyse
Idea: Reduce dimensions by finding new dimensions with maximal variance.

Dataset: https://www.kaggle.com/uciml/human-activity-recognition-with-smartphones/kernels

In [None]:
# import
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training CSV (bz2-compressed; pandas infers the compression from the file extension)
df = pd.read_csv(
    "../res/pca_train.csv.bz2",
)
# Preview the first five rows
df.head(n = 5)

In [None]:
# Prepare data: x holds every column except "subject" and "Activity",
# y holds the "Activity" column
x = df.drop(columns = ["subject", "Activity"])
y = df["Activity"]

# Scale data: fit the scaler on x and replace x with its scaled version
s = StandardScaler()
x = s.fit_transform(x)

In [None]:
# Decomposition using PCA (2D)
# fit() returns the fitted estimator, so construction and fitting can be chained
p = PCA(n_components = 2).fit(x)

# Project the scaled data onto the two fitted components
x_transformed = p.transform(x)

In [None]:
# Visualize data
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(figsize = (16, 8))

for activity in y.unique():
    x_transformed_filtered = x_transformed[ y == activity, :]
    plt.scatter(x_transformed_filtered[:,0], x_transformed_filtered[:, 1], 
                label = activity, s = 2.5)

plt.legend()
plt.show()

In [None]:
# Show all distinct values of y (the "Activity" column);
# as the last expression of the cell, the result is displayed as the cell output
y.unique()

In [None]:
# Decomposition using PCA (3D)
p = PCA(n_components = 3)

# Fit on the scaled data, then project the same data onto the three fitted components
x_transformed = p.fit(x).transform(x)

In [None]:
# Visualize data
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize = (16, 8))
ax = fig.add_subplot(111, projection='3d')

for activity in y.unique():
    x_transformed_filtered = x_transformed[ y == activity, :]
    ax.scatter(
        x_transformed_filtered[:,0], 
        x_transformed_filtered[:, 1], 
        x_transformed_filtered[:, 2], 
        label = activity, 
        s = 4.0)

plt.legend()
plt.show()

In [None]:
# PCA for data compression
# Load the train and the test CSV (in that order) into separate frames
df_train, df_test = (
    pd.read_csv(path)
    for path in ("../res/pca_train.csv.bz2", "../res/pca_test.csv.bz2")
)

# Preview the first five rows of the training frame
df_train.head(n = 5)

In [None]:
# Prepare data: the features are every column except "subject" and "Activity";
# the "Activity" column is kept separately as y
x_train = df_train.drop(columns = ["subject", "Activity"])
y_train = df_train["Activity"]

x_test = df_test.drop(columns = ["subject", "Activity"])
y_test = df_test["Activity"]

# Scale data
# The scaler is fitted on the training features only and then applied unchanged to
# the test features, so both are scaled with the training statistics.
# (Previously this cell re-scaled the unrelated variable `x` from the visualization
# section and left x_train / x_test unscaled for the PCA fitted in the next cell.)
s = StandardScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)

In [None]:
# Fit PCA (no n_components given)
p = PCA().fit(x_train)

# Show how much variance is covered by the first 50 components
covered_ratio = p.explained_variance_ratio_[:50].sum()
print("First 50 dimensions have %.2f percent of variance" % (covered_ratio * 100))

In [None]:
# What does the first component consist of?
# Displays the first row of the fitted PCA's components_ array
p.components_[0]

In [None]:
# PCA for data compression
# Read the train and the test CSV again (same files as in the cell further up)
df_train, df_test = (
    pd.read_csv("../res/pca_train.csv.bz2"),
    pd.read_csv("../res/pca_test.csv.bz2"),
)

In [None]:
# Prepare data: features are every column except "subject" and "Activity";
# the "Activity" column is kept separately as y
x_train = df_train.drop(columns = ["subject", "Activity"])
x_test = df_test.drop(columns = ["subject", "Activity"])

y_train = df_train["Activity"]
y_test = df_test["Activity"]

# Scale data: fit the scaler on the training features only, then apply the
# same fitted scaler to the test features
s = StandardScaler()
x_train = s.fit_transform(x_train)
x_test = s.transform(x_test)

In [None]:
# Convert data using PCA dimensions
# fit_transform() fits the PCA on the training data and projects it in one step, so
# the separate fit() call that preceded it was redundant and has been removed.
# The test data is projected with the components fitted on the training data.
p = PCA(n_components = 50)

x_train_transformed = p.fit_transform(x_train)
x_test_transformed = p.transform(x_test)

# Fit LogisticRegression on PCA transformed data
clf = LogisticRegression(solver = 'newton-cg', max_iter = 100)
clf.fit(x_train_transformed, y_train)

# The score is computed on the test split, so it is reported as a test score
# (the message previously called it a training score)
print("Test score on 50 dimensions (PCA) : %f" % clf.score(x_test_transformed, y_test))

In [None]:
# Logistic Regression without PCA: baseline trained on all scaled features
clf = LogisticRegression(solver = 'newton-cg', max_iter = 100)
clf.fit(x_train, y_train)

# The score is computed on the test split and no PCA is involved here, so the message
# says so (it previously read "Training score ... (PCA)"). The number of dimensions is
# taken from the data instead of being hard-coded.
print("Test score on %d dimensions (without PCA) : %f"
      % (x_train.shape[1], clf.score(x_test, y_test)))