In [None]:
### Housekeeping ###
import os

### Data ###
import numpy as np
import pandas as pd
import rpy2.robjects as robjects
import scipy.io

### Visualization ###
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

### Machine Learning ###
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
# Location of the RData file: <home directory>/Downloads/<file name>.
# NOTE(review): `dir` shadows the Python builtin and holds a file path rather
# than a directory; the name is kept because the next cell reads it.
downloads_folder = os.path.join(os.path.expanduser('~'), 'Downloads')
dir = os.path.join(downloads_folder, 'pbl_medip_counts_matrix_blacklist_filtered.RData')
dir

In [None]:
# Look up R's `load` function through rpy2 and call it on the RData path,
# presumably so that the objects stored in the file can be fetched with
# `robjects.r[...]` in the following cells.
r_load = robjects.r['load']
r_load(dir)

In [None]:
# Fetch the object named 'matri' from the R session and display it.
# NOTE(review): 'matri' looks like it could be a truncated 'matrix' — confirm
# the object name stored in the RData file before changing it.
matrix = robjects.r['matri']
matrix

In [None]:
# Copy the R object into a NumPy array (copy=True is np.array's default,
# spelled out here to make the copy explicit).
np_array = np.array(matrix, copy=True)
np_array

In [None]:
# Label the array's columns with the R column names, transpose so that each R
# column becomes one DataFrame row, and keep only the first 72 rows.
# NOTE(review): 72 is a magic number — presumably the number of samples of
# interest; confirm and consider naming it as a constant.
df = pd.DataFrame(np_array, columns=matrix.colnames).T[:72]
df

In [None]:
# Feature matrix: the DataFrame's contents as a plain NumPy array
# (one row per DataFrame row).
X = df.to_numpy()
X

In [None]:
# Row labels of `df` (taken from the R column names when `df` was built),
# extracted as an array so they can be re-attached after the PCA.
row_index = df.index
y = row_index.values
y

In [None]:
# Standardize X with scikit-learn's StandardScaler before running the PCA.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X

In [None]:
# Project the standardized matrix onto its first three principal components.
# random_state is fixed so that, if scikit-learn selects a randomized SVD
# solver for this input, the components are identical on every run.
pca = PCA(n_components=3, random_state=42)
principal_components = pca.fit_transform(scaled_X)

# Name the component columns 'PC1', 'PC2', ...: the plotting cells further down
# look the columns up by those names, and leaving the default integer labels
# would make the later `{0: 'sample'}` rename overwrite the first component.
pc_labels = ['PC' + str(i) for i in range(1, principal_components.shape[1] + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df

In [None]:
# Attach the sample labels to the principal-component scores.
# The label column is named 'sample' when it is created: renaming column 0
# after the concat would rename *every* column labelled 0, including a
# component column if the components carry integer labels.
sample_labels = pd.Series(y, name='sample')
final_df = pd.concat([principal_df, sample_labels], axis=1)
final_df

In [None]:
# 2D view of the first two principal components, one colour per 'sample' value.
# Marker appearance: small points with a thin dark outline.
marker_style_2d = {
    'size': 5,
    'line': {'width': 1, 'color': 'DarkSlateGrey'},
}

fig = px.scatter(
    final_df,
    x='PC1',
    y='PC2',
    color='sample',
    width=1000,
    height=800,
    opacity=1,
)
fig.update_traces(marker=marker_style_2d)

fig.show()

In [None]:
# 3D view of the three principal components, one colour per 'sample' value.
# Marker appearance: small points with a thin dark outline.
marker_style_3d = {
    'size': 5,
    'line': {'width': 1, 'color': 'DarkSlateGrey'},
}

fig = px.scatter_3d(
    final_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='sample',
    width=1000,
    height=800,
    opacity=1,
)
fig.update_traces(marker=marker_style_3d)

fig.show()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Clear the name of final_df's row index, in place.
# NOTE(review): final_df is built by a concat of frames created without an
# explicit index, so the index is likely unnamed already and this call is
# probably a no-op — confirm before removing.
final_df.rename_axis(None, inplace=True)
final_df

In [None]:
# Preview of the values left once the 'sample' label column is removed;
# the clustering cells below pass this same expression to KMeans.
final_df.drop(labels=['sample'], axis='columns').values

In [None]:
# Elbow plot: fit k-means for k = 1..9 and record the inertia of each fit.
# The feature matrix does not depend on k, so it is built once outside the
# loop. 'cluster' is excluded too (when present) so that labels written by the
# final clustering cell are never fed back in as a feature on a re-run.
elbow_features = final_df.drop(columns=['sample', 'cluster'], errors='ignore').values

inertias = []
ks = range(1,10)

for k in ks:
    # Fixed seed, matching the final KMeans fit, so the curve is reproducible.
    km_test = KMeans(n_clusters=k, random_state=42, n_init=10).fit(elbow_features)
    inertias.append(km_test.inertia_)

plt.plot(ks, inertias)
plt.xlabel('k cluster number')
plt.ylabel('inertia');

In [None]:
# Final clustering of the principal-component scores into k groups.
# NOTE(review): k = 2 is presumably read off the elbow plot — confirm.
k = 2

# Drop the label column and, if this cell has already been run once, the
# 'cluster' column it added; otherwise the previous assignments would be used
# as an extra feature and re-running the cell would change its input.
cluster_features = final_df.drop(columns=['sample', 'cluster'], errors='ignore').values

kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(cluster_features)
cluster_assignments = kmeans.labels_
final_df['cluster'] = cluster_assignments
final_df

In [None]:
# Scatter of the first two components, coloured by the k-means cluster label.
pc1_scores = final_df['PC1']
pc2_scores = final_df['PC2']
plt.scatter(pc1_scores, pc2_scores, c=final_df['cluster'])
plt.show()

In [None]:
# Scree plot. explained_variance_ratio_ holds fractions, so it is scaled by 100
# to agree with the '%' in the y-axis label, and the components are numbered
# from 1 instead of the default 0-based positions.
explained_pct = pca.explained_variance_ratio_ * 100
component_numbers = np.arange(1, len(explained_pct) + 1)

plt.plot(component_numbers, explained_pct)
plt.xticks(component_numbers)
plt.xlabel('Principal Component'); plt.ylabel('% explained variance');

In [None]:
# Explained-variance ratio of each fitted component, scaled by 100.
100 * pca.explained_variance_ratio_

In [None]:
# Sum of the explained-variance ratios of the fitted components, i.e. the
# combined ratio accounted for by the components kept by the PCA above.
sum(pca.explained_variance_ratio_)

In [None]:
# Cumulative explained-variance ratio. The x axis counts how many components
# are used, so it must start at 1: plotting against the default positions
# would place the one-component value at x = 0.
cumulative_share = np.cumsum(pca.explained_variance_ratio_)
components_used = np.arange(1, len(cumulative_share) + 1)

plt.plot(components_used, cumulative_share)
plt.xticks(components_used)
plt.ylim(ymin=0)
plt.title('cumulated share of explained variance')
plt.xlabel('# of principal component used');