In [1]:
# imports and setup 

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, AgglomerativeClustering

from sklearn import tree, svm, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, KFold
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.cluster import DBSCAN

from sklearn.decomposition import PCA 

import nltk
from nltk.corpus import stopwords

import re

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plotting defaults: 10x10 inch figures drawn with the ggplot style sheet.
plt.rcParams.update({'figure.figsize': (10, 10)})
plt.style.use('ggplot')

In [None]:
# Load the raw table into `df` and display it.
# NOTE(review): ".csv" looks like a placeholder path -- confirm the real file name.
# Later cells expect a column called 'States'; if the file loads it as "Unnamed: 0",
# re-enable the rename below.
#df = df.rename(columns = {"Unnamed: 0": "States"})
csv_path = ".csv"
df = pd.read_csv(csv_path)
df

In [None]:
from matplotlib import cm

# Separate the 'States' label column from the remaining columns.
#
# `df.pop` removes the column from `df` in place, so an unguarded second run of
# this cell would raise KeyError. The guard makes the cell safe to re-run: once
# the split has happened, `df`, `df2` and `states` are left as they are.
if "States" in df.columns:
    # df2: the table indexed by state name (built before the column is removed).
    df2 = df.set_index('States')
    # states: the label Series; after this line `df` no longer has a 'States' column.
    states = df.pop("States")
elif "df2" not in globals() or "states" not in globals():
    # First run and the column is genuinely missing: fail loudly, as the original did.
    raise KeyError("States")

In [None]:
# Clustered heatmap of the state-indexed table: average linkage on euclidean
# distances, Blues colormap, standard_scale=1.
g = sns.clustermap(
    df2,
    method="average",
    metric="euclidean",
    cmap="Blues",
    standard_scale=1,
)

In [None]:
# Build one viridis colour per distinct state label.
# `cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap(name, lut)` is the supported equivalent.
# The number of colours follows the data instead of a hard-coded 50, so every
# label always receives a colour.
unique_states = states.unique()
n_colors = len(unique_states)
viridis = plt.get_cmap('viridis', n_colors)
color_arr = viridis(np.linspace(0, 1, n_colors))

# the color for the labels: state name -> RGBA row
lut = dict(zip(unique_states, color_arr))

# Same index as `states`, so the colours line up with the rows passed below.
row_colors = states.map(lut)

# for method, try "single", "average", "ward"
# for metric, try "correlation", "euclidean", "cityblock"
g = sns.clustermap(df, method="average", metric="euclidean", row_colors=row_colors, cmap="Blues", standard_scale = 1)

In [None]:
# Standardise the feature columns, then fit a PCA that keeps every component.
scaled = scale(df)

pca_model = PCA()
X_PCA = pca_model.fit_transform(scaled)

# Wrap the component scores in a DataFrame. Column names are generated from the
# actual number of components (PC1, PC2, ...) rather than assuming exactly four,
# so the cell does not fail when `df` has a different number of columns.
pc_names = ['PC%d' % k for k in range(1, X_PCA.shape[1] + 1)]
df_pca = pd.DataFrame(X_PCA, columns=pc_names)

In [None]:
fig, ax1 = plt.subplots()

# Only text labels are drawn here, so the axis limits are set explicitly to the
# range of the first two components with a margin of 1 on each side.
pc1, pc2 = X_PCA[:, 0], X_PCA[:, 1]
ax1.set_xlim(pc1.min() - 1, pc1.max() + 1)
ax1.set_ylim(pc2.min() - 1, pc2.max() + 1)

# Plot Principal Components 1 and 2: write each state's name at its (PC1, PC2) position.
# Names are paired with coordinates by position, so this no longer relies on
# `states` having a 0..n-1 index (the old `states[i]` was a label lookup).
for name, x, y in zip(states.values, pc1, pc2):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Proportion of variance explained by each principal component (scree plot).
var_ratio = pca_model.explained_variance_ratio_
print(var_ratio)

# Component numbers 1..n derived from the fitted model, so the plot works for
# any number of components instead of assuming exactly four.
n_components = len(var_ratio)
components = np.arange(1, n_components + 1)

plt.plot(components, var_ratio, '-o')

plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0, n_components + 0.25)
plt.ylim(0, 1.05)
plt.xticks(components)
plt.show()

In [None]:
fig, ax1 = plt.subplots()

scaled = scale(df)

# k-means with a single random initialisation and at most 5 iterations.
# random_state pins that initialisation so the figure is the same on every run
# (previously the clusters could change between executions).
y_pred = KMeans(n_clusters=4, n_init=1, init='random', max_iter=5, random_state=0).fit_predict(scaled)

# The axes are the first two standardised feature columns (not principal components).
x_vals, y_vals = scaled[:, 0], scaled[:, 1]
ax1.scatter(x_vals, y_vals, c=y_pred, cmap='spring')

# Label every point with its state name; pairing is positional, so it does not
# depend on the index of `states`.
for name, x, y in zip(states.values, x_vals, y_vals):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

plt.show()

In [None]:
# Elbow plot: k-means objective for k = 1..14.
# NOTE(review): this fits on the unscaled `df`, while the k-means scatter cells
# cluster scale(df) -- confirm that is intended.
ks = range(1, 15)
scores = []

for k in ks:
    # random_state makes the curve reproducible between runs.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(df)
    # inertia_ is the within-cluster sum of squared distances on the training
    # data, i.e. the quantity previously obtained as -model.score(df).
    scores.append(model.inertia_)

plt.plot(ks, scores)
plt.ylabel('total intra-cluster distance')
plt.xlabel('k')
plt.show()

In [None]:
fig, ax1 = plt.subplots()

scaled = scale(df)

# Four-cluster k-means (one random initialisation, at most 5 iterations).
# random_state fixes the initialisation so re-running the cell reproduces the
# same cluster assignment.
y_pred = KMeans(n_clusters=4, n_init=1, init='random', max_iter=5, random_state=0).fit_predict(scaled)

# Scatter of the first two standardised feature columns, coloured by cluster.
x_vals, y_vals = scaled[:, 0], scaled[:, 1]
ax1.scatter(x_vals, y_vals, c=y_pred, cmap='spring')

# Write each state's name at its point (positional pairing instead of the
# label-based `states[i]` lookup).
for name, x, y in zip(states.values, x_vals, y_vals):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

plt.show()

In [None]:
# Standardise the features and project them onto all principal components.
scaled = scale(df)

pca_model = PCA()
X_PCA = pca_model.fit_transform(scaled)

# create a new pandas dataframe; names follow the real component count
# (PC1, PC2, ...) instead of assuming exactly four.
# (The bare `df_pca` expression that used to follow was mid-cell and displayed nothing.)
pc_names = ['PC%d' % k for k in range(1, X_PCA.shape[1] + 1)]
df_pca = pd.DataFrame(X_PCA, columns=pc_names)

fig, ax1 = plt.subplots()

# Limits: range of the first two components with a margin of 1.
pc1, pc2 = X_PCA[:, 0], X_PCA[:, 1]
ax1.set_xlim(pc1.min() - 1, pc1.max() + 1)
ax1.set_ylim(pc2.min() - 1, pc2.max() + 1)

# Plot Principal Components 1 and 2: state names at their (PC1, PC2) positions,
# paired by position rather than by the `states[i]` label lookup.
for name, x, y in zip(states.values, pc1, pc2):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Standardise, run PCA, then cluster the component scores with k-means.
scaled = scale(df)

pca_model = PCA()
X_PCA = pca_model.fit_transform(scaled)

# create a new pandas dataframe; names follow the real component count.
pc_names = ['PC%d' % k for k in range(1, X_PCA.shape[1] + 1)]
df_pca = pd.DataFrame(X_PCA, columns=pc_names)

fig, ax1 = plt.subplots()

# Axis limits must be set on THIS figure's axes, after X_PCA is computed.
# Previously they were set before plt.subplots(), which modified the axes left
# over from the preceding cell (or raised NameError if none existed).
pc1, pc2 = X_PCA[:, 0], X_PCA[:, 1]
ax1.set_xlim(pc1.min() - 1, pc1.max() + 1)
ax1.set_ylim(pc2.min() - 1, pc2.max() + 1)

# random_state pins the single random initialisation so the plot is reproducible.
y_pred = KMeans(n_clusters=4, n_init=1, init='random', max_iter=5, random_state=0).fit_predict(df_pca)
ax1.scatter(df_pca.iloc[:, 0], df_pca.iloc[:, 1], c=y_pred, cmap='spring')

# Plot Principal Components 1 and 2: label each point with its state name
# (positional pairing instead of the label-based `states[i]`).
for name, x, y in zip(states.values, pc1, pc2):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.show()

In [None]:
from scipy.cluster import hierarchy

In [None]:
# Ward-linkage hierarchical clustering on the (unscaled) feature table.
Z = hierarchy.linkage(df, 'ward') # generate the linkage array

fig, ax1 = plt.subplots()

# Cut the tree so that at most 4 flat clusters are produced.
y_pred = hierarchy.fcluster(Z=Z, t=4, criterion='maxclust')

# Scatter of the first two columns of `df`, coloured by cluster.
x_vals, y_vals = df.iloc[:, 0], df.iloc[:, 1]
ax1.scatter(x_vals, y_vals, c=y_pred, marker="o", cmap='spring')

# Label each point with its state name; names and coordinates are paired by
# position, so this does not depend on `states` having a 0..n-1 index.
for name, x, y in zip(states.values, x_vals, y_vals):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

plt.show()

In [None]:
# Reference view before the hierarchical clustering on PCA scores:
# standardise, fit PCA, and plot the state names in the PC1/PC2 plane.
scaled = scale(df)

pca_model = PCA()
X_PCA = pca_model.fit_transform(scaled)

# create a new pandas dataframe; one 'PCk' column per component actually
# returned, rather than a fixed list of four names.
# (A mid-cell bare `df_pca` expression was removed; it produced no output.)
pc_names = ['PC%d' % k for k in range(1, X_PCA.shape[1] + 1)]
df_pca = pd.DataFrame(X_PCA, columns=pc_names)

fig, ax1 = plt.subplots()

# Limits: range of the first two components with a margin of 1.
pc1, pc2 = X_PCA[:, 0], X_PCA[:, 1]
ax1.set_xlim(pc1.min() - 1, pc1.max() + 1)
ax1.set_ylim(pc2.min() - 1, pc2.max() + 1)

# Plot Principal Components 1 and 2 (names paired with coordinates by position).
for name, x, y in zip(states.values, pc1, pc2):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Standardise, run PCA, then apply Ward hierarchical clustering to the scores.
scaled = scale(df)

pca_model = PCA()
X_PCA = pca_model.fit_transform(scaled)

# create a new pandas dataframe; names follow the real component count.
# (The former mid-cell `df_pca` expression displayed nothing and was dropped.)
pc_names = ['PC%d' % k for k in range(1, X_PCA.shape[1] + 1)]
df_pca = pd.DataFrame(X_PCA, columns=pc_names)

Z = hierarchy.linkage(df_pca, 'ward')

fig, ax1 = plt.subplots()

# Limits: range of the first two components with a margin of 1.
pc1, pc2 = X_PCA[:, 0], X_PCA[:, 1]
ax1.set_xlim(pc1.min() - 1, pc1.max() + 1)
ax1.set_ylim(pc2.min() - 1, pc2.max() + 1)

# Cut the tree so that at most 4 flat clusters are produced, and colour by cluster.
y_pred = hierarchy.fcluster(Z=Z, t=4, criterion='maxclust')
ax1.scatter(df_pca.iloc[:, 0], df_pca.iloc[:, 1], c=y_pred, marker="o", cmap='spring')

# Plot Principal Components 1 and 2: label each point with its state name
# (positional pairing instead of the label-based `states[i]`).
for name, x, y in zip(states.values, pc1, pc2):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# DBSCAN on the (unscaled) feature table.
db_model = DBSCAN(eps=20, min_samples=4)
# fit_predict both fits the model and returns the labels; the separate
# db_model.fit(df) call that preceded it fitted the same model a second time.
y_pred = db_model.fit_predict(df)

fig, ax1 = plt.subplots()

# Scatter of the first two columns of `df`, coloured by DBSCAN label.
x_vals, y_vals = df.iloc[:, 0], df.iloc[:, 1]
ax1.scatter(x_vals, y_vals, c=y_pred, marker="o", cmap='spring')

labels = db_model.labels_

# Label -1 marks noise points, so it is excluded from the cluster count.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# Label each point with its state name (paired by position, not by index label).
for name, x, y in zip(states.values, x_vals, y_vals):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

plt.show()

In [None]:
# Reference view before DBSCAN on PCA scores:
# standardise, fit PCA, and plot the state names in the PC1/PC2 plane.
scaled = scale(df)

pca_model = PCA()
X_PCA = pca_model.fit_transform(scaled)

# create a new pandas dataframe; column names are generated for however many
# components PCA returned instead of a hard-coded list of four.
# (A mid-cell bare `df_pca` expression was removed; it produced no output.)
pc_names = ['PC%d' % k for k in range(1, X_PCA.shape[1] + 1)]
df_pca = pd.DataFrame(X_PCA, columns=pc_names)

fig, ax1 = plt.subplots()

# Limits: range of the first two components with a margin of 1.
pc1, pc2 = X_PCA[:, 0], X_PCA[:, 1]
ax1.set_xlim(pc1.min() - 1, pc1.max() + 1)
ax1.set_ylim(pc2.min() - 1, pc2.max() + 1)

# Plot Principal Components 1 and 2 (names paired with coordinates by position).
for name, x, y in zip(states.values, pc1, pc2):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.show()

In [None]:
# Standardise, run PCA, then apply DBSCAN to the component scores.
scaled = scale(df)

pca_model = PCA()
X_PCA = pca_model.fit_transform(scaled)

# create a new pandas dataframe; names follow the real component count.
pc_names = ['PC%d' % k for k in range(1, X_PCA.shape[1] + 1)]
df_pca = pd.DataFrame(X_PCA, columns=pc_names)

db_model = DBSCAN(eps=1, min_samples=2)
# A single fit_predict call fits the model and returns the labels; the extra
# db_model.fit(df_pca) that used to precede it repeated the same fit.
y_pred = db_model.fit_predict(df_pca)

fig, ax1 = plt.subplots()

# Limits: range of the first two components with a margin of 1.
pc1, pc2 = X_PCA[:, 0], X_PCA[:, 1]
ax1.set_xlim(pc1.min() - 1, pc1.max() + 1)
ax1.set_ylim(pc2.min() - 1, pc2.max() + 1)

ax1.scatter(df_pca.iloc[:, 0], df_pca.iloc[:, 1], c=y_pred, marker="o", cmap='spring')

labels = db_model.labels_

# Label -1 marks noise points, so it is excluded from the cluster count.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# Plot Principal Components 1 and 2: label each point with its state name
# (positional pairing instead of the label-based `states[i]`).
for name, x, y in zip(states.values, pc1, pc2):
    ax1.annotate(name, (x, y), ha='center', fontsize=5)

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.show()