# Unsupervised Learning in Python

## Clustering methods

### k-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Build a 3-cluster k-means model and fit it to points (fit() returns the model itself)
model = KMeans(n_clusters=3).fit(points)

# Assign each row of new_points to one of the fitted clusters
labels = model.predict(new_points)

Centroids:

In [None]:
# Cluster centres learned by the fitted model
centroids = model.cluster_centers_

# Columns 0 and 1 of the centres, kept as separate arrays for plotting
centroids_x, centroids_y = centroids[:, 0], centroids[:, 1]

Plotting:

In [None]:
# matplotlib is required by this cell and nothing above it imports it
import matplotlib.pyplot as plt

# Columns 0 and 1 of the new points are the x and y plotting coordinates
xs = new_points[:, 0]
ys = new_points[:, 1]

# Scatter the points, coloured by the cluster label assigned to each one
plt.scatter(xs, ys, c=labels, alpha=0.5)

# Assign the columns of centroids: centroids_x, centroids_y
centroids_x = centroids[:, 0]
centroids_y = centroids[:, 1]

# Overlay the cluster centres as diamond markers on the same axes
plt.scatter(centroids_x, centroids_y, marker='D', s=50)
plt.show()

#### Evaluation k

Plot:

In [None]:
# matplotlib is required by this cell and nothing above it imports it
import matplotlib.pyplot as plt

# Candidate numbers of clusters to compare
ks = range(1, 6)
inertias = []

for k in ks:
    # Fit a k-means model with k clusters to samples (fit() returns the model)
    model = KMeans(n_clusters=k).fit(samples)

    # Record the inertia of the fitted model for this k
    inertias.append(model.inertia_)

# Plot inertia against k, with one x tick per candidate k
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

Crosstab

In [None]:
# pandas is required by this cell and nothing above it imports it
import pandas as pd

# Cross-tabulate the 'labels' column against the 'varieties' column of df.
# Left as a bare expression so the notebook displays the resulting table.
pd.crosstab(df['labels'], df['varieties'])

### Hierarchical clustering

In [None]:
# Imports used by this cell
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

# Hierarchical clustering of samples using complete linkage
mergings = linkage(samples, method='complete')

# Draw the merge tree; leaves are labelled with varieties, rotated 90 degrees, in a small font
dendrogram(mergings, labels=varieties, leaf_rotation=90, leaf_font_size=6)
plt.show()

SciPy hierarchical clustering doesn't fit into a sklearn pipeline, so you'll need to use the normalize() function from sklearn.preprocessing instead of Normalizer.

In [None]:
# Function form of normalization, used here because the SciPy clustering
# functions are called directly rather than through a sklearn pipeline
from sklearn.preprocessing import normalize

# Normalized copy of movements, produced with normalize()'s default settings
normalized_movements = normalize(movements)

To get the labels (results) from the hierarchical clustering, you can apply the fcluster function from scipy.cluster.hierarchy, specifying the 'cutting' height.

In [None]:
# fcluster is not imported anywhere above (only linkage and dendrogram are)
from scipy.cluster.hierarchy import fcluster

# Flatten the hierarchy into cluster labels by cutting the tree at distance 6
labels = fcluster(mergings, 6, criterion='distance')

## Scaling

In [None]:
# Imports used by this cell
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The two pipeline steps: a standard scaler and a 4-cluster k-means model
scaler, kmeans = StandardScaler(), KMeans(n_clusters=4)

# Chain the steps so data passes through the scaler before reaching k-means
pipeline = make_pipeline(scaler, kmeans)

In [None]:
# Transformer form of normalization, usable as a pipeline step
from sklearn.preprocessing import Normalizer

# The two pipeline steps: a normalizer and a 10-cluster k-means model
normalizer, kmeans = Normalizer(), KMeans(n_clusters=10)

# Chain the steps so data is normalized before it reaches k-means
pipeline = make_pipeline(normalizer, kmeans)

# Fit the whole pipeline to the daily price movements
pipeline.fit(movements)

## Visualisation

### t-SNE

In [None]:
from sklearn.manifold import TSNE

Create a TSNE instance: model

In [None]:
# t-SNE model with learning_rate=200; all other settings are left at their defaults
model = TSNE(learning_rate=200)

In [None]:
# Fit t-SNE to samples and get the embedded coordinates back in the same call
tsne_features = model.fit_transform(samples)

Plot:

In [None]:
# Columns 0 and 1 of the t-SNE output become the x and y plotting coordinates
xs, ys = tsne_features[:, 0], tsne_features[:, 1]

# One marker per sample, coloured by variety_numbers
plt.scatter(xs, ys, c=variety_numbers)
plt.show()

## PCA

Calculate correlation

In [None]:
# pearsonr lives in scipy.stats ("from ... as ..." is not valid import syntax)
from scipy.stats import pearsonr

# Pearson correlation between width and length, together with its p-value
correlation, pvalue = pearsonr(width, length)

Decorrelate:

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA with default settings (no n_components limit is passed)
model = PCA()

# Fit the PCA model to grains and return the transformed samples in one call
pca_features = model.fit_transform(grains)

In [None]:
# Mean stored on the fitted PCA model
mean = model.mean_

# Row 0 of components_ is the first principal component
first_pc = model.components_[0]

Plot the first principal component

In [None]:
# Row 0 of components_ is the first principal component
first_pc = model.components_[0]

# Red arrow anchored at the mean and pointing along first_pc
plt.arrow(
    x=mean[0],
    y=mean[1],
    dx=first_pc[0],
    dy=first_pc[1],
    color='red',
    width=0.01,
)

# Use equal axis scaling so the arrow's direction is not distorted
plt.axis('equal')
plt.show()

### Explained variance

In [None]:
# NOTE: `pca` must already be a fitted PCA instance when this cell runs;
# it reads the fitted attributes n_components_ and explained_variance_.

# One bar position per PCA feature
features = range(pca.n_components_)

# Bar height is the explained variance of each PCA feature
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

### Number of components chosen

In [None]:
# PCA restricted to 2 components, fitted to the scaled samples (fit() returns the model)
pca = PCA(n_components=2).fit(scaled_samples)

# Project the same scaled samples onto the fitted components
pca_features = pca.transform(scaled_samples)

### Sparse matrices for tf-idf (word frequencies)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# tf-idf vectorizer with default settings
tfidf = TfidfVectorizer() 

Apply fit_transform to document: csr_mat

In [None]:
# Fit the vectorizer to documents and build their tf-idf matrix in one call.
# NOTE(review): the name suggests a SciPy CSR sparse matrix; confirm if the type matters downstream.
csr_mat = tfidf.fit_transform(documents)

In [None]:
# Vocabulary learned by the vectorizer, one word per tf-idf column.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it and returns an array, so wrap it in list() to keep a plain list.
words = list(tfidf.get_feature_names_out())

#### Applying PCA to sparse matrices

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Truncated SVD keeping 50 components; per the heading above, this stands in for PCA on sparse input
svd = TruncatedSVD(n_components=50)

## Non-negative matrix factorization (NMF)

In [None]:
from sklearn.decomposition import NMF

Create an NMF instance: model

In [None]:
# NMF model with 6 components
model = NMF(n_components=6)

Fit the model to articles

In [None]:
# Fit the NMF model to articles (bare expression, so a notebook shows the estimator's repr)
model.fit(articles)

Transform the articles: nmf_features

In [None]:
# Express each article in terms of the fitted NMF components
nmf_features = model.transform(articles)

In [None]:
# Components learned by the fitted NMF model (bare expression so a notebook displays the array)
model.components_

Apply this to your NMF model for popular Wikipedia articles, by finding the articles most similar to the article about the footballer Cristiano Ronaldo. The NMF features you obtained earlier are available as nmf_features, while titles is a list of the article titles.

In [None]:
# NMF estimator
from sklearn.decomposition import NMF

# 6-component NMF model fitted to articles (fit() returns the model itself)
model = NMF(n_components=6).fit(articles)

# NMF feature values for each article
nmf_features = model.transform(articles)

In [None]:
# pandas is required by this cell; the first pandas import in the notebook comes only in a later cell
import pandas as pd

# One row per NMF component, one column per word in `words`
components_df = pd.DataFrame(model.components_, columns=words)

# Row 3 of the table, i.e. the component at position 3
component = components_df.iloc[3]

# Show the words with the largest values in that component (nlargest() defaults to the top 5)
print(component.nlargest())

In [None]:
# Imports used by this cell
import pandas as pd
from sklearn.preprocessing import normalize

# Normalize the NMF features so that the dot products below act as cosine similarities
norm_features = normalize(nmf_features)

# One row per article, indexed by its title
df = pd.DataFrame(norm_features, index=titles)

# Feature row for the reference article
article = df.loc['Cristiano Ronaldo']

# Dot product of every article's row with the reference row
similarities = df.dot(article)

# Show the articles with the largest similarity to the reference
print(similarities.nlargest())

In this exercise and the next, you'll use what you've learned about NMF to recommend popular music artists! You are given a sparse array _artists_ whose rows correspond to artists and whose columns correspond to users. The entries give the number of times each artist was listened to by each user.

In this exercise, build a pipeline and transform the array into normalized NMF features. The first step in the pipeline, _MaxAbsScaler_, transforms the data so that all users have the same influence on the model, regardless of how many different artists they've listened to. In the next exercise, you'll use the resulting normalized NMF features for recommendation!

This data is part of a larger dataset available [here](http://www-etud.iro.umontreal.ca/~bergstrj/audioscrobbler_data.html).

In [None]:
# Imports used by this cell
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

# The three pipeline steps, in the order they are applied:
# a max-abs scaler, a 20-component NMF model, and a normalizer
scaler, nmf, normalizer = MaxAbsScaler(), NMF(n_components=20), Normalizer()

# Chain the steps: scale, factorize, then normalize
pipeline = make_pipeline(scaler, nmf, normalizer)

# Fit the pipeline to artists and keep the normalized NMF features it produces
norm_features = pipeline.fit_transform(artists)

Suppose you were a big fan of Bruce Springsteen - which other musical artists might you like? Use your NMF features from the previous exercise and the cosine similarity to find similar musical artists. A solution to the previous exercise has been run, so *norm_features* is an array containing the normalized NMF features as rows. The names of the musical artists are available as the list *artist_names*.

In [None]:
# pandas provides the labelled table used for the lookup
import pandas as pd

# One row of normalized NMF features per artist, indexed by artist name
df = pd.DataFrame(norm_features, index=artist_names)

# Feature row for the reference artist
artist = df.loc['Bruce Springsteen']

# Dot product of every artist's row with the reference row (cosine similarity on normalized rows)
similarities = df.dot(artist)

# Show the artists with the largest similarity to the reference
print(similarities.nlargest())