## Topic clustering via Non-negative matrix factorization (NMF)
Lecture note: unsupervised learning (DataCamp)

In [2]:
import pandas as pd
from   scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster       import KMeans
from sklearn.pipeline      import make_pipeline
from pathlib               import Path
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import os

In [3]:
# Set the project root directory.
# The location can be overridden through the DATA_SCIENCE_CORE_ROOT environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original hard-coded path is used as the default.
path_root = Path(os.environ.get("DATA_SCIENCE_CORE_ROOT", "C:/Users/giann/data-science-core"))
# Make the project root the current working directory
# (raises if the directory does not exist).
os.chdir(path_root)
print(f'- Root directory = {os.getcwd()}')

- Root directory = C:\Users\giann\data-science-core


### import data
To preprocess `wikipedia-vectors.csv` into the format used in the exercises, you have to take its transpose.
The file is stored with words as rows and articles as columns because, otherwise, it would have about 13,000 columns
(one for each of the roughly 13,000 words in the file), which is a lot of columns for a CSV to have.

In [4]:
# Location of the word-vector CSV underneath the project root
path_dataset = path_root / 'dataset/wikipedia-articles2/wikipedia-vectors.csv'
# Read the CSV, using its first column as the row index
df = pd.read_csv(path_dataset, index_col=0)
# The column labels of the CSV are kept as the list of titles
titles = df.columns.tolist()
# Transpose the frame and store it as a CSR sparse matrix
articles = csr_matrix(df.T)
# Display the sparse-matrix summary as the cell output
articles

<60x13125 sparse matrix of type '<class 'numpy.float64'>'
	with 42091 stored elements in Compressed Sparse Row format>

### Apply NMF
Note that NMF requires all values to be non-negative (zeros are allowed).

In [6]:
# Build a recommender system by finding article similarities.
# Create the NMF model with 6 components.
# random_state is pinned so the factorization, and every number derived from
# it further down, is reproducible after a kernel restart (scikit-learn uses
# it for the initialisation and inside the coordinate-descent solver).
model = NMF(n_components=6, random_state=0)
# Fit the model to the articles matrix
model.fit(articles)
# Project the articles onto the learned components: nmf_features
nmf_features = model.transform(articles)
# Print the shape of the NMF feature matrix
print(nmf_features.shape)

(60, 6)


In [7]:
# Rescale every row of the NMF features to unit L2 norm: norm_features
norm_features = normalize(nmf_features, norm='l2', axis=1)
# Wrap the normalized features in a DataFrame indexed by title.
# NOTE: this rebinds `df`, which held the frame read from the CSV until here.
df = pd.DataFrame(data=norm_features, index=titles)
# Pick out the row labelled 'Cristiano Ronaldo': article
article = df.loc['Cristiano Ronaldo']
# Dot product of every row with that article's row: similarities
# (rows are unit length, so these are cosine similarities)
similarities = df @ article
# Print the five largest similarities
print(similarities.nlargest(n=5))

Cristiano Ronaldo                1.000000
Franck Ribéry                    0.999972
Radamel Falcao                   0.999942
Zlatan Ibrahimović               0.999942
France national football team    0.999923
dtype: float64
