In [10]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Dimension reduction and clustering libraries
import umap.umap_ as umap
import umap.plot as uplot
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [11]:
# Embedding table read from disk, keyed by article id.
embeddings = pd.read_csv('pure_embeddings.csv').set_index('news_id')

# Article metadata: discard the text / URL / entity columns, key the frame by
# article id, then keep only rows with no missing value in what remains.
news = (
    pd.read_csv('MIND_small/csv/news.csv')
    .drop(columns=['title', 'abstract', 'url', 'title_entities', 'abstract_entities'])
    .set_index('news_id')
    .dropna()
)

In [12]:
# Preview the label columns without the leftover 'Unnamed: 0' column
# (presumably a row counter written when the CSV was saved -- confirm).
news.drop('Unnamed: 0', axis='columns').head(5)

Unnamed: 0_level_0,category,sub_category
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1
N55528,lifestyle,lifestyleroyals
N19639,health,weightloss
N61837,news,newsworld
N53526,health,voices
N38324,health,medical


In [13]:
# Place the category labels and the embedding columns side by side, aligned on
# the shared news_id index. Rows present in only one frame are kept here and
# are filled with NaN on the other side.
total = pd.concat(
    objs=[news.drop(columns='Unnamed: 0'), embeddings],
    axis='columns',
)
total.head(5)

Unnamed: 0_level_0,category,sub_category,0_abstract,1_abstract,2_abstract,3_abstract,4_abstract,5_abstract,6_abstract,7_abstract,...,758_title,759_title,760_title,761_title,762_title,763_title,764_title,765_title,766_title,767_title
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,lifestyle,lifestyleroyals,-0.678508,0.510086,0.999818,-0.989043,0.946416,0.872465,0.983302,-0.989201,...,0.784108,0.066547,0.084478,-0.474509,-0.23388,-0.330869,-0.392143,0.999986,-0.325422,0.990003
N19639,health,weightloss,-0.689825,0.470679,0.999851,-0.994703,0.969886,0.945957,0.989684,-0.996318,...,0.92366,-0.199729,0.2982,-0.371616,-0.024119,-0.210733,-0.371494,0.999987,-0.404185,0.982506
N61837,news,newsworld,-0.649138,0.332447,0.999478,-0.9753,0.930517,0.928687,0.952428,-0.996233,...,0.756771,-0.275562,0.560876,-0.562148,-0.324376,-0.294702,-0.252834,0.999979,-0.583293,0.985602
N53526,health,voices,-0.620251,0.378284,0.999805,-0.993509,0.971419,0.956017,0.990925,-0.997767,...,0.788761,-0.043666,0.24967,-0.392529,-0.044873,-0.389453,-0.319429,0.999984,-0.537654,0.991666
N38324,health,medical,-0.737245,0.456646,0.999699,-0.98731,0.945211,0.84417,0.972281,-0.987509,...,0.71287,-0.0605,0.096479,-0.199158,-0.050173,0.004713,-0.213084,0.999963,-0.298894,0.985555


In [14]:
# Discard every row that has a missing value in any column, i.e. articles that
# lack either their labels or their embedding after the index-aligned concat.
# Rebinding the name instead of using inplace=True avoids silently mutating the
# frame that an earlier cell already displayed; later cells still see `total`.
total = total.dropna()

In [15]:
# Inspect the dtype of every column in the embeddings frame.
embeddings.dtypes

0_abstract    float64
1_abstract    float64
2_abstract    float64
3_abstract    float64
4_abstract    float64
               ...   
763_title     float64
764_title     float64
765_title     float64
766_title     float64
767_title     float64
Length: 1536, dtype: object

In [16]:
# Work on a narrow slice: the two label columns followed by the first ten
# abstract-embedding dimensions (columns '0_abstract' .. '9_abstract').
subset = total.loc[
    :,
    ['category', 'sub_category', *(f'{dim}_abstract' for dim in range(10))],
]
subset.head(5)

Unnamed: 0_level_0,category,sub_category,0_abstract,1_abstract,2_abstract,3_abstract,4_abstract,5_abstract,6_abstract,7_abstract,8_abstract,9_abstract
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
N55528,lifestyle,lifestyleroyals,-0.678508,0.510086,0.999818,-0.989043,0.946416,0.872465,0.983302,-0.989201,-0.968395,-0.679993
N19639,health,weightloss,-0.689825,0.470679,0.999851,-0.994703,0.969886,0.945957,0.989684,-0.996318,-0.974794,-0.706272
N61837,news,newsworld,-0.649138,0.332447,0.999478,-0.9753,0.930517,0.928687,0.952428,-0.996233,-0.912072,-0.400712
N53526,health,voices,-0.620251,0.378284,0.999805,-0.993509,0.971419,0.956017,0.990925,-0.997767,-0.974476,-0.661895
N38324,health,medical,-0.737245,0.456646,0.999699,-0.98731,0.945211,0.84417,0.972281,-0.987509,-0.93806,-0.693719


In [17]:
# Everything after the two leading label columns, i.e. the numeric features.
subset.iloc[:, 2:]

Unnamed: 0_level_0,0_abstract,1_abstract,2_abstract,3_abstract,4_abstract,5_abstract,6_abstract,7_abstract,8_abstract,9_abstract
news_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
N55528,-0.678508,0.510086,0.999818,-0.989043,0.946416,0.872465,0.983302,-0.989201,-0.968395,-0.679993
N19639,-0.689825,0.470679,0.999851,-0.994703,0.969886,0.945957,0.989684,-0.996318,-0.974794,-0.706272
N61837,-0.649138,0.332447,0.999478,-0.975300,0.930517,0.928687,0.952428,-0.996233,-0.912072,-0.400712
N53526,-0.620251,0.378284,0.999805,-0.993509,0.971419,0.956017,0.990925,-0.997767,-0.974476,-0.661895
N38324,-0.737245,0.456646,0.999699,-0.987310,0.945211,0.844170,0.972281,-0.987509,-0.938060,-0.693719
...,...,...,...,...,...,...,...,...,...,...
N17258,-0.618489,0.464543,0.999628,-0.974870,0.929978,0.757753,0.966552,-0.970493,-0.933629,-0.651518
N23858,-0.632591,0.450974,0.999812,-0.988964,0.946670,0.778630,0.980280,-0.977038,-0.968778,-0.662544
N16909,-0.792380,0.611406,0.999981,-0.994471,0.969889,0.769428,0.988007,-0.911938,-0.988090,-0.712895
N7482,-0.694298,0.404148,0.999844,-0.993438,0.954311,0.865981,0.985001,-0.991658,-0.956053,-0.680331


In [18]:
# Fit UMAP on the numeric embedding columns only; the first two columns of
# `subset` are the category / sub_category labels and are skipped.
# UMAP is stochastic, so a fixed random_state keeps the fitted layout
# reproducible across kernel restarts. Per the umap-learn docs this trades
# away some of UMAP's multi-threaded speed.
embedding_mapper = umap.UMAP(random_state=42).fit(subset[subset.columns[2:]])

: 