# Plotting data from "Twins and Weirdos" w/ mapper
https://arxiv.org/pdf/1711.00022.pdf  
https://github.com/ireis/APOGEE

In [33]:
import kmapper as km
from kmapper import jupyter # Creates custom CSS full-size Jupyter screen
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import sklearn

%matplotlib inline

In [88]:
# This is 23 GB, probably need more RAM to load??
# Will start with just reading small chunks to get a look at data

data = np.load('full_dmat_f2.npy',mmap_mode='r')
dim = data.shape[0]
num_samples = 1000

# Randomly sample from the distance matrix
rand_idxs = np.array(np.random.choice(dim-1, num_samples))
#print(rand_idxs)

# I need to now shape this into a square distance matrix
dist_mat = data[rand_idxs[:,None],rand_idxs]
#print(dist_mat)


In [10]:
# Validating random sample extraction

print(data[99768,15089])

0.933


In [35]:
# Generating labels for data

from astropy.io import fits

allstar_fits_filename = 'allStar-l31c.2.fits'
with fits.open(allstar_fits_filename) as hdulist:

    allstar = pd.DataFrame([hdulist[1].data['APOGEE_ID'],
                        hdulist[1].data['RA'],
                        hdulist[1].data['DEC'],
                        hdulist[1].data['TEFF'],
                        hdulist[1].data['LOGG'],
                        hdulist[1].data['O_FE'],
                        hdulist[1].data['C_FE'],
                        hdulist[1].data['M_H']], 
                        index = ['APOGEE_ID', 
                        'RA', 
                        'DEC',
                        'TEFF',
                        'LOGG',
                        'O_FE',
                        'C_FE',
                        'M_H'])

allstar = allstar.T
allstar = allstar.set_index('APOGEE_ID')
#print(allstar.shape)
allstar = allstar[~allstar.index.duplicated(keep='first')]
#print(allstar.shape)
allstar.head()


# Issue is how to join labels with distance matrix points - the distance matrix doesn't appear to have labels

# Assuming now that the tsnecoor data set has the data in order 
tSNE_df = pd.read_csv('tsnecoor.dat.gz', delimiter='\s+', header=None, index_col=0 )
print(tSNE_df.shape)
tSNE_df.head()

print(tSNE_df.iloc[139707])


# Here's the mapping
dr13_locs = np.load('dr13_locs.npy')
print("dr13_locs size:",len(dr13_locs))
# cross objects are all the objects in DR13 that are also in DR14
# Returns all of the indices with such a mapping
cross_objs = np.where(dr13_locs >= 0)[0]
print("cross_objs size:",len(cross_objs))

dr13_locs_use = dr13_locs[cross_objs].astype(int)
print("locs_use:",len(dr13_locs_use))


nof_objects = data.shape[0]
print("# objects:",nof_objects)
# Need to populate this array so it's the same size as the data matrix
#tsne_x_use_full = np.ones(nof_objects)*np.nan
#tsne_y_use_full = np.ones(nof_objects)*np.nan
dr13_map_full = np.array(np.ones(nof_objects)*np.nan,dtype=str)
#print(dr13_map_full)

# Grabs indices of stars in tsnecoor
star_ident = np.array(tSNE_df.axes[0])
print("star-index length:",len(star_ident))

# I want to make an array dim long that has corresponding labels in order of DR13 matrix
# First way to do this is make an array of the right indices dim long
dr13_locs_use = star_ident[dr13_locs_use]
print("dr13 locs use:",len(dr13_locs_use))
dr13_map_full[cross_objs] = dr13_locs_use
print("full map size: ", len(dr13_map_full))


# Creating matrix with list of different attributes per star

# Convert dr13 mapping to df so I can merge attribute info with it
dr13_mapping = pd.DataFrame(dr13_map_full)
dr13_mapping.rename(columns={0:"ID"},inplace = True)
dr13_mapping.head()


# Create

# allstar_w_tsne = allstar.merge(tSNE_df, left_index=True, right_index=True, how = 'inner')
# print(allstar_w_tsne.shape)
# allstar_w_tsne.head()

dr13_attr = dr13_mapping.merge(allstar, left_on='ID', right_index=True, how="left")
print(dr13_attr.shape)
dr13_attr.head()


(193556, 2)
1    19.973024
2     0.459540
Name: 2M19415577+3919361, dtype: float64
dr13_locs size: 107390
cross_objs size: 106112
locs_use: 106112
# objects: 107390
star-index length: 193556
dr13 locs use: 106112
full map size:  107390
(107390, 8)


Unnamed: 0,ID,RA,DEC,TEFF,LOGG,O_FE,C_FE,M_H
0,2M19415577+3919361,295.482,39.3267,4807.37,3.17617,0.0245807,-0.0708137,0.127862
1,2M06082176+1351359,92.0907,13.86,4399.25,1.9477,0.0571017,-0.0237896,-0.255092
2,2M17173422+4236157,259.393,42.6044,5319.26,-9999.0,0.107098,-0.0109298,-0.0746212
3,,,,,,,,
4,,,,,,,,


In [89]:
# Creating label for mapper from sampled dataset


# Effective temp
attrs = dr13_attr['TEFF']
attrs = np.array(attrs[rand_idxs])
attrs = np.where(attrs==-9999,0,attrs)
attrs = np.where(pd.isnull(attrs),0,attrs)
# Compute mean
attrs_mean = np.mean(attrs[np.where(attrs != 0)])
#print(attrs_mean)
# Plug mean into zeroes
attrs = np.where(attrs==0,attrs_mean,attrs)
#attrs = np.where(attrs==nan)
#print(attrs)



In [90]:
# Running mapper


# Initialize
mapper = km.KeplerMapper(verbose=1)

# Project by L2-norm on my distance matrix
X_projected = mapper.project(
     dist_mat,
     projection="l2norm"
 )

# try eccentricity?
# X_projected = mapper.project(
#     dist_mat,
#     projection="")



# # Fit to and transform the data
# projected_data = mapper.fit_transform(data, projection=[0,1]) # X-Y axis


# What to use for clusterer?
my_clusterer = sklearn.cluster.DBSCAN(eps=0.45, min_samples=5)


# Create dictionary called 'graph' with nodes, edges and meta-information
graph = mapper.map(X_projected, dist_mat, cover=km.Cover(n_cubes=20,perc_overlap=0.3),
                   precomputed = True, clusterer = my_clusterer)


# Visualizing in jupyter notebook

# Visualize it
html = mapper.visualize(graph, path_html="twins_weirdos_mapper.html",
                 title="Twins & Weirdos + mapper",
                       color_function = attrs)

# Inline display
# jupyter.display(path_html="http://mlwave.github.io/tda/word2vec-gender-bias.html")
jupyter.display(path_html="twins_weirdos_mapper.html")

KeplerMapper()
..Projecting on data shaped (1000, 1000)

..Projecting data using: l2norm

..Scaling with: MinMaxScaler(copy=True, feature_range=(0, 1))

Mapping on data shaped (1000, 1000) using lens shaped (1000, 1)

Creating 20 hypercubes.

Created 24 edges and 29 nodes in 0:00:00.113342.
Wrote visualization to: twins_weirdos_mapper.html


