In [None]:
from itertools import product, combinations, count, permutations, starmap, chain, repeat
import numpy as np
import pandas as pd
from astroquery.vizier import Vizier
import matplotlib.pyplot as plt
import astropy as ap
import sklearn
from sklearn.neighbors import KernelDensity as KD
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA
import corner
from sklearn.mixture import GaussianMixture as GM
from sklearn.mixture import BayesianGaussianMixture as BGM
from sklearn.pipeline import Pipeline
from scipy.stats import norm
from matplotlib.colors import LogNorm
from astroquery.gaia import Gaia
from scipy import stats
import seaborn as sns

In [None]:
%matplotlib inline

In [None]:
# Rebind the imported name `Vizier` to the object returned by calling it with
# row_limit=20000; every later `Vizier.` call in this notebook goes through it.
Vizier = Vizier(row_limit=20000)

In [None]:
# Request the tables published under the VizieR identifier "J/A+A/618/A93".
catalog = Vizier.get_catalogs("J/A+A/618/A93")

# Keep the entry at index 1; it is the table that gets grouped by its 'Cluster'
# column in the next cell.
clucata = catalog[1]

In [None]:
# Group the table by cluster name; the size of each group is the difference
# between consecutive group boundary indices.
newc = clucata.group_by('Cluster')
group_sizes = np.diff(newc.groups.indices)
maxcluster = np.argmax(group_sizes)
# The group that is analysed sits 6 positions after the most populous one.
clusterid = maxcluster + 6
bigcluster = newc.groups[clusterid]
print(bigcluster.colnames)
# Show the name of the selected cluster as the cell output.
bigcluster['Cluster'][0]

Good `'Cluster'`s to choose from: Alessi\_24, ASCC\_99, Alessi\_12

In [None]:
# Keep rows with membership probability above 0.8, then those with a finite BP-RP.
members = bigcluster[bigcluster['PMemb']>.8]
cutcluster = members[~np.isnan(members["BP-RP"])]
#We should cut by lines away from main sequence
# For now, drop two rectangular corners of the (BP-RP, Gmag) plane instead.
red_and_bright = np.logical_and(cutcluster["BP-RP"]>1.0, cutcluster['Gmag']<10.)
cutcluster = cutcluster[~red_and_bright]
blue_and_faint = np.logical_and(cutcluster["BP-RP"]<.7, cutcluster['Gmag']>13.8)
cutcluster = cutcluster[~blue_and_faint]

# Sky positions of the stars that survive the cuts.
cluster_name = cutcluster['Cluster'][0]
plt.plot(cutcluster['RA_ICRS'], cutcluster['DE_ICRS'], '+')
plt.title('angular coordinates of ' + cluster_name)

In [None]:
# ADQL template selecting source_id and the BP/RP mean magnitudes from
# gaiadr2.gaia_source; the `{}` placeholder receives the source-id condition(s).
query = 'SELECT source_id, phot_bp_mean_mag, phot_rp_mean_mag FROM gaiadr2.gaia_source WHERE source_id = {}'

In [None]:
# Fill the template so that the WHERE clause reads
# "source_id = a OR source_id = b OR ..." over every star kept in cutcluster.
source_ids = (str(sid) for sid in cutcluster['Source'])
newquery = query.format(' OR source_id = '.join(source_ids))
newquery;

In [None]:
# Same SELECT as the template, but for one hard-coded source_id; the returned
# job is not bound to a name, it is only shown as the cell output.
Gaia.launch_job('SELECT source_id, phot_bp_mean_mag, phot_rp_mean_mag FROM gaiadr2.gaia_source WHERE source_id = 1842846157478382720')

In [None]:
# Submit the multi-source query held in `newquery`.
newjob = Gaia.launch_job(query=newquery)

In [None]:
# Retrieve the result table of the job (columns: source_id, phot_bp_mean_mag,
# phot_rp_mean_mag, as named in the SELECT).
results = newjob.get_results()

In [None]:
# Inspection only: the expression is evaluated but not assigned, and the
# trailing semicolon suppresses the cell output.
results['phot_bp_mean_mag'].info.parent_table;

In [None]:
# Attach the Gaia BP/RP magnitudes to cutcluster, matching rows on source id.
# The query has no ORDER BY, so the result rows cannot be assumed to come back
# in the order of cutcluster['Source']; pasting the columns on positionally
# could pair a star with another star's photometry.
cluster_ids = np.asarray(cutcluster['Source'])
gaia_ids = np.asarray(results['source_id'])
if len(gaia_ids) == 0:
    raise ValueError("Gaia query returned no rows")

# For each cluster star, find the row of `results` carrying the same source id.
gaia_order = np.argsort(gaia_ids)
slot = np.searchsorted(gaia_ids, cluster_ids, sorter=gaia_order)
match = gaia_order[np.clip(slot, 0, len(gaia_ids) - 1)]
if not np.array_equal(gaia_ids[match], cluster_ids):
    missing = int(np.count_nonzero(gaia_ids[match] != cluster_ids))
    raise ValueError(
        "{} cluster sources have no matching row in the Gaia results".format(missing)
    )

# Assigning by name (rather than add_columns) lets the cell be re-run without
# failing on an already-existing column.
for colname in ('phot_bp_mean_mag', 'phot_rp_mean_mag'):
    cutcluster[colname] = results[colname][match]

In [None]:
# Column names used as the two axes of the color-magnitude diagram below.
the_color, the_mag = 'BP-RP', 'phot_rp_mean_mag'

In [None]:
# Distribution of membership probabilities among the retained stars.
cluster_name = cutcluster['Cluster'][0]
plt.hist(cutcluster['PMemb'])
plt.xlabel('PMemb')
plt.title('cluster membership probability of ' + cluster_name)

In [None]:
# Color-magnitude diagram; the y-limits are given high-to-low so that
# smaller magnitudes are drawn towards the top.
cmd_color = cutcluster[the_color]
cmd_mag = cutcluster[the_mag]
plt.plot(cmd_color, cmd_mag, '+')
plt.ylim(19, 7)
plt.xlabel('color')
plt.ylabel('magnitude')
plt.title('color-magnitude diagram of ' + cutcluster['Cluster'][0])

PCA - diagram, inverse of variance as metric -> sqrt(thing)= transformation, apply ->PCA(test) -> KDE -> inverse transform both L and R on Kernel Widths

LLE? -> Locally linear embedding -> for non-MS (non-main-sequence)

remove outliers before PCA

In [None]:
# Design matrix with one row per star and two columns: (color, magnitude).
X = np.column_stack([np.array(cutcluster[name]) for name in (the_color, the_mag)])

In [None]:
# Two-stage model: whitened 2-component PCA followed by a Gaussian KDE
# fitted on the PCA output.
pca = PCA(n_components=2, whiten=True)
kde = KD(kernel='gaussian')
pipe = Pipeline(steps=[
    ('pca', pca),
    ('kde', kde),
])

In [None]:
# Fit the pipeline on X with the KDE's current bandwidth setting; the value
# returned by fit() is kept as X_pipe.
X_pipe = pipe.fit(X)

In [None]:
# 10-fold cross-validated search over 200 log-spaced bandwidths in
# [1e-2, 1e2] for the pipeline's 'kde' step.
params = {'kde__bandwidth': np.logspace(-2, 2, 200)}
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=10)
grid.fit(X)

In [None]:
# Replace `pipe` with the grid search's best estimator, fitted on all of X.
# NOTE(review): GridSearchCV presumably already refits best_estimator_ on X, in
# which case this extra fit() is redundant but harmless — confirm.
pipe=grid.best_estimator_.fit(X)

In [None]:
# Project the data through the pipeline's first step (the whitened PCA) and
# plot the two resulting components against each other.
X_transform = pipe[0].transform(X)
pc1, pc2 = X_transform[:, 0], X_transform[:, 1]
plt.scatter(pc1, pc2)
plt.axis("equal")

In [None]:
# NOTE: no KDE implementation with a separate bandwidth per dimension was readily
# available. We would want one, because the errors in color are much greater than
# the errors in magnitude.
# params = {'bandwidth': np.logspace(-2, 2, 200)}
# grid = GridSearchCV(pipe, params, cv=10)
# grid.fit(X_pca)

best_bandwidth = grid.best_estimator_[1].bandwidth
print("best bandwidth: {0}".format(best_bandwidth))
# The first attempt clearly used too fine a bandwidth, since it allows for double
# stars. Either fix that here, or accept this as what the data is and fit an HRD
# model that does not permit them.
# From here on, the data is the KDE evaluated on a grid.

xmin, xmax = -4, +4
ymin, ymax = -4, +4
xlen = ylen = 50

x_nodes = np.linspace(xmin, xmax, xlen)
y_nodes = np.linspace(ymin, ymax, ylen)
# product() varies y fastest, so a row-major reshape to (xlen, ylen) is indexed [x, y].
eval_where = np.array(list(product(x_nodes, y_nodes)))
log_dens = pipe[1].score_samples(eval_where)

density = np.exp(log_dens.reshape(xlen, ylen))
# Transpose to [y, x] and reverse the rows so that the largest y is drawn on top.
plt.imshow(density.T[::-1],
           extent=[xmin, xmax, ymin, ymax])
#plt.scatter(cutcluster['BP-RP'], cutcluster[the_mag], marker='.', color='r', s=1)

In [None]:
# Coordinate grids over the same rectangle as the KDE evaluation: the complex
# step (n*1j) makes mgrid return n evenly spaced points including both ends,
# so A and B have shape (xlen, ylen) with A varying along axis 0.
A, B = np.mgrid[xmin:xmax:xlen*1j,ymin:ymax:ylen*1j]

In [None]:
# Density contours in the PCA (whitened) coordinates; the trailing semicolon
# suppresses the cell output.
plt.contour(A, B, np.exp(log_dens.reshape(xlen,ylen)));

In [None]:
# Flatten the PCA-space grid to an (N, 2) list of points, map it back through the
# PCA step once, and split the result into the two original-space coordinate grids.
C_grid = np.column_stack([A.ravel(), B.ravel()])
grid_in_data_space = pipe[0].inverse_transform(C_grid)
E_grid = grid_in_data_space[:, 0].reshape(xlen, ylen)
F_grid = grid_in_data_space[:, 1].reshape(xlen, ylen)

In [None]:
# Overlay the KDE contours (mapped back to color/magnitude coordinates) on the
# observed stars, then write the figure to disk.
density = np.exp(log_dens.reshape(xlen, ylen))
cluster_name = cutcluster['Cluster'][0]

plt.plot(cutcluster[the_color], cutcluster[the_mag], 'r+', alpha=.4)
plt.contour(E_grid, F_grid, density)
plt.ylim(19, 8)
plt.xlabel(the_color)
plt.ylabel(the_mag)
plt.title('color-magnitude diagram of ' + cluster_name)
plt.savefig("cmd.png")

In [None]:
def get_cmsamps(nsamps, random_state=None):
    """Draw synthetic stars from the fitted density model.

    Samples are drawn from the pipeline's KDE step (which lives in the PCA
    coordinates) and mapped back to the original data columns through the
    PCA step's inverse transform.

    Parameters
    ----------
    nsamps : int
        Number of points to draw.
    random_state : int, RandomState instance or None, optional
        Forwarded to the KDE's ``sample``; pass an integer to make the draw
        reproducible. The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    tuple of two 1-D arrays
        Column 0 and column 1 of the inverse-transformed samples, i.e. the two
        columns of the matrix the pipeline was fitted on, in that order.
    """
    pca_step, kde_step = pipe[0], pipe[1]
    # sample() already returns an (nsamps, 2) array, so no per-row copy is needed.
    samps = kde_step.sample(nsamps, random_state=random_state)
    cmsamps = pca_step.inverse_transform(samps)
    return cmsamps[:, 0], cmsamps[:, 1]

In [None]:
# Draw 200 synthetic points; get_cmsamps returns the two data columns in the
# order they were stacked into X (color first, then magnitude).
colors, mags = get_cmsamps(200)

In [None]:
def toflux(m):
    """Convert a magnitude to a relative flux, 10**(-0.4 * m)."""
    return 10. ** (-0.4 * m)


def tomag(ϕ):
    """Convert a relative flux back to a magnitude, -2.5 * log10(flux)."""
    return -2.5 * np.log10(ϕ)

In [None]:
def magsum(m, c):
    """Magnitude of the summed flux of the magnitudes in ``m``.

    ``c`` is not used; it is accepted so that magsum and colorsum can be called
    with the same (m, c) arguments.
    """
    total_flux = sum(toflux(g) for g in m)
    return tomag(total_flux)


def colorsum(m, c):
    """Combined color of the objects with magnitudes ``m`` and colors ``c``.

    The fluxes corresponding to ``m + c`` are summed and converted back to a
    magnitude, and the combined magnitude of ``m`` alone is subtracted.
    (Presumably ``m + c`` is each object's magnitude in the other band — confirm.)
    """
    shifted_flux = sum(toflux(g + h) for g, h in zip(m, c))
    return tomag(shifted_flux) - magsum(m, c)

In [None]:
def nwise(n):
    """Build a function that enumerates every n-element subset of paired data.

    The returned function takes two equally ordered sequences ``x`` and ``y``,
    pairs them element-wise, and lazily yields one item per n-combination of
    those pairs. Each item is an iterator producing two arrays: the ``x``
    values of the chosen pairs, then their ``y`` values.
    """
    def grouped(x, y):
        points = zip(x, y)
        return (map(np.array, zip(*subset)) for subset in combinations(points, n))
    return grouped

In [None]:
def makesums(n):
    """Combine every n-subset of the sampled stars into one blended object.

    Returns a two-element list of float64 arrays, one entry per n-combination of
    the (mags, colors) samples: first the results of ``magsum``, then the
    results of ``colorsum``.
    """
    return [
        np.fromiter(starmap(reducer, nwise(n)(mags, colors)), np.float64)
        for reducer in (magsum, colorsum)
    ]

In [None]:
# Blend every pair (n=2) of sampled stars; makesums returns the magsum results
# first and the colorsum results second.
newmags, newcolors = makesums(2)

In [None]:
# Compare the density of the single-star samples with that of the blended pairs.
# The two variables are passed as x=/y= keywords: current seaborn releases no
# longer accept the second variable as a positional argument.
sns.kdeplot(x=colors, y=mags)
sns.kdeplot(x=newcolors, y=newmags)
plt.xlabel('color')
plt.ylabel('magnitude')
# Limits are given high-to-low so that smaller magnitudes are drawn towards the top.
plt.ylim(18,7)

### END OF REAL WORK