In [None]:
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
try:
    # Matplotlib >= 2.2 names the Tk toolbar NavigationToolbar2Tk; the old
    # NavigationToolbar2TkAgg name was deprecated there and later removed.
    # Alias it so the rest of the notebook can keep using the old name.
    from matplotlib.backends.backend_tkagg import NavigationToolbar2Tk as NavigationToolbar2TkAgg
except ImportError:
    # Older Matplotlib releases only provide the original class name.
    from matplotlib.backends.backend_tkagg import NavigationToolbar2TkAgg
from matplotlib import colors
import matplotlib.cm as cmx
import matplotlib.gridspec as gridspec
import numpy as np
# Print arrays in full, never summarised with "...". NumPy requires a numeric
# threshold (the string 'nan' raises TypeError on current releases);
# sys.maxsize is the value its documentation recommends for this.
import sys
np.set_printoptions(threshold=sys.maxsize)
import seaborn as sns
import sys
# Tkinter and its file dialogs were renamed in Python 3
# (Tkinter -> tkinter, tkFileDialog -> tkinter.filedialog), so both imports
# are chosen together based on the running interpreter.
if sys.version_info[0] < 3:
    import Tkinter as Tk
    from tkFileDialog import askopenfilename, askdirectory, asksaveasfile
else:
    import tkinter as Tk
    from tkinter.filedialog import askopenfilename, askdirectory, asksaveasfile

sys.path.append('python')
from clusterOutliers import clusterOutliers
import keplerml

## Use keplerml.py to extract features from lightcurves
# Skip the next cell if the features have already been extracted: it recomputes them and overwrites the existing output file.

In [None]:
""" 
Use the following to extract features from the lightcurves in the 
Training_set_lightcurves (formerly for training a classifying technique
now for training new users).
filelist - path to a filelist with all relevant files listed by line
fitsDir - path to location of fits files
of - output file destination, should be csv

As it is, this will produce out.csv, a pandas dataframe 
with the calculated features, in the data/output folder.
It will overwrite files by the same name.
"""

# Output csv, input fits directory and input file list for the extraction.
of = 'data/output/out.csv'
fitsDir = 'data/Training_set_lightcurves'
filelist = 'data/filelists/filelist.txt'

# verbose=True is handed to keplerml unchanged.
keplerml.features_from_filelist(
    filelist,
    fitsDir,
    of,
    verbose=True,
)

## Import the features created with keplerml.py

In [None]:
# User defined
featCSV = "data/output/out.csv" # Path to csv containing feature data (should be a pandas dataframe saved as a csv)
fitsDir = "data/Training_set_lightcurves" # path to fits files

sample = clusterOutliers(featCSV,fitsDir)
# Create a random sampling of the data from the imported files.
# Use the randSampleWTabby method (as it is called in the notes of the
# following cells) instead to ensure Tabby star is included.
randomSample = sample.randSample(102) # choose number of samples

# Example for looping through 17 quarters of Kepler data. Kept as comments so
# that it is neither executed nor echoed as the cell output. Plain lists are
# used because a float array from np.zeros cannot hold clusterOutliers objects.
# Q = []
# randomSamples = []
# for i in range(1,18):
#     featCSV = 'Q%s.csv'%i
#     fitsDir = 'KeplerLCs/fitsfiles/Q%sfitsfiles'%i
#     Q.append(clusterOutliers(featCSV,fitsDir))
#     randomSamples.append(Q[-1].randSample(5000))

# Size of sample.files, shown as the cell output. This check used to sit in
# its own cell *before* `sample` was created, which raised NameError on a
# fresh kernel (Restart & Run All).
sample.files.size

In [None]:
"""
sample_tsne_fit reduces the dimensionality of the sample created using randSample or
randSampleWTabby.
tsne_fit can be used on other data frames and will return a 2D set of coordinates
corresponding to the original data.
"""
try:
    sample.sample_tsne_fit()
except ValueError as err:
    # Deliberately best-effort: keep the notebook running, but also show the
    # underlying error so the cause is not left to guesswork.
    print("Something went wrong, too few samples maybe? That's been an issue.")
    print("ValueError: %s" % err)

# Example for looping through 17 quarters of Kepler data. Kept as comments so
# that it is neither executed nor echoed as the cell output. A list is used
# because a float array from np.zeros cannot hold one fit result per quarter.
# tsneFit = []
# for i in range(17):
#     tsneFit.append(Q[i].tsne_fit(Q[i].dataSample))

In [None]:
"""
sample_km_out and sample_db_out operate on the sample generated with sample_tsne_fit.
km_out and db_out can be used on any dataframe with arbitrary dimensionality. These
will return the cluster labels with outliers labeled as -1.
"""
# Run both clustering methods on the sample, one after the other; the printed
# headings label any output produced by the two calls.
print("K-means")
sample.sample_km_out()
print("DBSCAN")
sample.sample_db_out()

# NOTE(review): the string below is the last expression of the cell, so the
# notebook echoes it as the cell's output. It is example code only and is
# never executed.
"""
# Example for looping through 17 quarters of kepler data.
for i in range(17):
    Q[i].km_out()
    Q[i].db_out()
"""

In [None]:
"""Run this before plotting in following cells.

This only works for plotting the sample generated with randSample or randSampleWTabby
reduced with sample_tsne_fit, then clustered with sample_km_out and/or sample_db_out.

If seeking to plot using the following cell, the variables in this cell will
need to be defined.

files - an array containing the names of the data source
clusterLabels - an array containing labels for each of the files
data - an array containing 2 coordinates for each file
cNorm - colors.Normalize(vmin=0, vmax=max(clusterLabels))
scalarMap - cmx.ScalarMappable(norm=cNorm, cmap='jet')
tsneX - x coordinates for all files (perhaps the 0th index for each pt in data: data.T[0])
tsneY - y coordinates "" (perhaps the 1st index for each pt in data: data.T[1])
outX,outY,files_out,clusterX,clusterY,files_cluster:

    for i in enumerate(data):
        if clusterLabels[i[0]] == -1:
            outX.append(i[1][0])
            outY.append(i[1][1])
            files_out.append(files[i[0]])
        else:
            clusterX.append(i[1][0])
            clusterY.append(i[1][1])
            files_cluster.append(files[i[0]])
lightcurveData - an array containing the light curve arrays [t,nf,err], 
    corresponding to each file, where t, nf, and err are arrays.
tabbyInd - the index of Tabby star in the files array
if data is a data frame whose index holds the filenames, you may use the
following (it falls back to index 0 when no filename contains 8462852):

    tabbyMask = data.index.str.contains('8462852')
    tabbyInd = int(tabbyMask.argmax()) if tabbyMask.any() else 0
"""
method = 'dbscan' # method is the clustering method we'd like to examine graphically.
#method = 'kmeans'


# import_for_plot hands back fifteen values in this fixed order; they become
# the notebook-level variables that the plotting cell reads.
(files, clusterLabels, data,
 cNorm, scalarMap, tsneX, tsneY, outX,
 outY, files_out, clusterX, clusterY,
 files_cluster, lightcurveData, tabbyInd) = sample.import_for_plot(method)

# Example for the 17-quarter loop: unpack the same fifteen values from a
# single quarter's object instead (only plot one quarter at a time). Kept as
# comments so that it is neither executed nor echoed as the cell output.
# (files, clusterLabels, data,
#  cNorm, scalarMap, tsneX, tsneY, outX,
#  outY, files_out, clusterX, clusterY,
#  files_cluster, lightcurveData, tabbyInd) = Q[i].import_for_plot(method)

In [None]:
%matplotlib tk 
# Sets the backend to TkAgg

root = Tk.Tk()
root.wm_title("Scatter")

if sample.importedForPlotting:
    fig = Figure(figsize=(20,10))

    # a tk.DrawingArea
    canvas = FigureCanvasTkAgg(fig, master=root)
    canvas.get_tk_widget().pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)
    # Toolbar to help navigate the data (pan, zoom, save image, etc.)
    toolbar = NavigationToolbar2TkAgg(canvas, root)
    toolbar.update()
    canvas._tkcanvas.pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)

    gs = gridspec.GridSpec(2,6)

    with sns.axes_style("white"):
        # empty subplot for scattered data
        ax = fig.add_subplot(gs[0,:4])
        # empty subplot for lightcurves
        ax2 = fig.add_subplot(gs[1,:])
        # empty subplot for center detail
        ax3 = fig.add_subplot(gs[0,4:])

    def distance(point, event):
        """Screen-space distance between a data point and the mouse position.

        Args:
            point (np.array): array of shape (2,) holding x,y in the data coords of ``ax``
            event (MouseEvent): mouse event; the mouse position is read from .x and .y
        Returns:
            distance (np.float64): distance (in screen coords) between mouse pos and data point
        """
        assert point.shape == (2,), "distance: point.shape is wrong: %s, must be (2,)" % point.shape
        # Map the data point through the scatter axes' data transform so it is
        # in the same coordinates as the event position.
        screenX, screenY = ax.transData.transform((point[0], point[1]))
        dx = screenX - event.x
        dy = screenY - event.y

        return np.sqrt(dx**2 + dy**2)

    def calcClosestDatapoint(XT, event):
        """Find the data point nearest to the mouse position.

        Args:
            XT (np.array) - coordinates with one point per column, i.e. shape (2, numPoints)
            event (MouseEvent) - mouse event (containing mouse position)
        Returns:
            smallestIndex (int) - column index in XT of the point closest to the mouse position
        """
        numPoints = XT.shape[1]
        screenDistances = []
        for col in range(numPoints):
            screenDistances.append(distance(XT[:, col], event))

        return np.argmin(screenDistances)

    def drawData(X, index):
        """Plot the lightcurve of the chosen point in the lower panel (ax2).

        Args:
            X - indexable collection of light curves; X[index][0] is plotted on
                the x axis and X[index][1] on the y axis
            index (int) - position of the light curve to draw; also used to look
                up its name in ``files``
        Returns:
            None
        """
        ax2.cla()

        x=X[index][0]
        y=X[index][1]

        # NaN-aware extrema: the builtin min/max give order-dependent results
        # as soon as y contains a NaN, and iterate over the array in Python.
        ymin = np.nanmin(y)
        ymax = np.nanmax(y)

        # Pad the data range by 10% in total ...
        axrange=0.55*(ymax-ymin)
        mid=(ymax+ymin)/2
        yaxmin = mid-axrange
        yaxmax = mid+axrange
        # ... but never show less than the 0.95 - 1.05 window. The comparisons
        # are written so that a NaN limit (all-NaN curve) falls back to it.
        ax2.set_ylim(yaxmin if yaxmin < .95 else .95,
                     yaxmax if yaxmax > 1.05 else 1.05)

        # Blue for files listed in files_cluster, red for everything else
        if files[index] in files_cluster:
            color = 'blue'
        else:
            color = 'red'
        ax2.plot(x, y, 'o',markeredgecolor='none', c=color, alpha=0.2)
        ax2.plot(x, y, '-',markeredgecolor='none', c=color, alpha=0.7)
        ax2.set_xlabel('Time (Days)',fontsize=22)
        ax2.set_ylabel(r'$\frac{\Delta F}{F}$',fontsize=30)

        # The first 13 characters of the file name serve as the figure title
        fig.suptitle(files[index][:13],fontsize=30)

        canvas.draw()

    def annotatePt(XT, index):
        """Highlight one point of the scatter with an arrow and red markers.

        The artists drawn here are stored as attributes on this function so
        that the next call can remove them before drawing new ones.

        Args:
            XT (np.array) - array of points, indexed as XT[index] -> (x, y)
            index (int) - index (into XT and files) of the item to highlight
        Returns:
            None
        """
        px, py = XT[index][0], XT[index][1]
        # Remove what the previous call drew, if anything
        if hasattr(annotatePt, 'label'):
            annotatePt.label.remove()
            annotatePt.emph.remove()
        if hasattr(annotatePt, 'emphCD'):
            annotatePt.emphCD.remove()

        # Arrow pointing at the point, plus a red marker on the point itself
        arrowStyle = dict(headlength=20,headwidth=20,width=6,shrink=.1,color='red')
        annotatePt.label = ax.annotate("", xy=(px, py), xytext=(px+10, py+10),
                                       arrowprops=arrowStyle)
        annotatePt.emph = ax.scatter(px, py, marker='o', s=50, c='red')
        # Files listed in files_cluster get a larger marker in the detail
        # panel (ax3); all others get a second marker on the scatter (ax).
        if files[index] in files_cluster:
            panel, markerSize = ax3, 150
        else:
            panel, markerSize = ax, 50
        annotatePt.emphCD = panel.scatter(px, py, marker='o', s=markerSize, c='red')
        canvas.draw()


    def onMouseClick(event, X):
        """Button-press handler: shows the lightcurve of the data point closest to the mouse.

        Args:
            event (MouseEvent) - the button press event
            X - the points, one per row; transposed for the nearest-point search
        """
        pointsByAxis = np.array(X.T) # one array per coordinate
        drawData(lightcurveData, calcClosestDatapoint(pointsByAxis, event))

    def onMouseRelease(event, X):
        """Button-release handler: highlights the data point closest to the mouse.

        Args:
            event (MouseEvent) - the button release event
            X - the points, one per row; transposed for the nearest-point search
        """
        pointsByAxis = np.array(X.T) # one array per coordinate
        nearest = calcClosestDatapoint(pointsByAxis, event)
        # annotatePt indexes by point, so it gets the untransposed array
        annotatePt(X, nearest)

    def connect(X):
        """(Re)attach the mouse press/release handlers for the point set X.

        The connection ids are kept as attributes on this function so that the
        handlers registered by an earlier call are disconnected first.
        """
        for cidName in ('cidpress', 'cidrelease'):
            if hasattr(connect, cidName):
                fig.canvas.mpl_disconnect(getattr(connect, cidName))

        def pressed(event):
            onMouseClick(event, X)

        def released(event):
            onMouseRelease(event, X)

        connect.cidpress = fig.canvas.mpl_connect('button_press_event', pressed)
        connect.cidrelease = fig.canvas.mpl_connect('button_release_event', released)

    def redraw():
        """Rebuild the scatter, density-detail and lightcurve panels.

        Clears all three axes, draws the outliers and the cluster density,
        reconnects the mouse handlers for ``data`` and finally highlights and
        plots the entry at ``tabbyInd``.
        """
        # Clear the existing plots
        ax.cla()
        ax2.cla()
        ax3.cla()
        # Set those labels
        ax.set_xlabel("T-SNE X",fontsize=18)
        ax.set_ylabel("T-SNE Y",fontsize=18)
        # Outliers as individual black points. No cmap here: with a single
        # named color there is nothing to map, so Matplotlib ignores it.
        ax.scatter(outX, outY,c="black",s=30)

        # Clustered points as log-scaled hexbin density, once in the overview
        # and once in the detail panel.
        ax.hexbin(clusterX,clusterY,mincnt=5,bins="log",cmap="inferno",gridsize=35)
        hb = ax3.hexbin(clusterX,clusterY,mincnt=5,bins="log",cmap="inferno",gridsize=35)
        # Attach the colorbar to the detail panel explicitly instead of relying
        # on whichever axes the figure considers current.
        fig.colorbar(hb, ax=ax3)

        ax3.set_title("Center Density Detail")
        ax3.set_xlabel("T-SNE X",fontsize=18)
        ax3.set_ylabel("T-SNE Y",fontsize=18)

        # NOTE(review): nothing visible in this notebook sets redraw.cidenter
        # or redraw.cidexit, so this branch looks unused - confirm before removing.
        if hasattr(redraw,'cidenter'):
            fig.canvas.mpl_disconnect(redraw.cidenter)
            fig.canvas.mpl_disconnect(redraw.cidexit)
        connect(data)

        annotatePt(data,tabbyInd)
        drawData(lightcurveData,tabbyInd)
        # canvas.draw() is the supported way to refresh the Tk canvas;
        # FigureCanvasTkAgg.show() was deprecated in Matplotlib 2.2 and later
        # removed, so the extra canvas.show() call is no longer made.
        canvas.draw()
    print("Plotting.")

    redraw() # First draw, Tabby plotted


    def quit():
        print("Exitting.")
        root.quit()
        root.destroy()

    Tk.Button(root, text="Quit", command=quit).pack()
else:
    print("Run cell above")
root.mainloop()