In [20]:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import numpy as np
import matplotlib.pyplot as pl
import pandas as pd
import astropy
import json
from sklearn import datasets
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from collections import Counter

%matplotlib inline

In [6]:
# Load the per-image measurement table, then discard the columns that are not
# used for learning: the sky coordinates (hopefully irrelevant, and poorly
# formatted for ML) together with the 'id' and 'parent' columns.
params = pd.read_csv("ML_param.csv")
for unused_column in ['coord_dec', 'coord_ra', 'id', 'parent']:
    del params[unused_column]

In [9]:
# Retrieve the training-set classifications and keep only the rows that come
# from the "Difference Imaging Classifier" workflow.
csv_path = "../classifications/lsst_run_one.csv"
clsfn = pd.read_csv(csv_path)
from_diff_imaging = clsfn['workflow_name'] == "Difference Imaging Classifier"
subset = clsfn.loc[from_diff_imaging]

# Collect one [image number, classification] pair per classification row.
im_class = []
for row_index, row in subset.iterrows():
    # subject_data is JSON: unwrap the value stored under its first key, then
    # read the value stored under the first key of that inner object.
    subject = json.loads(row.subject_data)
    subject = subject.get(list(subject.keys())[0])
    image_field = subject.get(list(subject.keys())[0])
    # Keep characters 47 .. -4; presumably this strips a fixed-length path
    # prefix and the file extension, leaving the image number -- TODO confirm.
    image_number = image_field[47:-4]
    # Only the first annotation is read; its 'value' entry is the label.
    first_annotation = json.loads(row.annotations)[0]
    label = first_annotation['value']
    im_class.append([int(image_number), label])
# Order the pairs by image number (ties broken by the label).
im_class.sort()

In [10]:
# Build a single data frame holding, for every classified image, its
# classification next to its measured quantities. A data frame is a more
# cohesive object for the analysis than separate lists.

# One (initially empty) list per output column: every column of `params`
# plus the classification label.
d = {column: [] for column in params.keys()}
d["Classification"] = []

# Measurements keyed by image number: {image: {column: value, ...}, ...}
im_data = params.set_index('image').T.to_dict()

for image_number, label in im_class:
    d["Classification"].append(label)
    d["image"].append(image_number)
    measurements = im_data.get(image_number)
    for column in measurements:
        d[column].append(measurements.get(column))

df = pd.DataFrame(d)

In [11]:
# Choose the columns used for learning: everything except the image number
# and the label we predict on.
target = "Classification"  #Predict on Classifications
non_feature_columns = ["image", target]
features = [c for c in df.columns.tolist() if c not in non_feature_columns]

# Cut 'useless' and problematic features from the data frame itself.
for feature in features:
    if "flag" in feature:
        # Flags don't contribute to ML based on initial testing. Checked
        # first because it needs no numeric work on the column.
        del df[feature]
    elif df[feature].isnull().all():
        # Entirely missing: there is nothing to impute a median from.
        del df[feature]
    elif np.isinf(df[feature].astype(float)).any():
        # Any infinite entry (+inf or -inf) makes the classifier reject the
        # input. Testing the column mean against +inf alone would miss -inf
        # and columns mixing both signs (whose mean is NaN).
        del df[feature]

# Re-read the surviving feature columns after the deletions above.
features = [c for c in df.columns.tolist() if c not in non_feature_columns]

In [12]:
# Fill the remaining missing values with the median of their column; the
# imputer is fitted and applied on the full feature table in one step.
imp = Imputer(missing_values='NaN', strategy='median', axis=0, verbose = 1)
features_imp = imp.fit_transform(df[features])

In [13]:
#Random Forest fitted on every classified image (imputed features).
# Seeded so that the fitted trees, and the feature importances read from
# them, are identical on every Restart & Run All; the seed matches the one
# passed to df.sample for the train/test split.
RFC = RandomForestClassifier(random_state=1)
RFC = RFC.fit(features_imp, df[target])

In [15]:
# Pair each importance score of the fitted forest with its feature name.
# sorted(feature_importance, reverse=True) ranks them, most important first;
# df.corr() can be used to inspect correlations between the columns.
feature_importance = list(zip(RFC.feature_importances_, features))

In [16]:
#Split into Training Set and Testing Set
# 80% of the rows (seeded sample) are used for training; the rows whose index
# was not sampled form the test set.
train = df.sample(frac=0.8, random_state=1)
test = df.loc[~df.index.isin(train.index)]

# Learn the per-column medians from the training rows only.
imp_tt = Imputer(missing_values='NaN', strategy='median', axis=0, verbose = 1)
imp_tt.fit(train[features])
features_imp_tt = imp_tt.transform(train[features])

#Random Forest Training/Test Split
# Fit the NEW estimator. Calling RFC.fit here would refit the full-data model
# in place and leave RFC_tt as just another name for RFC.
# Seeded (same value as the split) so predictions are reproducible.
RFC_tt = RandomForestClassifier(random_state=1)
RFC_tt = RFC_tt.fit(features_imp_tt, train[target])

In [17]:
# Impute the test rows with the medians learned from the TRAINING rows.
# Fitting a separate imputer on the test set would fill gaps with different
# values than the model was trained on, and could even produce a different
# number of columns, because the imputer discards columns that are entirely
# missing in the data it is fitted on.
imp_test = imp_tt  # name kept so existing references to imp_test still work
features_imp_test = imp_test.transform(test[features])

# Predict a classification for every held-out row.
predictions = RFC_tt.predict(features_imp_test)

In [18]:
for index in range(len(predictions)):
    print predictions[index],",",test["Classification"].tolist()[index]

In [19]:
for index in range(len(predictions)):
    if predictions[index] != test["Classification"].tolist()[index]:
        print predictions[index],",",test["Classification"].tolist()[index],",",test["image"].tolist()[index]

In [21]:
pred_mapping = {}
for index in range(len(predictions)):
    pred_name = predictions[index]
    class_name = test["Classification"].tolist()[index]
    if pred_name in pred_mapping:
        pred_mapping[pred_name].append(class_name)
    else:
        pred_mapping[pred_name] = [class_name]
        
for pred_name, pred_value in pred_mapping.iteritems():
    print pred_name, Counter(pred_value)

In [23]:
agree=0
disagree=0
for index in range(len(predictions)):
    if predictions[index] == test["Classification"].tolist()[index]:
        agree+=1
    else:
        disagree+=1
print "Agree: "+ str(agree), "Disagree: "+ str(disagree)