In [76]:
## for data
import pandas as pd
import numpy as np
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for statistical tests
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
## for machine learning
from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition
from sklearn.preprocessing import MinMaxScaler
## for explainer
from lime import lime_tabular
import seaborn as sns

In [3]:
df = pd.read_csv('training_set.csv')
df.head()

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [65]:
meta = pd.read_csv('training_set_metadata.csv')
meta.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [66]:
targets = meta['target']

Columns: object id, Min Flux, Max Flux, Mean Flux, Mean Flux error, Min Flux error, Max Flux error, Mean Passband, Mean detected, target

In [69]:
##checking that id columns in training and metadata match before assigning targets
ids = df['object_id'].unique()
(meta['object_id']==ids).value_counts()

True    7848
Name: object_id, dtype: int64

In [51]:
#feature processing
res = []
j = 0
for i in ids:
    data = df[df['object_id'] == i]
    aggdata = data.agg({'passband':['mean'], 'flux': ['min','max','mean'], 'flux_err': ['min','max','mean'], 'detected':['mean']})
    unstacked = aggdata.unstack().dropna().array
    res.append(unstacked)
    j+=1
    if j%1000 == 0:
        print(j/len(ids)*100)

12.7420998980632
25.4841997961264
38.226299694189606
50.9683995922528
63.71049949031601
76.45259938837921
89.1946992864424


In [71]:
#wrangling dataframe
training_set = pd.DataFrame(res)
training_set['object_id'] = ids
training_set['target'] = targets
training_set.columns = ['passband_mean','flux_mean','flux_min','flux_max','flux_err_mean','flux_err_min','flux_err_max','detected_mean','object_id','target']
training_set = training_set.iloc[:,[8,0,1,2,3,4,5,6,7,9]]
training_set

Unnamed: 0,object_id,passband_mean,flux_mean,flux_min,flux_max,flux_err_mean,flux_err_min,flux_err_max,detected_mean,target
0,615,2.457386,-123.096998,-1100.440063,660.626343,4.482743,2.130510,12.845472,0.946023,92
1,713,2.400000,-1.423351,-14.735178,14.770886,2.359620,0.639458,9.115748,0.171429,88
2,730,2.336364,2.267434,-19.159811,47.310059,2.471061,0.695106,11.281384,0.069697,42
3,745,2.378917,8.909206,-15.494463,220.795212,2.555576,0.567170,55.892746,0.173789,90
4,1124,2.457386,7.145702,-16.543753,143.600189,2.753004,0.695277,11.383690,0.173295,90
...,...,...,...,...,...,...,...,...,...,...
7843,130739978,2.993197,6.786007,-105.375282,517.602478,16.391897,1.700991,72.230759,0.034014,65
7844,130755807,3.149123,16.466672,-69.036392,363.402466,18.477200,2.326328,71.215874,0.052632,90
7845,130762946,2.977612,-15.308645,-135.602631,169.916672,27.723353,5.974091,79.265930,0.052239,16
7846,130772921,3.138889,4.440758,-51.927830,322.255371,13.657375,1.286931,59.214134,0.020833,65


In [82]:
#scale data
scale = MinMaxScaler()
df = training_set
df[['passband_mean','flux_mean','flux_min','flux_max','flux_err_mean','flux_err_min','flux_err_max','detected_mean']] = scale.fit_transform(df[['passband_mean','flux_mean','flux_min','flux_max','flux_err_mean','flux_err_min','flux_err_max','detected_mean']])
df

Unnamed: 0,object_id,passband_mean,flux_mean,flux_min,flux_max,flux_err_mean,flux_err_min,flux_err_max,detected_mean,target
0,615,0.104004,0.253477,0.998599,0.000580,0.000022,0.005095,1.692928e-06,0.945714,92
1,713,0.054688,0.254123,0.999543,0.000315,0.000002,0.000537,2.344565e-08,0.166694,88
2,730,0.000000,0.254143,0.999539,0.000328,0.000003,0.000707,9.928179e-07,0.064381,42
3,745,0.036570,0.254178,0.999542,0.000400,0.000003,0.000316,2.096156e-05,0.169068,90
4,1124,0.104004,0.254169,0.999541,0.000368,0.000005,0.000708,1.038612e-06,0.168571,90
...,...,...,...,...,...,...,...,...,...,...
7843,130739978,0.564466,0.254167,0.999464,0.000522,0.000136,0.003782,2.827470e-05,0.028494,65
7844,130755807,0.698465,0.254218,0.999496,0.000458,0.000155,0.005693,2.782043e-05,0.047218,90
7845,130762946,0.551073,0.254050,0.999438,0.000379,0.000244,0.016843,3.142376e-05,0.046823,16
7846,130772921,0.689670,0.254154,0.999510,0.000441,0.000109,0.002516,2.244826e-05,0.015238,65


In [86]:
#converting labels to string values -- categories
df[['object_id','target']] = df[['object_id','target']].astype(str)
type(df['object_id'][0])

str

In [87]:
#preparing training and testing sets
X = df.drop(['object_id','target'], axis=1)
y = df['target']
from sklearn.model_selection import train_test_split
# implementing train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=66)

In [90]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [92]:
np.mean(y_pred == y_test)

0.601427115188583

With preliminary Random Forest Classification, the algorithm correctly predicts the target 60% of the time. Since the are 17 possible categories of target, there is only a 5.88% chance of a correct prediction due to random chance