In [1]:
#I just load in everything, delete stuff if you want
import random
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold,RepeatedKFold, GridSearchCV,  RandomizedSearchCV
import math
from data import *
import pandas as pd
from itertools import islice
import seaborn as sns; sns.set_theme()
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

TODO (ordered by importance):
- Create separate models for each system type
- Add SNR to the images?
- Stratify the train/test split so that it maintains the class distribution
- Cross-validation
- Try out more multivariate regression (because the targets are continuous and multi-dimensional)

In [None]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): absolute, machine-specific path -- point this at your local copy.
DATA_PATH = '/Users/lukemcdermott/Desktop/Physics/spectral_templates_data_version_june20.h5'
N_FEATURES = 441      # leading columns fed to the model as inputs
N_SAMPLES = 35000     # upper bound on the number of rows drawn for this experiment
SEED = 0              # fixed so the random sample below is reproducible

random.seed(SEED)
np.random.seed(SEED)

# --- Load --------------------------------------------------------------------
# Read the table once: `bin_df` keeps the unfiltered frame, `df` only the rows
# with primary_type <= secondary_type (boolean .loc returns a new frame).
bin_df = pd.read_hdf(DATA_PATH, key = '/binaries')
df = bin_df.loc[bin_df['primary_type'] <= bin_df['secondary_type']]
display(df)

#x = np.insert(np.arange(N_FEATURES), 0, -1)   #Uncomment to add system type
x = np.arange(N_FEATURES)
images = df.iloc[:, x].to_numpy()
# The two columns right after the features are used as the (primary, secondary)
# label pair -- presumably the primary_type / secondary_type columns; verify.
labels = df.iloc[:, N_FEATURES:N_FEATURES + 2].to_numpy()

#Convert 2D labels into 1D Discrete Classes: 24*(primary-16) + (secondary-16)
# Vectorised form of the per-row int() truncation; kept as float like before.
labels_flat = (24 * (labels[:, 0] - 16).astype(int)
               + (labels[:, 1] - 16).astype(int)).astype(float)

# --- Subsample ---------------------------------------------------------------
# Draw without replacement, never asking for more rows than are available
# (np.random.choice raises ValueError otherwise).
n_draw = min(N_SAMPLES, len(images))
idx = np.random.choice(np.arange(len(images)), n_draw, replace=False)
images_sample = images[idx]
labels_sample = labels_flat[idx]

# Split into training and testing sets
# NOTE(review): split_data comes from `from data import *`; the return order
# used here is taken as-is from the original call -- confirm against data.py.
train_images, train_labels, test_images, test_labels = split_data(images_sample, labels_sample)

Regression Models

In [None]:
# Logistic-regression classifier on the flattened (1D) class labels.
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(solver='lbfgs', max_iter=1000).fit(train_images, train_labels)

# Last expression of the cell: score on the held-out test split.
clf.score(test_images, test_labels)

Testing 1D Discrete Models

In [None]:
def unflatten(flat):
    """Invert the 1D class encoding ``24*(primary-16) + (secondary-16)``.

    Parameters
    ----------
    flat : array-like of flattened class ids.

    Returns
    -------
    Integer array of shape (len(flat), 2); column 0 is the primary type,
    column 1 the secondary type.
    """
    flat = np.asarray(flat)
    return np.column_stack((flat // 24 + 16, flat % 24 + 16)).astype(int)


outputs_flat = clf.predict(test_images)

# Decode predictions and ground truth with the same helper so the two can
# never drift apart (previously two copy-pasted element-wise loops).
outputs = unflatten(outputs_flat)   # predicted (primary, secondary)
lab = unflatten(test_labels)        # true (primary, secondary)

diff = lab - outputs

#RMSE loss (square root of the mean squared error), per component
print('primary loss', math.sqrt(sklearn.metrics.mean_squared_error(lab[:,0], outputs[:,0])))
print('secondary loss', math.sqrt(sklearn.metrics.mean_squared_error(lab[:,1], outputs[:,1])))

In [None]:
# Group the predicted (primary, secondary) pairs by their TRUE pair.
# The previous try/except created the list on first sight of a key but never
# appended to it, silently dropping the first prediction of every pair (and
# reporting pairs with a single test sample as having none).
predictions = {}
for (true_p, true_s), val in zip(lab, outputs):
    # Plain ints hash/compare equal to the numpy ints, so lookups such as
    # predictions[(16, 20)] behave exactly as before.
    predictions.setdefault((int(true_p), int(true_s)), []).append(val)

# Store each group as an (n, 2) array; every key has at least one row now.
predictions = {pair: np.array(vals) for pair, vals in predictions.items()}

# Mean predicted (primary, secondary) per true pair. Pairs that never occur
# in the test set are simply absent from both dicts.
mean_pred = {pair: np.mean(vals, axis = 0) for pair, vals in predictions.items()}

In [None]:
def plot_pred(p, s):
    """Histogram the predicted types for test samples whose true pair is (p, s).

    Reads the module-level ``predictions`` dict (true pair -> predicted pairs)
    and overlays two histograms: predicted primary and predicted secondary.

    Parameters
    ----------
    p, s : true primary / secondary type codes used as the dict key.

    Raises
    ------
    KeyError
        If ``(p, s)`` is not a key of ``predictions``.
    """
    results = np.array(predictions[(p,s)])
    if results.size == 0:
        # Nothing to draw; indexing results[:, 0] would fail on an empty array.
        print('No test values for:', (p, s))
        return

    # One bin per integer type code 16..39, centred on the integer so every
    # bar sits on its tick. The old range=[16,40] with bins=23 made bins
    # 24/23 wide, which put codes 16 and 17 into the same bar.
    edges = np.arange(16, 41) - 0.5

    plt.figure(facecolor = 'white')
    plt.xticks(np.arange(16,40))
    plt.hist(results[:,0], bins = edges, color = 'blue', alpha = .5, label = 'Primary Prediction')
    plt.hist(results[:,1], bins = edges, color = 'red', alpha = .5, label = 'Secondary Prediction')
    plt.xlabel('Predicted type')
    plt.ylabel('Count')
    plt.title(f'Predictions for true primary = {p}, secondary = {s}')
    plt.legend()
    plt.show()

def acc_map(results, title = 'Secondary Prediction Accuracy', annotation = False):
    """Draw a primary-vs-secondary heatmap of ``results``.

    Parameters
    ----------
    results : 2D array; rows are labelled as primary type, columns as
        secondary type. The 23 tick labels (M6..T8) assume a 23 x 23 input.
    title : figure title.
    annotation : forwarded to seaborn's ``annot`` (write values in the cells).
    """
    type_names = ['M6','M7','M8','M9','L0','L1','L2','L3','L4','L5','L6','L7','L8','L9','T0','T1','T2','T3','T4','T5','T6','T7','T8']

    plt.figure(figsize = (8,7), facecolor='white')
    # Hand the labels to seaborn so it places one tick per label. Calling
    # set_xticklabels afterwards relies on seaborn's automatic tick thinning
    # having kept all 23 ticks, and mismatches otherwise.
    ax = sns.heatmap(results, cmap = "mako", annot = annotation,
                     xticklabels = type_names, yticklabels = type_names)

    # Black frame around the map (numeric width instead of the string '10').
    ax.patch.set_edgecolor('black')
    ax.patch.set_linewidth(10)

    ax.set_ylabel('Primary')
    ax.set_xlabel('Secondary')
    ax.set_title(title)

In [None]:
# Mean predicted primary / secondary type for every true (primary, secondary)
# pair with primary <= secondary; grid index = type code - 16.
pri_mean = np.zeros((23,23))
sec_mean = np.zeros((23,23))

for primary in range(16,39):
    for secondary in range(primary, 39):
        # Explicit lookup instead of a bare `except:` so that only a missing
        # pair is tolerated and real errors still surface.
        mu = mean_pred.get((primary, secondary))
        if mu is None:
            continue    # no test samples for this pair: cell stays 0
        pri_mean[primary-16,secondary-16] = mu[0]
        sec_mean[primary-16,secondary-16] = mu[1]

acc_map(pri_mean, 'Mean of Primary Predictions', annotation = True)
acc_map(sec_mean, 'Mean of Secondary Predictions', annotation = True)

In [None]:
# Absolute gap between the mean prediction and the true type, per true
# (primary, secondary) pair with primary <= secondary; grid index = code - 16.
pri_mean = np.zeros((23,23))
sec_mean = np.zeros((23,23))

for primary in range(16,39):
    for secondary in range(primary, 39):
        # Explicit lookup instead of a bare `except:` so that only a missing
        # pair is tolerated and real errors still surface.
        mu = mean_pred.get((primary, secondary))
        if mu is None:
            # No test samples for this pair: cell stays 0. Note that 0 here
            # is indistinguishable from a perfect mean prediction.
            continue
        gap = np.abs(mu - (primary, secondary))
        pri_mean[primary-16,secondary-16] = gap[0]
        sec_mean[primary-16,secondary-16] = gap[1]

acc_map(pri_mean, 'Mean Difference of Primary Predictions')
acc_map(sec_mean, 'Mean Difference of Secondary Predictions')