In [3]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import torch.utils.data as utils
from mpl_toolkits.mplot3d import Axes3D

In [38]:
def get_real_data(n_events):
    '''
    read from csv files
    returns n_events of each class
    '''
    qcd_data_original = np.genfromtxt('../HiggsReconstruction/EventPlotting/qcd_outputDataForLearning.csv', skip_header=1, delimiter=",")
    hh_data_original  = np.genfromtxt('../HiggsReconstruction/EventPlotting/dihiggs_outputDataForLearning.csv', skip_header=1, delimiter=",")
    
    # # getting rid of "isMatchable" columns and "Btag Jet1234" columns
    # qcd_data_original = np.delete(qcd_data_original, [17, 53, 52, 51, 50], 1)
    # hh_data_original  = np.delete(hh_data_original,  [17, 53, 52, 51, 50], 1)
    
    qcd_data_original = qcd_data_original[:n_events,:]
    hh_data_original  =  hh_data_original[:n_events,:]
    
    
    return hh_data_original, qcd_data_original

def add_labels_and_data(hh_data, qcd_data, n_rows, n_columns, iteration=None):
    '''
    create label column for qcd (0) and hh (1)
    select certain columns of interest - iteration is a list of column indexes
    returns one np array of the concatenation of the 2 datasets. Add m columns
    of zeros where m = (n_rows*n_cols)-n_features. 
    When iteration is true, assumes iteration includes labels.
    '''
    n_features = len(iteration)-1
    m = (n_rows*n_columns)-n_features
#     print("m =", m)
    
    # generate labels for hh and qcd
    hh_labels= np.ones((len(hh_data),1))
    hh_data_original = np.append(hh_data, hh_labels, axis=1)

    qcd_labels= np.zeros((len(qcd_data),1))
    qcd_data_original = np.append(qcd_data, qcd_labels, axis=1)
    
    # add all data together
    all_data_original = np.append(hh_data_original, qcd_data_original, axis=0)
    
    if iteration is not None:
        all_data = all_data_original[:,iteration]
#         print("m in loop", m)
        print(np.shape(all_data))
        for i in range(m):
            all_data = np.insert(all_data, -1, np.zeros([1,1]), axis=1)
            print(np.shape(all_data))
    else:
        all_data = all_data_original
    
    return all_data



In [39]:
# image size
image_rows = 3
image_cols = 3
channels   = 1

hh, qcd = get_real_data(1700)
all_data = add_labels_and_data(hh, qcd, image_rows, image_cols, iteration=[1,2,6,7,8,9,10,11,54])


y = all_data[:,-1]
X    = all_data[:,:-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9)

# change input dimensions so it has 4 dimension tensor instead of 2
X_train     = np.reshape(X_train, (X_train.shape[0],image_rows,image_cols,channels))
X_test      = np.reshape(X_test,   (X_test.shape[0],image_rows,image_cols,channels))




(3400, 9)
(3400, 10)
