In [64]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
from numpy import genfromtxt
import numpy as np
import csv
from collections import Counter

In [65]:
training_data = genfromtxt('training.csv',dtype = str, delimiter=',')
testing_data = genfromtxt('testing.csv',dtype = str, delimiter=',')

In [72]:
train_labels = training_data[1:,-1]
train_labels = (train_labels == 's').astype(np.int32)
train_weights = training_data[1:,31].astype(np.float32)
train_data = training_data[1:,1:31]
train_data = train_data.astype(np.float32)

In [73]:
test_labels = testing_data[1:,-1]
test_labels = (test_labels == 's').astype(np.int32)
test_weights = testing_data[1:,31].astype(np.float32)
test_data = testing_data[1:,1:31]
test_data = test_data.astype(np.float32)

In [74]:
def normalize_data(data):
    scaler = StandardScaler()
    scaler.fit(data)
    data = scaler.transform(data)
    return data

def replace_missing_values(data):
    a, b = np.where(data == -999)
    new_r, new_c = np.where(data != -999)
    mean_list = np.mean(data[new_r][new_c], axis = 0)
    for i in range(len(a)):
        data[a[i]][data[a[i]] == -999] = mean_list[b[i]]
    x, y = np.where(data == -999)
    print(x,y)
    return data
    
def run_pca(data):
    pca = PCA(n_components=25)
    pca.fit(data)
    data = pca.transform(data)
    return data

In [69]:
train_data = replace_missing_values(train_data)
test_data = replace_missing_values(test_data)

train_data = normalize_data(train_data)
test_data = normalize_data(test_data)

train_data = run_pca(train_data)
test_data = run_pca(test_data)

print(train_data.shape)
print(test_data.shape)

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))
(250000, 30)
(618237, 30)


In [7]:
import math
def calc_ams(w,y,p):
    w = np.asarray(w)
    y = np.asarray(y)
    p = np.asarray(p)
    y_signal = w * (y == 1)
    y_background = w * (y == 0)
    s = np.sum(y_signal * (p == 1))
    b = np.sum(y_background * (p == 1))
    b_r=10.0
    a = np.sqrt( 2 * ((s + b + b_r) * math.log ( 1.0 + (s / (b + b_r) ) ) - s ))
    return a

In [8]:
from sklearn.metrics import precision_score, recall_score
def precision(labels, pred):
    precision = precision_score(labels,pred)
    return precision

def recall(labels, pred):
    recall = recall_score(labels,pred)
    return recall

In [9]:
from sklearn.linear_model import LogisticRegression
def run_lr(train_data,train_labels,test_data,test_labels,weights):
    clf = LogisticRegression(random_state=0, solver='lbfgs')
    clf = clf.fit(train_data, train_labels)
    preds = clf.predict_proba(test_data)[:,1]
    preds[preds<0.6] = 0
    preds[preds>=0.6] = 1
    print(Counter(preds))
    print("Logistic Regression:")
    print("Accuracy: ", clf.score(test_data,test_labels))
    print("AMS score: ", calc_ams(weights.astype(np.float32),test_labels,preds))
    print("Precision: ", precision(test_labels,preds))
    print("Recall: ", recall(test_labels,preds))

In [10]:
from sklearn.naive_bayes import GaussianNB

def run_gnb(train_data,train_labels,test_data,test_labels,weights):
    clf = GaussianNB()
    clf = clf.fit(train_data, train_labels)
    preds = clf.predict_proba(test_data)[:,1]
    preds[preds<0.6] = 0
    preds[preds>=0.6] = 1
    print("Gaussian Naive Bayes:")
    print("Accuracy: ", clf.score(test_data,test_labels))
    print("AMS score: ", calc_ams(weights.astype(np.float32),test_labels,preds))
    print("Precision: ", precision(test_labels,preds))
    print("Recall: ", recall(test_labels,preds))

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

def run_gradient_boosting(train_data,train_labels,test_data,test_labels,weights):
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
    clf = clf.fit(train_data, train_labels)
    preds = clf.predict(test_data)
    preds[preds<0.6] = 0
    preds[preds>=0.6] = 1
    print("Gradient Boosting Classifier:")
    print("Accuracy: ", clf.score(test_data,test_labels))
    print("AMS score: ", calc_ams(weights.astype(np.float32),test_labels,preds))
    print("Precision: ", precision(test_labels,preds))
    print("Recall: ", recall(test_labels,preds))

In [12]:
from sklearn.tree import DecisionTreeClassifier

def run_decision_tree(train_data,train_labels,test_data,test_labels,weights):
    clf = DecisionTreeClassifier(random_state=0)
    clf = clf.fit(train_data, train_labels)
    preds = clf.predict(test_data)
    preds[preds<0.6] = 0
    preds[preds>=0.6] = 1
    print("Decision Tree Classifier:")
    print("Accuracy: ", clf.score(test_data,test_labels))
    print("AMS score: ", calc_ams(weights.astype(np.float32),test_labels,preds))
    print("Precision: ", precision(test_labels,preds))
    print("Recall: ", recall(test_labels,preds))

In [13]:
from xgboost import XGBClassifier
def run_xgboost(train_data,train_labels,test_data,test_labels,weights):
    clf = XGBClassifier()
    clf = clf.fit(train_data, train_labels)
    preds = clf.predict(test_data)
    preds[preds<0.6] = 0
    preds[preds>=0.6] = 1
    print("XG Boost Classifier:")
    print("Accuracy: ", clf.score(test_data,test_labels))
    print("AMS score: ", calc_ams(weights.astype(np.float32),test_labels,preds))
    print("Precision: ", precision(test_labels,preds))
    print("Recall: ", recall(test_labels,preds))

In [32]:
run_lr(train_data,train_labels,test_data,test_labels,test_weights)

Counter({0.0: 529961, 1.0: 88276})
Logistic Regression:
('Accuracy: ', 0.73834629761725679)
('AMS score: ', 1.380013988818052)
('Precision: ', 0.72985862522089806)
('Recall: ', 0.30526969145629596)


In [15]:
run_gnb(train_data,train_labels,test_data,test_labels,test_weights)

Gaussian Naive Bayes:
('Accuracy: ', 0.62865535385297222)
('AMS score: ', 0.35172904297859114)
('Precision: ', 0.44565457207613773)
('Recall: ', 0.21055075430217574)


In [16]:
run_gradient_boosting(train_data,train_labels,test_data,test_labels,test_weights)

Gradient Boosting Classifier:
('Accuracy: ', 0.59139779728485997)
('AMS score: ', 0.25769452732689641)
('Precision: ', 0.36614809804489967)
('Recall: ', 0.26930767189750587)


In [17]:
run_decision_tree(train_data,train_labels,test_data,test_labels,test_weights)

Decision Tree Classifier:
('Accuracy: ', 0.56762212549556235)
('AMS score: ', 0.41932308538968016)
('Precision: ', 0.36444729309032026)
('Recall: ', 0.35831722386475628)


In [18]:
run_xgboost(train_data,train_labels,test_data,test_labels,test_weights)

XG Boost Classifier:
('Accuracy: ', 0.63250015770651058)
('AMS score: ', 0.31949904847195404)
('Precision: ', 0.44273737072817809)
('Recall: ', 0.29574141460086423)


In [None]:
# calculate Pearson's correlation
from scipy.stats import pearsonr
n = len(train_data[0])
dict1={}
for i in range(1,n-2):
    l= []
    for j in range(1,n-2):
        corr, _ = pearsonr(train_data[:,i], train_data[:,j])
        l.append(corr)
    dict1[training_data[0][i]]=l

dataframe = pd.DataFrame.from_dict(dict1, orient='index',columns=training_data[0][1:n-2])


#check for relations where there is 0.95 correlation
dict2={}
for key in dict1:
    n = len(dict1[key])
    for i in range(0,n):
        if dict1[key][i]>=0.95 and key!=training_data[0][i+1]:
            if key>training_data[0][i+1]:
                a = training_data[0][i+1]
                b = key
            else:
                a = key
                b = training_data[0][i+1]
            dict2[(a,b)]=dict1[key][i]

for key in dict2:
    print (key,dict2[key])

In [None]:
dataCsv = dataframe.to_csv(index=True)
f = open('correlation.csv','w+')
f.write(dataCsv)

In [None]:
# Plot of DER_mass_MC
import random
import numpy
from matplotlib import pyplot

filename='training.csv'
data = genfromtxt(filename,dtype = float, delimiter=',');
data1 = genfromtxt(filename,dtype = str, delimiter=',');
Mass = data[1:,1]
Y = data1[1:,32]
Y = np.array(Y)

Mass = np.array(Mass)
Msignal = Mass[Y=='s'];
Mbackground = Mass[Y=='b'];
bins = numpy.linspace(9, 500, 50)
pyplot.hist(Mbackground, bins, alpha=0.5, label='Background')
pyplot.hist(Msignal, bins, alpha=0.5, label='Signal')
pyplot.legend(loc='upper right')
pyplot.xlabel('DER_mass_MMC')
pyplot.ylabel('Number of events')
pyplot.show()

In [None]:
from sklearn.svm import SVC
clf = SVC(probability=True)
clf.fit(train_data[:200000], train_labels[:200000])
clf.score(train_data[200000:], train_labels[200000:])

In [None]:
#experimenting on different parameters of svm
from sklearn import svm
X,Y = train_data,train_labels
W = training_data[1:,31]
W = W.astype(np.float64)

X1 = X[:100000];
W1 = W[:100000];
Y1 = Y[:100000];
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
kernels = ['linear', 'rbf', 'poly']

print("\nExperimenting for different kernels")
for kernel in kernels:
    svc= svm.SVC(kernel=kernel).fit(X1,Y1)
#     plotSVC('kernel=' + str(kernel))
    svmScore = svc.score(X[100000:200000],Y[100000:200000])
    print("Kernel ", kernel," Svm score for test data:",svmScore)
    
print("\nExperimenting for different gammas")
for gamma in gammas:
    svc = svm.SVC(kernel='rbf', gamma=gamma).fit(X, Y)
    svmScore = svc.score(X[100000:200000],Y[100000:200000])
#     plotSVC(‘gamma=’ + str(gamma))
    print("Gamma:", gamma," Svm score for test data:",svmScore)
    
print("\nExperimenting for different Regularization parameters")
for c in Cs:
    svc = svm.SVC(kernel='rbf', C=c).fit(X, Y)
    svmScore = svc.score(X[100000:200000],Y[100000:200000])
#     plotSVC(‘C=’ + str(c))
    print("C:", c," Svm score for test data:",svmScore)

print("\nExperimenting for different Degrees for polynomial kernel")
degrees = [0, 1, 2, 3, 4]
for degree in degrees:
    svc = svm.SVC(kernel='poly', degree=degree).fit(X, Y)
    svmScore = svc.score(X[100000:200000],Y[100000:200000])
    print("Degree ", degree," Svm score for test data:",svmScore)
    

In [75]:
def process_data_jet_num(train_data,train_labels,num,weights):
    print(Counter(train_data[:,22]))
    if num == 0:
        data = train_data[train_data[:,22] == 0]
        data = np.delete(data,[4,5,6,12,22,23,24,25,26,27,28],1)
        labels = train_labels[np.where(train_data[:,22] == 0)]
        weights = weights[np.where(train_data[:,22] == 0)]
    elif num == 1:
        data = train_data[train_data[:,22] == 1]
        data = np.delete(data,[4,5,6,12,26,27,28,22],1)
        labels = train_labels[np.where(train_data[:,22] == 1)]
        weights = weights[np.where(train_data[:,22] == 1)]
    else:
        data_2 = train_data[train_data[:,22] == 2]
        data_2 = np.delete(data_2,[22],1)
        labels_2 = train_labels[np.where(train_data[:,22] == 2)]
        labels_2_weights = weights[np.where(train_data[:,22] == 2)]

        data_3 = train_data[train_data[:,22] == 3]
        data_3 = np.delete(data_3,[22],1)
        labels_3 = train_labels[np.where(train_data[:,22] == 3)]
        labels_3_weights = weights[np.where(train_data[:,22] == 3)]
        
        data = np.concatenate((data_2,data_3),axis=0)
        labels = np.concatenate((labels_2,labels_3),axis=0)
        weights = np.concatenate((labels_2_weights,labels_3_weights),axis=0)
    return data, labels, weights

In [76]:
data_jet_num_0, data_jet_num_0_labels, data_jet_num_0_weights = process_data_jet_num(train_data,train_labels,0,train_weights)
data_jet_num_0 = normalize_data(data_jet_num_0)


data_jet_num_1, data_jet_num_1_labels, data_jet_num_1_weights = process_data_jet_num(train_data,train_labels,1,train_weights)
data_jet_num_1 = normalize_data(data_jet_num_1)

data_jet_num_2_3, data_jet_num_2_3_labels, data_jet_num_2_3_weights = process_data_jet_num(train_data,train_labels,2,train_weights)
data_jet_num_2_3 = normalize_data(data_jet_num_2_3)

data_jet_num_0_test, data_jet_num_0_test_labels, data_jet_num_0_test_weights = process_data_jet_num(test_data,test_labels,0,test_weights)
data_jet_num_0_test = normalize_data(data_jet_num_0_test)

data_jet_num_1_test, data_jet_num_1_test_labels, data_jet_num_1_test_weights = process_data_jet_num(test_data,test_labels,1,test_weights)
data_jet_num_1_test = normalize_data(data_jet_num_1_test)

data_jet_num_2_3_test, data_jet_num_2_3_test_labels, data_jet_num_2_3_test_weights = process_data_jet_num(test_data,test_labels,2,test_weights)
data_jet_num_2_3_test = normalize_data(data_jet_num_2_3_test)


run_lr(data_jet_num_0,data_jet_num_0_labels,data_jet_num_0_test,data_jet_num_0_test_labels,data_jet_num_0_test_weights)
run_lr(data_jet_num_1,data_jet_num_1_labels,data_jet_num_1_test,data_jet_num_1_test_labels,data_jet_num_1_test_weights)
run_lr(data_jet_num_2_3,data_jet_num_2_3_labels,data_jet_num_2_3_test,data_jet_num_2_3_test_labels,data_jet_num_2_3_test_weights)

Counter({0.0: 99913, 1.0: 77544, 2.0: 50379, 3.0: 22164})
Counter({0.0: 99913, 1.0: 77544, 2.0: 50379, 3.0: 22164})
Counter({0.0: 99913, 1.0: 77544, 2.0: 50379, 3.0: 22164})
Counter({0.0: 247404, 1.0: 190799, 2.0: 124769, 3.0: 55265})
Counter({0.0: 247404, 1.0: 190799, 2.0: 124769, 3.0: 55265})
Counter({0.0: 247404, 1.0: 190799, 2.0: 124769, 3.0: 55265})
Counter({0.0: 212705, 1.0: 34699})
Logistic Regression:
('Accuracy: ', 0.82562933501479363)
('AMS score: ', 1.6537014145576556)
('Precision: ', 0.75466151762298628)
('Recall: ', 0.41919062560030734)
Counter({0.0: 163531, 1.0: 27268})
Logistic Regression:
('Accuracy: ', 0.71278151352994512)
('AMS score: ', 0.65372817682042028)
('Precision: ', 0.67041953938682708)
('Recall: ', 0.2688659126674805)
Counter({0.0: 125856, 1.0: 54178})
Logistic Regression:
('Accuracy: ', 0.73471677572014171)
('AMS score: ', 0.77862687011889753)
('Precision: ', 0.78496806821957255)
('Recall: ', 0.5276754141075749)


In [None]:
def 