In [1]:
# Load libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
import time

In [2]:
def get_results(tar_test, pred, train_time, pred_time):
    '''
    Function used to return the analysis results, and the required time as a list
    for better storing and referencing in the various cases.
    The times are returned using gmtime, for better printing
    '''
    #calculate metrics
    acc = metrics.accuracy_score(tar_test, pred)
    prec = metrics.precision_score(tar_test, pred)
    f1 = metrics.f1_score(tar_test, pred)

    return [acc, prec, f1, time.gmtime(train_time), time.gmtime(pred_time)]

In [3]:
#define a list with the column names
col_names = ["class_label", "lepton_pT", "lepton_eta", "lepton_phi", "missing_energy_magnitude", "missing_energy_phi", "jet_1_pt", "jet_1_eta", "jet_1_phi", "jet_1_b-tag", "jet_2_pt", "jet_2_eta", "jet_2_phi", "jet_2_b-tag", "jet_3_pt", "jet_3_eta", "jet_3_phi", "jet_3_b-tag", "jet_4_pt", "jet_4_eta", "jet_4_phi", "jet_4_b-tag", "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "m_wbb", "m_wwbb"]

#read the scv 
higgs = pd.read_csv('HIGGS.csv', header=None, names=col_names)

In [4]:
#show the first 5 lines of the dataframe
higgs.head(5)

Unnamed: 0,class_label,lepton_pT,lepton_eta,lepton_phi,missing_energy_magnitude,missing_energy_phi,jet_1_pt,jet_1_eta,jet_1_phi,jet_1_b-tag,...,jet_4_eta,jet_4_phi,jet_4_b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [5]:
print("number of rows: ", len(higgs.index))

number of rows:  11000000


In [6]:
#define the feature and the target 
feat_cols = col_names[1:]
tar_col = col_names[0]

#get the featutre and the target dataframes
feats = higgs[feat_cols]
tar = higgs[tar_col]

In [7]:
#define the test and train dataset
#we assume 25% is the test size, the rest is for training
feat_train, feat_test, tar_train, tar_test = train_test_split(feats, tar, test_size=0.25, random_state=10)

In [8]:
depths = [5, 10, 15, 20]

decision_tree_results = []
for depth in depths:
    #initialize the Decision Tree Classifier
    tree_clf = DecisionTreeClassifier(max_depth=depth)

    #to calculate time spent to train the model
    start_time = time.time()
    #train the classifier
    tree_clf = tree_clf.fit(feat_train, tar_train)

    inter_time = time.time()
    train_time = inter_time-start_time
    #compute the predictions
    pred = tree_clf.predict(feat_test)

    pred_time = time.time()-inter_time

    #save them to a list for better referencing
    tree_results = get_results(tar_test, pred, train_time, pred_time)
    tree_results.append(depth)
    decision_tree_results.append(tree_results)

In [9]:
for i in range(len(decision_tree_results)):
    tree_result = decision_tree_results[i]
    depth = depths[i]

    print('==============================================================')
    print('maximun depth : ', tree_result[5])
    #print metrics
    print("Decision Tree Accuracy : {:5.3f}".format(tree_result[0]))
    print("Decision Tree Precision : {:5.3f}".format(tree_result[1]))
    print("Decision Tree F1 Score : {:5.3f}".format(tree_result[2]))

    #print the time needed
    print("Time spent training the Decision Tree classifier : ", time.strftime("%H:%M:%S", tree_result[3]))
    print("Time spent calcuating the Decision Tree predictions : ", time.strftime("%H:%M:%S", tree_result[4])) 

maximun depth :  5
Decision Tree Accuracy : 0.664
Decision Tree Precision : 0.676
Decision Tree F1 Score : 0.690
Time spent training the Decision Tree classifier :  00:03:39
Time spent calcuating the Decision Tree predictions :  00:00:01
maximun depth :  10
Decision Tree Accuracy : 0.705
Decision Tree Precision : 0.721
Decision Tree F1 Score : 0.721
Time spent training the Decision Tree classifier :  00:06:05
Time spent calcuating the Decision Tree predictions :  00:00:00
maximun depth :  15
Decision Tree Accuracy : 0.719
Decision Tree Precision : 0.734
Decision Tree F1 Score : 0.735
Time spent training the Decision Tree classifier :  00:08:33
Time spent calcuating the Decision Tree predictions :  00:00:00
maximun depth :  20
Decision Tree Accuracy : 0.711
Decision Tree Precision : 0.726
Decision Tree F1 Score : 0.728
Time spent training the Decision Tree classifier :  00:10:31
Time spent calcuating the Decision Tree predictions :  00:00:01


In [10]:
number_of_estimators = [5, 10, 15, 20]

random_forest_results = []
for estimators in number_of_estimators:
    #initialize the Random Forest Classifier with the specified number of estimators
    rf_clf = RandomForestClassifier(n_estimators=estimators)

    #to calculate time spent to train the model
    start_time = time.time()
    #train the classifier
    rf_clf = rf_clf.fit(feat_train, tar_train)

    inter_time = time.time()
    train_time = inter_time-start_time
    #compute the predictions
    pred = rf_clf.predict(feat_test)

    ppred_time = time.time()-inter_time

    #save them to a list for better referencing
    rf_results = get_results(tar_test, pred, train_time, pred_time)
    rf_results.append(estimators)
    random_forest_results.append(rf_results)

In [11]:
for rf_result in random_forest_results:
    print('==============================================================')
    print('number of estimators : ', rf_result[5])
    #print metrics
    print("Random Forest Accuracy : {:5.3f}".format(rf_result[0]))
    print("Random Forest Precision : {:5.3f}".format(rf_result[1]))
    print("Random Forest F1 Score : {:5.3f}".format(rf_result[2]))

    #print the time needed
    print("Time spent training the Random Forest classifier : ", time.strftime("%H:%M:%S", rf_result[3]))
    print("Time spent calcuating the Random Forest predictions : ", time.strftime("%H:%M:%S", rf_result[4])) 

number of estimators :  5
Random Forest Accuracy : 0.705
Random Forest Precision : 0.720
Random Forest F1 Score : 0.722
Time spent training the Random Forest classifier :  00:09:02
Time spent calcuating the Random Forest predictions :  00:00:01
number of estimators :  10
Random Forest Accuracy : 0.721
Random Forest Precision : 0.762
Random Forest F1 Score : 0.723
Time spent training the Random Forest classifier :  00:18:40
Time spent calcuating the Random Forest predictions :  00:00:01
number of estimators :  15
Random Forest Accuracy : 0.734
Random Forest Precision : 0.746
Random Forest F1 Score : 0.751
Time spent training the Random Forest classifier :  00:27:50
Time spent calcuating the Random Forest predictions :  00:00:01
number of estimators :  20
Random Forest Accuracy : 0.738
Random Forest Precision : 0.765
Random Forest F1 Score : 0.747
Time spent training the Random Forest classifier :  00:37:14
Time spent calcuating the Random Forest predictions :  00:00:01


In [12]:
gb_number_of_estimators = [5,10,15,20]

gb_depth5_results = []
for estimators in gb_number_of_estimators:
    #initialize the Gradient Boosted Classifier with the specified number of estimators and with max depth of each equal to 5
    gb_clf = GradientBoostingClassifier(n_estimators=estimators, max_depth=5)

    #to calculate time spent to train the model
    start_time = time.time()
    #train the classifier
    gb_clf = gb_clf.fit(feat_train, tar_train)

    inter_time = time.time()
    train_time = inter_time-start_time
    #compute the predictions
    pred = gb_clf.predict(feat_test)

    pred_time = time.time()-inter_time
    #save them to a list for better referencing
    gb_results = get_results(tar_test, pred, train_time, pred_time)
    gb_results.append(estimators)
    gb_depth5_results.append(gb_results)

In [13]:
for gb_result in gb_depth5_results:
    print('==============================================================')
    print('number of estimators : ', gb_result[5])
    print('max depth fixed to 5')
    #print metrics
    print("Gradient Boosted Accuracy : {:5.3f}".format(gb_result[0]))
    print("Gradient Boosted Precision : {:5.3f}".format(gb_result[1]))
    print("Gradient Boosted F1 Score : {:5.3f}".format(gb_result[2]))

    #print the time needed
    print("Time spent training the Gradient Boosted classifier : ", time.strftime("%H:%M:%S", gb_result[3]))
    print("Time spent calcuating the Gradient Boosted predictions : ", time.strftime("%H:%M:%S", gb_result[4])) 

number of estimators :  5
max depth fixed to 5
Gradient Boosted Accuracy : 0.674
Gradient Boosted Precision : 0.667
Gradient Boosted F1 Score : 0.714
Time spent training the Gradient Boosted classifier :  00:11:33
Time spent calcuating the Gradient Boosted predictions :  00:00:03
number of estimators :  10
max depth fixed to 5
Gradient Boosted Accuracy : 0.687
Gradient Boosted Precision : 0.690
Gradient Boosted F1 Score : 0.716
Time spent training the Gradient Boosted classifier :  00:21:44
Time spent calcuating the Gradient Boosted predictions :  00:00:01
number of estimators :  15
max depth fixed to 5
Gradient Boosted Accuracy : 0.698
Gradient Boosted Precision : 0.707
Gradient Boosted F1 Score : 0.721
Time spent training the Gradient Boosted classifier :  00:31:52
Time spent calcuating the Gradient Boosted predictions :  00:00:02
number of estimators :  20
max depth fixed to 5
Gradient Boosted Accuracy : 0.705
Gradient Boosted Precision : 0.715
Gradient Boosted F1 Score : 0.726
Time

In [14]:
gb_max_depth = [2,4,6,8]

gb_estim10_results = []
for depth in gb_max_depth:
    #initialize the Gradient Boosted Classifier with various max depth and number of estimators fixed to 8
    gb_clf = GradientBoostingClassifier(n_estimators=8, max_depth=depth)

    #to calculate time spent to train the model
    start_time = time.time()
    #train the classifier
    gb_clf = gb_clf.fit(feat_train, tar_train)

    inter_time = time.time()
    train_time = inter_time-start_time
    #compute the predictions
    pred = gb_clf.predict(feat_test)

    pred_time = time.time()-inter_time
    #save them to a list for better referencing
    gb_results = get_results(tar_test, pred, train_time, pred_time)
    gb_results.append(depth)
    gb_estim10_results.append(gb_results)

In [15]:
for gb_result in gb_estim10_results:
    print('==============================================================')
    print('number of estimators fixed to 10')
    print('max depth : ', gb_result[5])
    #print metrics
    print("Gradient Boosted Accuracy : {:5.3f}".format(gb_result[0]))
    print("Gradient Boosted Precision : {:5.3f}".format(gb_result[1]))
    print("Gradient Boosted F1 Score : {:5.3f}".format(gb_result[2]))

    #print the time needed
    print("Time spent training the Gradient Boosted classifier : ", time.strftime("%H:%M:%S", gb_result[3]))
    print("Time spent calcuating the Gradient Boosted predictions : ", time.strftime("%H:%M:%S", gb_result[4])) 

number of estimators fixed to 10
max depth :  2
Gradient Boosted Accuracy : 0.639
Gradient Boosted Precision : 0.640
Gradient Boosted F1 Score : 0.682
Time spent training the Gradient Boosted classifier :  00:04:21
Time spent calcuating the Gradient Boosted predictions :  00:00:00
number of estimators fixed to 10
max depth :  4
Gradient Boosted Accuracy : 0.673
Gradient Boosted Precision : 0.676
Gradient Boosted F1 Score : 0.704
Time spent training the Gradient Boosted classifier :  00:11:06
Time spent calcuating the Gradient Boosted predictions :  00:00:01
number of estimators fixed to 10
max depth :  6
Gradient Boosted Accuracy : 0.697
Gradient Boosted Precision : 0.701
Gradient Boosted F1 Score : 0.723
Time spent training the Gradient Boosted classifier :  00:29:06
Time spent calcuating the Gradient Boosted predictions :  00:00:01
number of estimators fixed to 10
max depth :  8
Gradient Boosted Accuracy : 0.710
Gradient Boosted Precision : 0.713
Gradient Boosted F1 Score : 0.736
Tim