In [1]:
## GTI770 - TP2

In [2]:
%matplotlib inline   
import matplotlib.pyplot as plt
import numpy as np
import os
import cv2
import graphviz
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [5]:
# Load a class-balanced sample of galaxy feature vectors from the CSV file.
# Each row is expected to hold 76 comma-separated values; the value at
# index 75 is the class label (0.0 is counted as smooth, 1.0 as spiral).
NbGalaxy = 100
Galaxies = np.zeros((NbGalaxy, 76), dtype=float)

count = 0
count_smooth = 0
count_spiral = 0

# `with` guarantees the file is closed even if a row fails to parse.
with open('galaxy_feature_vectors.csv', 'r') as fid:
    for line in fid:
        # Skip blank lines instead of failing on the label lookup below.
        if not line.strip():
            continue

        element = line.rstrip('\n').split(',')

        label = float(element[75])

        # Accept a row only while its class still has room (half the sample
        # per class); rows of a full class or with another label are skipped.
        if label == 0.0 and count_smooth < NbGalaxy/2:
            count_smooth += 1
        elif label == 1.0 and count_spiral < NbGalaxy/2:
            count_spiral += 1
        else:
            continue

        # numpy converts the list of strings to floats on assignment.
        Galaxies[count] = element

        count += 1
        if count >= NbGalaxy:
            break

# If the file held fewer usable rows than requested, drop the untouched
# zero-filled rows so they are not mistaken for real label-0 samples.
Galaxies = Galaxies[:count]

# Quick visual check of the loaded sample.
print(Galaxies)

[[ 3.62952000e+05  4.39309241e+01  5.10145966e+01 ... -2.64805345e-16
   5.57636020e-24  1.00000000e+00]
 [ 8.30956000e+05  3.85109441e+01  5.00756838e+01 ...  1.37293801e-17
   4.81001774e-27  0.00000000e+00]
 [ 7.26733000e+05  2.81946369e+01  4.36395794e+01 ...  4.75665983e-18
   2.00633541e-26  0.00000000e+00]
 ...
 [ 9.38312000e+05  4.80393193e+01  5.06821972e+01 ...  1.20763803e-15
   5.93901359e-24  1.00000000e+00]
 [ 6.54409000e+05  3.76624602e+01  4.17980471e+01 ...  1.73016813e-15
  -4.60150791e-23  1.00000000e+00]
 [ 3.17194000e+05  2.75697417e+01  3.87609676e+01 ...  7.75480316e-15
  -3.21078697e-23  1.00000000e+00]]


In [4]:
# Load a class-balanced sample of mail feature vectors from the CSV file.
# Each row is expected to hold 58 comma-separated values; the value at
# index 57 is the class label ('0' is counted as mail, '1' as spam).
NbMails = 100
Filter = np.zeros((NbMails, 58), dtype=float)

count = 0
count_spam = 0
count_mail = 0

# `with` guarantees the file is closed even if a row fails to parse.
with open('spam.csv', 'r') as fid:
    for line in fid:
        # Skip blank lines instead of failing on the label lookup below.
        if not line.strip():
            continue

        element = line.rstrip('\n').split(',')

        # Accept a row only while its class still has room (half the sample
        # per class); rows of a full class or with another label are skipped.
        if element[57] == '0' and count_mail < NbMails/2:
            count_mail += 1
        elif element[57] == '1' and count_spam < NbMails/2:
            count_spam += 1
        else:
            continue

        # numpy converts the list of strings to floats on assignment.
        Filter[count] = element

        count += 1
        if count >= NbMails:
            break

# If the file held fewer usable rows than requested, drop the untouched
# zero-filled rows so they are not mistaken for real label-0 samples.
Filter = Filter[:count]

# Quick visual check of the loaded sample.
print(Filter)

[[0.000e+00 0.000e+00 0.000e+00 ... 1.200e+01 3.720e+02 0.000e+00]
 [0.000e+00 0.000e+00 2.940e+00 ... 4.000e+00 1.900e+01 0.000e+00]
 [0.000e+00 0.000e+00 1.490e+00 ... 5.000e+00 1.110e+02 0.000e+00]
 ...
 [2.200e-01 2.200e-01 2.200e-01 ... 1.038e+03 1.734e+03 1.000e+00]
 [2.700e-01 2.700e-01 2.700e-01 ... 6.400e+01 5.780e+02 1.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 5.700e+01 8.500e+01 1.000e+00]]


In [6]:
def SplitVectorDataTrainValTest(primitives_vector, train_portion):
    """Split rows into consecutive train / validation / test partitions.

    Rows are taken in their existing order (no shuffling). The first
    ``int(size * train_portion)`` rows form the training set, the rows right
    after them form the validation set, and the last rows form the test set.
    Validation and test each receive half of the portion not used for
    training. Because every count is truncated with ``int()``, a few rows
    between the validation and test partitions may end up unused.

    Args:
        primitives_vector: Sliceable sequence of samples (e.g. a 2-D numpy
            array with one sample per row).
        train_portion: Fraction of the rows used for training, in [0, 1].

    Returns:
        A tuple ``(array_train, array_val, array_test)`` of slices of
        ``primitives_vector``.

    Raises:
        ValueError: If ``train_portion`` is not within [0, 1].
    """
    if not 0 <= train_portion <= 1:
        raise ValueError("train_portion must be between 0 and 1, got %r" % (train_portion,))

    size = len(primitives_vector)

    # Validation and test share the remaining rows equally.
    holdout_portion = (1 - train_portion) / 2

    nbTrain = int(size * train_portion)
    nbVal = int(size * holdout_portion)
    nbTest = int(size * holdout_portion)

    array_train = primitives_vector[:nbTrain]
    array_val = primitives_vector[nbTrain : nbTrain + nbVal]
    # Use an explicit start index: `[-nbTest:]` would return the WHOLE input
    # when nbTest == 0, because -0 == 0.
    array_test = primitives_vector[size - nbTest:]
    return array_train, array_val, array_test

def GenerateModelDataFromVector(array_train, array_val, array_test, num_features, chosen_model):
    """Fit a model on the training rows and predict on all three partitions.

    Every input array is treated as 2-D with the label stored in column
    ``num_features - 1`` and the features in the columns before it.

    Args:
        array_train: Rows used to fit the model.
        array_val: Rows used for validation predictions.
        array_test: Rows used for test predictions.
        num_features: Total number of columns per row, label included.
        chosen_model: Zero-argument callable returning an object exposing
            ``fit(X, y)`` and ``predict(X)``.

    Returns:
        A tuple ``(model, prediction_train, prediction_val, prediction_test,
        Y_train, Y_val, Y_test)`` where ``model`` is the value returned by
        ``fit``.
    """
    label_col = num_features - 1

    def _features_and_labels(rows):
        # Columns before the label column are features; that column is the label.
        return rows[:, 0:label_col], rows[:, label_col]

    X_train, Y_train = _features_and_labels(array_train)
    X_val, Y_val = _features_and_labels(array_val)
    X_test, Y_test = _features_and_labels(array_test)

    fitted = chosen_model().fit(X_train, Y_train)

    predicted_train = fitted.predict(X_train)
    predicted_val = fitted.predict(X_val)
    predicted_test = fitted.predict(X_test)

    return (
        fitted,
        predicted_train,
        predicted_val,
        predicted_test,
        Y_train,
        Y_val,
        Y_test,
    )

In [7]:
def DecisionTree(criterion='entropy', max_depth=9, min_samples_leaf=6, random_state=None):
    """Build an unfitted scikit-learn decision-tree classifier.

    Calling it with no arguments yields the same configuration as before, so
    it can still be passed as a zero-argument model factory.

    Args:
        criterion: Split-quality measure passed to the classifier.
        max_depth: Maximum depth of the tree.
        min_samples_leaf: Minimum number of samples required at a leaf.
        random_state: Optional seed passed to the classifier to make the
            fitted tree reproducible; ``None`` keeps the library default.

    Returns:
        A new ``tree.DecisionTreeClassifier`` instance.
    """
    return tree.DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
    )

In [8]:
# Export Tree

# Split the mail data (60% of the rows for training, the rest shared between
# validation and test), then fit the decision tree and collect predictions.
array_train, array_val, array_test = SplitVectorDataTrainValTest(Filter, 0.6)
(model,
 prediction_train, prediction_val, prediction_test,
 Y_train_tree, Y_val_tree, Y_test_tree) = GenerateModelDataFromVector(
    array_train, array_val, array_test,
    len(Filter[0]),  # column count per row, label column included
    DecisionTree,
)

def ShowAccScore(Y_tree, prediction, name):
    """Print the accuracy of ``prediction`` measured against ``Y_tree``.

    Args:
        Y_tree: Ground-truth labels.
        prediction: Predicted labels for the same samples.
        name: Name of the data partition, inserted into the printed message.
    """
    score = accuracy_score(Y_tree, prediction)
    message = "Accuracy score for %s values = %f" % (name, score)
    print(message)

def ExportTree(fitted_model=None, feature_names=None, class_names=None, output_name="Galaxy_data"):
    """Render a fitted decision tree to a PNG file via Graphviz.

    Called with no arguments it exports the notebook-level ``model`` under the
    name "Galaxy_data", as before.

    Args:
        fitted_model: Fitted decision tree to export. Defaults to the
            notebook-level ``model``.
        feature_names: Names shown for the features. When omitted, the
            built-in three galaxy feature names are used only if the model
            was trained on exactly three features; otherwise no names are
            passed and the exporter uses its own generic labels.
        class_names: Names shown for the classes. Defaults to
            ``['smooth', 'spiral']``.
        output_name: Base name of the rendered file.

    Returns:
        The ``graphviz.Source`` object that was rendered.
    """
    if fitted_model is None:
        # Fall back to the model trained earlier in the notebook.
        fitted_model = model

    if class_names is None:
        class_names = ['smooth', 'spiral']

    if feature_names is None:
        default_names = ['ratio longueur/largeur', 'pixels blanches grayscale', 'pixels bleu']
        # Newer scikit-learn exposes n_features_in_; older releases n_features_.
        n_features = getattr(fitted_model, 'n_features_in_',
                             getattr(fitted_model, 'n_features_', None))
        # export_graphviz raises when the number of names differs from the
        # model's feature count, so only apply the defaults when they fit.
        if n_features is None or n_features == len(default_names):
            feature_names = default_names

    dot_data = tree.export_graphviz(fitted_model, out_file=None,
                                    feature_names=feature_names,
                                    class_names=class_names,
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    graph.render(output_name)
    return graph

#ExportTree()

In [9]:
# Report the accuracy on each partition, in train / validation / test order.
for expected_labels, predicted_labels, partition_name in (
    (Y_train_tree, prediction_train, "train"),
    (Y_val_tree, prediction_val, "validation"),
    (Y_test_tree, prediction_test, "test"),
):
    ShowAccScore(expected_labels, predicted_labels, partition_name)
#ExportTree()

Accuracy score for train values = 0.883333
Accuracy score for validation values = 0.750000
Accuracy score for test values = 0.500000
