<div align="center"><h1> Projet Data Science </h1></div>
<div align="center"><h2> Classification d'assertions selon leurs valeurs de véracité (automatic fact-checking) </h2></div>
<h2>Membres du groupe</h2>
<ul>
    <li>Meriem AMERAOUI</li>
    <li>Dounia BELABIOD</li>
    <li>Jihene BOUHLEL</li>
    <li>Bahaa Eddine NIL</li>
</ul>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Executing the basics
    </h1>
</div>

In [None]:
import pickle
import time
import warnings

import matplotlib.pyplot as plt
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Classification summary
def cpt_mal_classes(y_test_func, result_func):
    """Print the number of samples and how many of them were misclassified.

    Parameters
    ----------
    y_test_func : sequence
        Expected labels.
    result_func : sequence
        Predicted labels, paired with ``y_test_func`` by position.

    Raises
    ------
    ValueError
        If the two sequences do not contain the same number of elements.
    """
    # Materialise both inputs so they are compared strictly by position:
    # `obj[i]` is a label lookup on containers such as a pandas Series whose
    # index is not 0..n-1, which made the index-based loop fail on them.
    expected = list(y_test_func)
    predicted = list(result_func)
    if len(expected) != len(predicted):
        raise ValueError(
            f'expected {len(expected)} predictions, got {len(predicted)}'
        )
    nb_func = sum(1 for truth, guess in zip(expected, predicted) if truth != guess)
    print (f'Taille des données {len(expected)} mal classés {nb_func}\n')

warnings.filterwarnings("ignore", category = FutureWarning)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Classification
    </h1>
</div>

## Reading the transformed data for the classification

In [None]:
# Load the pre-transformed data set (semicolon-separated CSV) and show it.
df = pd.read_csv(filepath_or_buffer = 'attemps/tf2.csv', sep = ';')
display(df)

## Define the learning variables and the variable to predict

In [None]:
# Every column except the last one is a predictor; the last column is the
# variable to predict.
array = df.values
X, y = array[:, :-1], array[:, -1]

## Feature selection

In [None]:
# Univariate feature selection scored with f_classif.
# k is half of the number of columns of df (the target column is counted too).
# NOTE(review): the selector is fitted on the whole data set, before the
# train/test split -- confirm that this is acceptable for the evaluation.
selectKBest = SelectKBest(k = df.shape[1] // 2, score_func = f_classif)
selectKBest.fit(X, y)
selection = selectKBest  # fit() returns the fitted selector itself
print(selection.scores_)

# Keep only the selected columns and display them.
X_best = selection.transform(X)
display(pd.DataFrame(data = X_best))

## Split the data set into a training set and a test set

In [None]:
# Hold out 30% of the data set for testing and train on the remaining 70%.
# The two sizes used to be swapped: the value 0.3 described as the test share
# was stored in myTrainSize, so the models were trained on only 30% of the data.
myTestSize = 0.3               # 30% of the data set for the test set
myTrainSize = 1 - myTestSize   # 70% of the data set for the training set
seed = 30

# Original X & y
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = myTrainSize, test_size = myTestSize, random_state = seed)

# X & y after the feature selection (same seed and sizes, hence the same row split)
X_best_train, X_best_test, y_best_train, y_best_test = train_test_split(X_best, y, train_size = myTrainSize, test_size = myTestSize, random_state = seed)

<div align="center">
    <h1>
        Testing the first classifier
    </h1>
</div>

## GaussianNB classifier

In [None]:
# Train a Gaussian naive Bayes classifier on the training set
# (fit() returns the fitted estimator, so the calls can be chained).
clfGaussianNB = GaussianNB().fit(X_train, y_train)

# Predict the held-out samples and report the share of correct predictions.
resultGaussianNB = clfGaussianNB.predict(X_test)
print(f'accuracy : {accuracy_score(resultGaussianNB, y_test):.2f}')

## Display the confusion matrix and the classification report

In [None]:
# Confusion matrix of the GaussianNB predictions against the expected labels.
print(f'Matrice de confusion :\n{confusion_matrix(y_test, resultGaussianNB)}')
# Per-class precision / recall / F1 summary for the same predictions.
print(f'Classification report :\n{classification_report(y_test, resultGaussianNB)}')

## Cross-validate with 10 splits (KFold)

In [None]:
# 10-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
seed = 7
myKFold = KFold(
    n_splits = 10,
    random_state = seed,
    shuffle = True,
)

## Apply the GaussianNB classifier and report the accuracy of each of the 10 evaluations

In [None]:
# Cross-validate a fresh GaussianNB on the whole data set using the folds
# defined by myKFold; one accuracy value is produced per fold.
myScoring = 'accuracy'
clfGaussianNB = GaussianNB()

score = cross_val_score(
    estimator = clfGaussianNB,
    X = X,
    y = y,
    scoring = myScoring,
    cv = myKFold,
)

print(f'Les différentes accuracy pour les 10 évaluations sont :\n{score}')
print(f'Accuracy moyenne : {score.mean():.2f} | Standard deviation : {score.std():.2f}')

<div align="center">
    <h1>
        Testing several classifiers
    </h1>
</div>

In [None]:
# (short name, estimator with default settings) pairs compared below.
# "GS done" marks the classifiers that have a grid-search cell in this notebook.
models = [
    ('KNN', KNeighborsClassifier()),      # GS done
    ('CART', DecisionTreeClassifier()),   # GS done
    ('NB', GaussianNB()),
    ('SVC', SVC()),                       # GS done
    ('RFO', RandomForestClassifier()),    # GS done
    ('LR', LogisticRegression()),
]

## Without shuffle

In [None]:
seed = 7
myScoring = 'accuracy'
scores = []
names = []

# Folds are consecutive row blocks (no shuffling). random_state is deliberately
# not passed: it has no effect when shuffle is False, and scikit-learn >= 0.24
# raises a ValueError when it is set in that case. The splitter does not depend
# on the model, so it is built once outside the loop.
myKFold = KFold(n_splits = 10)

for name, model in models:
    # Time the 10-fold cross-validation of each model.
    startTime = time.time()
    score = cross_val_score(model, X, y, cv = myKFold, scoring = myScoring)
    endTime = time.time()
    scores.append(score)
    names.append(name)
    print(f'{name}\t({score.mean():.2f} | {score.std():.2f} | Time : {endTime - startTime:.2f})\n')

## Displaying results of the different classifiers

In [None]:
# One box per classifier, built from its per-fold accuracies.
fig, ax = plt.subplots()
fig.suptitle('Comparaison des algorithmes')
ax.boxplot(scores)
ax.set_xticklabels(names)
plt.show()

## With shuffle

In [None]:
scores, names = [], []

# Same comparison as before, but rows are shuffled (with a fixed seed) before
# being cut into 10 folds. An integer seed gives the same folds for every model.
myKFold = KFold(n_splits = 10, shuffle = True, random_state = seed)

for name, model in models:
    startTime = time.time()
    score = cross_val_score(estimator = model, X = X, y = y, scoring = myScoring, cv = myKFold)
    endTime = time.time()
    names.append(name)
    scores.append(score)
    print(f'{name}\t({score.mean():.2f} | {score.std():.2f} | Time : {endTime - startTime:.2f})\n')

## Displaying results of the different classifiers

In [None]:
# One box per classifier, built from its per-fold accuracies.
fig, ax = plt.subplots()
fig.suptitle('Comparaison des algorithmes')
ax.boxplot(scores)
ax.set_xticklabels(names)
plt.show()

## Apply GridSearchCV to RandomForestClassifier

In [None]:
# Exhaustive grid search over RandomForestClassifier hyper-parameters.
# 'auto' is not listed for max_features: for a classifier it was an alias of
# 'sqrt' (a duplicate candidate) and scikit-learn >= 1.3 rejects it.
gridParam = {'n_estimators': [4, 6, 9],
             'max_features': ['log2', 'sqrt'],
             'criterion': ['entropy', 'gini'],
             'max_depth': [2, 3, 5, 10],
             'min_samples_split': [2, 3, 5],
             'min_samples_leaf': [1, 5, 8]
            }

myScoring = 'accuracy'

# `iid` is not passed: the parameter was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
clfGridSearchCV = GridSearchCV(estimator = RandomForestClassifier(), param_grid = gridParam, scoring = myScoring, cv = 5, n_jobs = -1, return_train_score = True)

# Time the whole search (5-fold cross-validation of every combination).
startTime = time.time()
clfGridSearchCV.fit(X_train, y_train)
endTime = time.time()

print(f'temps : {endTime - startTime:.2f}')
print(f'meilleur score : {clfGridSearchCV.best_score_:.2f}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

# tf1
# {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 6}
# {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 6}
# {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 9}
# {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 9}

#tf2
# {'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 9}

## Apply GridSearchCV to DecisionTreeClassifier

In [None]:
# Exhaustive grid search over DecisionTreeClassifier hyper-parameters:
# depth 1..10, both split criteria, minimum leaf size 1..10.
gridParam = {'max_depth' : list(range(1, 11)),
             'criterion' : ['gini', 'entropy'],
             'min_samples_leaf' : list(range(1, 11))
            }

myScoring = 'accuracy'

# `iid` is not passed: the parameter was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
clfGridSearchCV = GridSearchCV(estimator = DecisionTreeClassifier(), param_grid = gridParam, scoring = myScoring, cv = 10, n_jobs = -1, return_train_score = True)

# Time the whole search (10-fold cross-validation of every combination).
startTime = time.time()
clfGridSearchCV.fit(X_train, y_train)
endTime = time.time()

print(f'temps : {endTime - startTime:.2f}')
print(f'meilleur score : {clfGridSearchCV.best_score_:.2f}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

#tf1
# {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 1}
# {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 6}
# {'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 5}
# {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 1}

#tf2
# {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 4}

## Apply GridSearchCV to SVC

In [None]:
# Exhaustive grid search over SVC hyper-parameters (regularisation strength,
# kernel coefficient and kernel type).
gridParam = {'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
             'gamma' : ['scale', 0.0001, 0.001, 0.01, 0.1, 1],
             'kernel' : ['linear', 'poly', 'rbf']
            }

myScoring = 'accuracy'

# `iid` is not passed: the parameter was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
clfGridSearchCV = GridSearchCV(estimator = SVC(), param_grid = gridParam, scoring = myScoring, cv = 5, n_jobs = 1, return_train_score = True)

# Time the whole search (5-fold cross-validation of every combination).
startTime = time.time()
clfGridSearchCV.fit(X_train, y_train)
endTime = time.time()

print(f'temps : {endTime - startTime:.2f}')
print(f'meilleur score : {clfGridSearchCV.best_score_:.2f}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

#tf2
# {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

## Apply GridSearchCV to KNeighborsClassifier

In [None]:
# Exhaustive grid search over KNeighborsClassifier hyper-parameters:
# 1 to 14 neighbours and three distance metrics.
gridParam = {'n_neighbors': list(range(1, 15)),
             'metric': ['minkowski', 'euclidean', 'manhattan']
            }

myScoring = 'accuracy'

# `iid` is not passed: the parameter was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
clfGridSearchCV = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = gridParam, scoring = myScoring, cv = 5, n_jobs = -1, return_train_score = True)

# Time the whole search (5-fold cross-validation of every combination).
startTime = time.time()
clfGridSearchCV.fit(X_train, y_train)
endTime = time.time()

print(f'temps : {endTime - startTime:.2f}')
print(f'meilleur score : {clfGridSearchCV.best_score_:.2f}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

## Run a grid search for each classifier using the previous parameters

In [None]:
# Estimators to tune, keyed by class name; `params` uses the same keys.
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier()
}

# One parameter grid per classifier. Each grid is a single dict so that
# GridSearchCV evaluates the values in combination. A list of one-key dicts is
# treated as independent grids: each parameter would be varied alone while all
# the others stay at their defaults.
# NOTE: the combined SVC grid has 7 * 6 * 3 = 126 candidates, so the search
# takes noticeably longer than with independent one-key grids.
params = {
    'RandomForestClassifier' : {
        'n_estimators' : [9, 6],
        # 'auto' is omitted: alias of 'sqrt' for classifiers, rejected by scikit-learn >= 1.3.
        'max_features' : ['sqrt', 'log2'],
        'criterion' : ['entropy', 'gini'],
        'max_depth' : [10],
        'min_samples_split' : [2, 5],
        'min_samples_leaf' : [1, 5]
    },
    'DecisionTreeClassifier' : {
        'max_depth' : [9, 8],
        'criterion' : ['gini', 'entropy'],
        'min_samples_leaf' : [1, 2, 3]
    },
    'SVC' : {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'gamma': ['scale', 0.0001, 0.001, 0.01, 0.1, 1],
        'kernel': ['linear', 'poly', 'rbf']
    },
    'KNeighborsClassifier' : {
        'metric': ['minkowski', 'manhattan'],
        'n_neighbors': [1, 2]
    }
}

In [None]:
class Result:
    """Record of one tuning run.

    Attributes
    ----------
    name : label of the classifier.
    score : score obtained by the run.
    parameters : configuration attached to the run (this notebook passes the
        best estimator found by the grid search).
    duration : elapsed time of the run.
    """

    def __init__(self, name, score, parameters, duration):
        self.name, self.score = name, score
        self.parameters, self.duration = parameters, duration

    def __repr__(self):
        # Rendered exactly like the tuple of the four attributes.
        as_tuple = (self.name, self.score, self.parameters, self.duration)
        return repr(as_tuple)

results = []
myScoring = 'accuracy'

# Tune every classifier on the training set with its own parameter grid and
# record the best cross-validated score, the best estimator and the fit time.
for key, value in classifiers.items():
    # `iid` is not passed: the parameter was deprecated in scikit-learn 0.22
    # and removed in 0.24, where passing it raises a TypeError.
    clfGridSearchCV = GridSearchCV(estimator = value, param_grid = params[key], scoring = myScoring, cv = 10, n_jobs = 1)
    startTime = time.time()
    clfGridSearchCV.fit(X_train, y_train)
    endTime = time.time()
    # Result.parameters holds the fitted best estimator (it is pickled and reused later).
    result = Result(key, clfGridSearchCV.best_score_, clfGridSearchCV.best_estimator_, endTime - startTime)
    results.append(result)

# Best score first.
results = sorted(results, key = lambda result: result.score, reverse = True)

print()
print(f'Le meilleur resultat est celui du classifieur {results[0].name} :\n\tScore : {results[0].score:.2f}\n\tDuration : {results[0].duration:.2f}\n\tParameters :\n\t\t{results[0].parameters}')

print(f'\nTous les résultats :\n')
for result in results:
    print(f'\t{result.name} classifier :\n\tScore : {result.score:.2f}\n\tDuration : {result.duration:.2f}\n\tParameters :\n\t\t{result.parameters}\n')

## Save the best learned model

In [None]:
# Persist the best fitted estimator (stored in Result.parameters).
# The context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open('models/best.sav', 'wb') as model_file:
    pickle.dump(results[0].parameters, model_file)

## Reload the best model to test it with y_test

In [None]:
# Reload the model saved by this notebook; the context manager closes the file.
# NOTE: unpickling executes arbitrary code -- only load files you created yourself.
with open('models/best.sav', 'rb') as model_file:
    clf_loaded = pickle.load(model_file)

print(f'Modèle chargé :\n{clf_loaded}\n')

# Evaluate the reloaded model on the held-out test set.
result = clf_loaded.predict(X_test)

cpt_mal_classes(y_test, result)

# accuracy_score takes (y_true, y_pred); the value is the same either way, but
# the order now matches the confusion matrix and report calls below.
print(f'Accuracy : {accuracy_score(y_test, result):.2f}\n')
print(f'Matrice de confusion :\n{confusion_matrix(y_test, result)}\n')
print(f'Classification report :\n{classification_report(y_test, result)}')

# Pipeline

In [None]:
# Alternative pipeline kept for reference (it would additionally require importing PCA):
#pipeline = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components = 2)), ('clf', DecisionTreeClassifier(random_state = 42))])

# Standardise the features, then classify with an SVC.
# The former LabelEncoder step was dropped: LabelEncoder encodes a target
# vector and its fit_transform() accepts a single argument, so it cannot act
# as a feature transformer inside a Pipeline (fit would raise a TypeError).
pipeline = Pipeline([('scl', StandardScaler()), ('clf', SVC())])

# `time` is imported as a module, so the clock is read with time.time().
t0 = time.time()
print ("Lancement du fit \n")
pipeline.fit(X_train, y_train)
print("Fit réalisé en %0.3fs" % (time.time() - t0))
t0 = time.time()
print ("Lancement de la prédiction \n")
result = pipeline.predict(X_test)
print("Prédiction réalisée en %0.3fs" % (time.time() - t0))
print('\n accuracy:',accuracy_score(y_test, result),'\n')
conf = confusion_matrix(y_test, result)
print ('\n matrice de confusion \n',conf)
print ('\n',classification_report(y_test, result))

# Save the fitted pipeline, then reload it to check the round trip.
# Context managers make sure both file handles are closed.
with open('pipelines/thebestone.pkl', 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

# NOTE: unpickling executes arbitrary code -- only load files you created yourself.
with open('pipelines/thebestone.pkl', 'rb') as pipeline_file:
    clf_loaded = pickle.load(pipeline_file)
print(f'Pipeline chargé :\n{clf_loaded}\n')