<div align="center"><h1> Projet Data Science </h1></div>
<div align="center"><h2> Classification d'assertions selon leurs valeurs de véracité (automatic fact-checking) </h2></div>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Imports
    </h1>
</div>

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import warnings
import pickle

from enum import Enum
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings("ignore", category = FutureWarning)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Loading the dataset
    </h1>
</div>

In [None]:
# Alternative column layout; it is not passed to the read below.
# NOTE(review): looks like the raw ClaimReview export schema — confirm whether it is still needed.
labels = ["ID", "ClaimReviewAuthor", "ClaimReviewAuthorName", "ClaimReviewAuthorURL",
          "ClaimReviewClaimReviewed", "ClaimReviewDatePublished", "ClaimReviewSource", "ClaimReviewURL",
          "CreativeWorkAuthorName", "CreativeWorkAuthorSameAs", "CreativeWorkDatePublished", "ExtraBody",
          "ExtraEntitiesAuthor", "ExtraEntitiesBody", "ExtraEntitiesClaimReviewClaimReviewed", "ExtraEntitiesKeywords",
          "ExtraReferedLinks", "ExtraTags", "ExtraTitle", "RatingAlternateName",
          "RatingBestRating", "RatingRatingValue", "RatingWorstRating"]

# Column names applied to the ClaimsKG CSV read below.
labelsClaimsKG = ["ID", "Text", "Date", "TruthRating", "RatingName", "Author", "Headline",
                  "NamedEntitiesClaim", "NamedEntitiesArticle", "Keywords", "Source", "SourceURL", "Link", "Language"]

# The first line of the file is skipped and replaced by labelsClaimsKG;
# only the first 1000 data rows are loaded.
df = pd.read_csv('datasets/ClaimsKG.csv', sep = ',', names = labelsClaimsKG, skiprows = 1, nrows = 1000)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Printing information
    </h1>
</div>

## Shape

In [None]:
# (rows, columns) of the loaded sample.
print(f'Shape :\n{df.shape}')

## Informations

In [None]:
# Per-column dtype and non-null count; info() prints directly, so it is not wrapped in print().
print(f'Informations :')
df.info()

## Description

In [None]:
# Summary statistics of the frame, rendered as a rich table.
print(f'Description :')
display(df.describe())

## Printing some lines

In [None]:
# First rows of the frame, rendered as a rich table.
print(f'Printing some lines :')
display(df.head())

## Affichage d'informations sur toutes les colonnes

In [None]:
# For every column, count how many cells are null (True) and non-null (False).
for column in df.columns:
      print(f'Nombre de valeurs nulles pour {column} :\n{df[column].isnull().value_counts()}\n')

## Affichage des colonnes vides

In [None]:
# Names of the columns that contain at least one null value.
array = [column for column in df.columns if df[column].isnull().any()]
print(f'Nombre de colonnes vides : {len(array)}\nLes colonnes vide sont :\n{array}')

## Description de toutes les colonnes

In [None]:
# Describe the columns one at a time so that every column gets its own summary.
for column in df.columns:
    display(df[column].describe())

## Get a series of unique values in each column of the dataframe

In [None]:
# Distinct values found in each column, with their count.
for column in df.columns:
    uniqueValues = df[column].unique()
    print(f'Number of unique elements in column {column} : {len(uniqueValues)}, values & type :\n{uniqueValues}\n')

## Affichage du nombre des différents TruthRating

In [None]:
# Number of rows (with a non-null ID) for each numeric TruthRating code.
print(f'La colonne TruthRating contient :')
for rating_value, rating_label in ((-1, 'Other'), (1, 'False'), (2, 'Mixture'), (3, 'True')):
    matching_ids = df.loc[df["TruthRating"] == rating_value, "ID"]
    print(f'{matching_ids.count()} {rating_label}')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Visualization
    </h1>
</div>

In [None]:
# Number of rows per value of the Source column.
chart = sns.countplot(x = 'Source', data = df)
# Tilt the tick labels so that long source names do not overlap.
plt.setp(chart.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.show()

In [None]:
# One count plot per RatingName value (one facet column each), with Source on the x axis.
chart = sns.catplot(x = 'Source', col = 'RatingName', kind = 'count', data = df)
# chart.axes is an array of Axes; the tick labels of every facet are tilted.
for ax in chart.axes.ravel():
    plt.setp(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.show()

In [None]:
# Grouped count plot: Source on the x axis, one coloured bar per RatingName.
# `x` is given by keyword: since seaborn 0.12 only `data` may be positional, so
# a positional 'Source' is bound to `data` and clashes with `data = df`.
chart = sns.catplot(x = 'Source', hue = 'RatingName', kind = 'count', data = df)
# Tilt the tick labels so that long source names do not overlap.
for ax in chart.axes.ravel():
    plt.setp(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.show()

In [None]:
# Map of missing values: the boolean df.isnull() frame is drawn cell by cell, without colour bar.
sns.heatmap(df.isnull(), cbar = False)
plt.show()

<div class="alert alert-block alert-info" align="center">
    <h1>
        Mapping ratings
    </h1>
</div>

In [None]:
class NormalizedRatings(Enum):
    """Common rating scale onto which every site-specific rating is projected."""
    FALSE = 1
    MIXTURE = 2
    TRUE = 3
    OTHER = -1

# Site name -> {rating label as published by the site -> NormalizedRatings}.
# Labels are written as they appear on each site (any case, with '-', '!' or
# ':'); normalize() standardizes both the label and the looked-up value.
_normalization_dictionary = {
    "politifact": {
        'incorrect': NormalizedRatings.FALSE,
        'pants-fire': NormalizedRatings.FALSE,
        'pants on fire': NormalizedRatings.FALSE,
        'pants on fire!': NormalizedRatings.FALSE,
        'false': NormalizedRatings.FALSE,
        'mostly correct': NormalizedRatings.MIXTURE,
        'mostly false': NormalizedRatings.MIXTURE,
        'barely true': NormalizedRatings.MIXTURE,
        'half true': NormalizedRatings.MIXTURE,
        'half-true': NormalizedRatings.MIXTURE,
        'mostly true': NormalizedRatings.MIXTURE,
        'true': NormalizedRatings.TRUE,
        'correct': NormalizedRatings.TRUE
    },
    "snopes": {
        'false': NormalizedRatings.FALSE,
        'legend': NormalizedRatings.FALSE,
        'mixture': NormalizedRatings.MIXTURE,
        'mixture:': NormalizedRatings.MIXTURE,
        'true': NormalizedRatings.TRUE,
        'mostly false': NormalizedRatings.MIXTURE,
        'mostly true': NormalizedRatings.MIXTURE,
        'partly true': NormalizedRatings.MIXTURE,
        'MIXTURE OF TRUE AND FALSE INFORMATION': NormalizedRatings.MIXTURE,
        'MIXTURE OF TRUE AND FALSE INFORMATION:': NormalizedRatings.MIXTURE,
        'MIXTURE OF ACCURATE AND  INACCURATE INFORMATION': NormalizedRatings.MIXTURE
    },
    "africacheck": {
        'incorrect': NormalizedRatings.FALSE,
        'mostly-correct': NormalizedRatings.MIXTURE,
        'correct': NormalizedRatings.TRUE
    },
    "factscan": {
        'false': NormalizedRatings.FALSE,
        'true': NormalizedRatings.TRUE,
        'Misleading': NormalizedRatings.OTHER
    },
    "truthorfiction": {
        'fiction': NormalizedRatings.FALSE,
        'truth': NormalizedRatings.TRUE,
        'truth & fiction': NormalizedRatings.MIXTURE,
        'mostly fiction': NormalizedRatings.MIXTURE,
        'truth & misleading': NormalizedRatings.MIXTURE,
        'mostly truth': NormalizedRatings.MIXTURE
    },
    "checkyourfact": {
        'False': NormalizedRatings.FALSE,
        'True': NormalizedRatings.TRUE,
        'Mostly True': NormalizedRatings.MIXTURE,
        'true/false': NormalizedRatings.MIXTURE,
        'truth & misleading': NormalizedRatings.MIXTURE,
        'mostly truth': NormalizedRatings.MIXTURE,
        'misleading': NormalizedRatings.MIXTURE
    },
    "factcheck_aap": {
        "True": NormalizedRatings.TRUE,
        "False": NormalizedRatings.FALSE,
        "Mostly True": NormalizedRatings.MIXTURE,
        "Mostly False": NormalizedRatings.MIXTURE,
        "Somewhat True": NormalizedRatings.MIXTURE,
        "Somewhat False": NormalizedRatings.MIXTURE
    },
    "factuel_afp_fr": {
        'Faux': NormalizedRatings.FALSE,
        'Totalement faux': NormalizedRatings.FALSE,
        'Démenti': NormalizedRatings.FALSE,
        "C'est une oeuvre de fiction": NormalizedRatings.FALSE,
        'Vrai': NormalizedRatings.TRUE,
        'Totalement Vrai': NormalizedRatings.TRUE,
        'Plutôt vrai': NormalizedRatings.MIXTURE,
        'Trompeur': NormalizedRatings.MIXTURE,
        'trompeur': NormalizedRatings.MIXTURE,
        'Plutôt faux': NormalizedRatings.MIXTURE,
        'Presque': NormalizedRatings.MIXTURE,
        'Mélangé': NormalizedRatings.MIXTURE,
        'Mélange': NormalizedRatings.MIXTURE,
        'Inexact': NormalizedRatings.MIXTURE,
        'Incertain': NormalizedRatings.MIXTURE,
        'Imprécis': NormalizedRatings.MIXTURE,
        'Exagéré': NormalizedRatings.MIXTURE,
        'Douteux': NormalizedRatings.MIXTURE,
    },
    "factcheck_afp": {
        'False': NormalizedRatings.FALSE,
        'Fake': NormalizedRatings.FALSE,
        'Mixed': NormalizedRatings.MIXTURE,
        'Hoax': NormalizedRatings.FALSE,
        'Falso': NormalizedRatings.FALSE,
        'APRIL FOOL': NormalizedRatings.FALSE
    },
    "fullfact": {
        'Correct': NormalizedRatings.TRUE,
        'Incorrect': NormalizedRatings.FALSE,
        'Not quite': NormalizedRatings.MIXTURE
    }
}

def _standardize_name(original_name: str):
    """Canonical form of a rating label: trimmed, lower-case, '!' and ':' removed, '-' turned into a space."""
    return original_name.strip().lower().replace("!", "").replace(":", "").replace("-", " ")

def normalize(source_name, original_name) -> NormalizedRatings:
    '''
    Generate a normalized rating from the original rating used on a given site.

    :param source_name: key of the site in _normalization_dictionary
    :param original_name: rating label as published by the site
    :return normalized_rating: the matching NormalizedRatings member, or
        NormalizedRatings.OTHER when the site is unknown, the label is not a
        string (e.g. a missing value) or the label is not listed for the site
    '''
    source = _normalization_dictionary.get(source_name)
    if source is None or not isinstance(original_name, str):
        return NormalizedRatings.OTHER

    wanted = _standardize_name(original_name)
    # The dictionary keys are standardized as well: many of them contain capitals,
    # '-', '!' or ':' and would otherwise never equal a standardized label.
    for label, rating in source.items():
        if _standardize_name(label) == wanted:
            return rating
    return NormalizedRatings.OTHER

<div class="alert alert-block alert-info" align="center">
    <h1>
        Pre-processing
    </h1>
</div>

<div align="center">
    <h1>
        General pre-processing
    </h1>
</div>

## Remove unnecessary columns

In [None]:
# Remove three columns from the working frame.
# NOTE: df is overwritten, so running this cell a second time raises KeyError
# (the columns are already gone).
df = df.drop(['TruthRating', 'SourceURL', 'Language'], axis = 1)
display(df.head())

## Deleting claims with OTHER RatingName

In [None]:
# Keep only the rows whose RatingName is not 'OTHER'.
df = df[df.RatingName != 'OTHER']

## Replacing "Unknown" & NaN by "Inconnue"

In [None]:
# Replace the 'Unknown' placeholder and every missing value by 'Inconnue'.
# The replacement is applied to the whole frame and re-assigned rather than
# done per column with `inplace = True`: an in-place call on `df[column]` acts
# on a temporary object (SettingWithCopyWarning, and no effect at all under
# pandas copy-on-write). fillna() also avoids `np.NaN`, removed in NumPy 2.0.
df = df.replace('Unknown', 'Inconnue').fillna('Inconnue')

display(df.head())

# Control plot: no null cell should be left after the replacement.
sns.heatmap(df.isnull(), cbar = False)
plt.show()

## TRUE VS FALSE dataframe

In [None]:
# Numeric target codes for the binary TRUE-vs-FALSE task.
replace_map = {'FALSE': 1, 'TRUE': 2}

# MIXTURE claims are left out, RatingName is mapped to the numeric "Predection"
# target and the textual RatingName column is then removed.
# The frame is built in one chain: assigning a column on a boolean-filtered
# frame triggers pandas' SettingWithCopyWarning.
dfTvsF = (
    df[df.RatingName != 'MIXTURE']
    .assign(Predection = lambda frame: frame['RatingName'].map(replace_map))
    .drop(['RatingName'], axis = 1)
)

display(dfTvsF.head())

## TRUE/FALSE VS MIXTURE dataframe

In [None]:
# Numeric target codes: TRUE and FALSE share class 1, MIXTURE is class 2.
replace_map = {'FALSE': 1, 'TRUE': 1, 'MIXTURE': 2}

# Independent copy of df in which RatingName is replaced by the numeric
# "Predection" target (appended as last column).
dfTFvsM = (
    df.copy()
    .assign(Predection = lambda frame: frame['RatingName'].map(replace_map))
    .drop(['RatingName'], axis = 1)
)

display(dfTFvsM.head())

<div align="center">
    <h1>
        First attempt on TRUE vs FALSE
    </h1>
</div>

## Copy of the data frame

In [None]:
# Work on a copy so that the encoding applied to dfTvsF1 leaves dfTvsF unchanged.
dfTvsF1 = dfTvsF.copy()
display(dfTvsF1.head())

## Transform data

In [None]:
classLabelEncoder = LabelEncoder()

# Each listed column is replaced by integer codes. The single encoder is
# re-fitted on every column, so after the loop it only remembers the last one.
for encoded_column in ("ID", "Text", "Date", "Author", "Headline", "NamedEntitiesClaim",
                       "NamedEntitiesArticle", "Keywords", "Source", "Link"):
    dfTvsF1[encoded_column] = classLabelEncoder.fit_transform(dfTvsF1[encoded_column])

display(dfTvsF1.head())

## Saving the transformed data

In [None]:
# Persist the encoded frame: ';' separated, without the index column.
dfTvsF1.to_csv('datasets/attemp1.csv', sep = ';', index = False)

<div align="center">
    <h1>
        Second attempt on TRUE vs FALSE
    </h1>
</div>

## Copy of the data frame

In [None]:
# Independent copy of the TRUE-vs-FALSE frame for the second attempt.
dfTvsF2 = dfTvsF.copy()
display(dfTvsF2.head())

## Transform data

In [None]:
# No transformation is applied in this cell: the frame is only displayed.
display(dfTvsF2.head())

## Saving the transformed data

In [None]:
# Persist the frame of the second attempt: ';' separated, without the index column.
dfTvsF2.to_csv('datasets/attemp2.csv', sep = ';', index = False)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Classification
    </h1>
</div>

<div align="center">
    <h1>
        Preparing attempt 1 data for classification
    </h1>
</div>

## Reading the transformed data for the classification

In [None]:
# Reload the encoded frame of attempt 1 (';' separated) and inspect its columns and dtypes.
dfClassification1 = pd.read_csv('datasets/attemp1.csv', sep = ';')
display(dfClassification1.head())
dfClassification1.info()

## Define the learning variables and the variable to predict

In [None]:
array1 = dfClassification1.values

# Learning variables: the encoded ID and Text columns.
X1 = dfClassification1[['ID', 'Text']].values

# Variable to predict: the "Predection" column (1 = FALSE, 2 = TRUE).
# It is selected by name: in attemp1.csv position 3 holds 'Author' (TruthRating
# and RatingName are dropped before saving), so a positional `[:,3]` target
# would make the classifiers predict the author instead of the rating.
y1 = dfClassification1['Predection'].values

## Cut the data set into a test set and a learning set

In [None]:
# 30% of the data set is held out for testing, the remaining 70% is used for
# training. The two ratios must be passed to the parameters of the same name:
# giving 0.3 as `train_size` trains on the small part and tests on the large one.
myTestSize = 0.3
myTrainSize = 1 - myTestSize
seed = 30

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, train_size = myTrainSize, test_size = myTestSize, random_state = seed)

<div align="center">
    <h1>
        Testing the first classifier on attempt 1
    </h1>
</div>

## GaussianNB classifier

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split and score it on the test split.
clfGaussianNB = GaussianNB()

clfGaussianNB.fit(X_train1, y_train1)

resultGaussianNB = clfGaussianNB.predict(X_test1)

# Predictions are passed first here; accuracy is the share of equal pairs, so the argument order does not change the value.
print(f'accuracy : {accuracy_score(resultGaussianNB, y_test1)}')

## Display the confusion matrix and the classification report

In [None]:
# Both calls receive the true labels first and the predictions second.
print (f'Matrice de confusion :\n{confusion_matrix(y_test1, resultGaussianNB)}')
print (f'Classification report :\n{classification_report(y_test1, resultGaussianNB)}')

## Cross validate with 10 splits (Kfold)

In [None]:
# 10 shuffled folds; the fixed seed makes the shuffling reproducible.
seed = 7
myKFold = KFold(n_splits = 10, shuffle = True, random_state = seed)

## Apply the GaussianNB classifier and give the different accuracy for the 10 evaluations

In [None]:
# Fresh, unfitted classifier: cross_val_score fits it once per fold.
clfGaussianNB = GaussianNB()

myScoring = 'accuracy'

# The cross-validation runs on the complete X1 / y1, not only on the training split.
score = cross_val_score(clfGaussianNB, X1, y1, cv = myKFold, scoring = myScoring)

print(f'Les différentes accuracy pour les 10 évaluations sont :\n{score}')
print(f'Accuracy moyenne : {score.mean()} | Standard deviation : {score.std()}')

<div align="center">
    <h1>
        Testing several classifiers
    </h1>
</div>

In [None]:
seed = 7
myScoring = 'accuracy'

# (short name, unfitted estimator) pairs compared by cross-validation.
models = [
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma = 'auto')),
    ('RFO', RandomForestClassifier()),
]

## Without shuffle

In [None]:
scores = []
names = []

# Consecutive (unshuffled) folds are fully deterministic, so no random_state is
# given: scikit-learn >= 0.24 raises ValueError when random_state is set while
# shuffle is False. The splitter is stateless and is shared by all models.
myKFold = KFold(n_splits = 10)

for name, model in models:
    # One accuracy per fold for the current model.
    score = cross_val_score(model, X1, y1, cv = myKFold, scoring = myScoring)
    scores.append(score)
    names.append(name)
    print(f'{name} : {score.mean()} | {score.std()}')

## Displaying results of the different classifiers

In [None]:
# One box per classifier, drawn from its per-fold accuracies.
fig, ax = plt.subplots()
fig.suptitle('Comparaison des algorithmes')
ax.boxplot(scores)
ax.set_xticklabels(names)
plt.show()

## With shuffle

In [None]:
scores, names = [], []

# With an integer random_state every split() call yields the same shuffled
# folds, so a single splitter serves all models.
myKFold = KFold(n_splits = 10, shuffle = True, random_state = seed)

for name, model in models:
    score = cross_val_score(model, X1, y1, cv = myKFold, scoring = myScoring)
    names.append(name)
    scores.append(score)
    print(f'{name} : {score.mean()} | {score.std()}')

## Displaying results of the different classifiers

In [None]:
# One box per classifier, drawn from its per-fold accuracies (shuffled folds).
fig, ax = plt.subplots()
fig.suptitle('Comparaison des algorithmes')
ax.boxplot(scores)
ax.set_xticklabels(names)
plt.show()

## Apply GridSearchCV to RandomForestClassifier

In [None]:
# Grid explored exhaustively: every combination of the listed values is tried.
# 'auto' is not listed for max_features: for a classifier it was an alias of
# 'sqrt', and scikit-learn 1.3 rejects it.
gridParam = {'n_estimators': [4, 6, 9],
             'max_features': ['log2', 'sqrt'],
             'criterion': ['entropy', 'gini'],
             'max_depth': [2, 3, 5, 10],
             'min_samples_split': [2, 3, 5],
             'min_samples_leaf': [1, 5, 8]
            }

myScoring = 'accuracy'

# `iid` is not passed: the argument was removed in scikit-learn 0.24, where it
# makes the constructor raise TypeError.
clfGridSearchCV = GridSearchCV(estimator = RandomForestClassifier(), param_grid = gridParam, scoring = myScoring, cv = 5, n_jobs = -1, return_train_score = True)

# The search only sees the training split; the test split stays untouched.
clfGridSearchCV.fit(X_train1, y_train1)

print(f'meilleur score : {clfGridSearchCV.best_score_}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

## Apply GridSearchCV to DecisionTreeClassifier

In [None]:
# Grid explored exhaustively: every combination of the listed values is tried.
gridParam = {'max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             'criterion' : ['gini', 'entropy'],
             'min_samples_leaf' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
            }

myScoring = 'accuracy'

# `iid` is not passed: the argument was removed in scikit-learn 0.24, where it
# makes the constructor raise TypeError.
clfGridSearchCV = GridSearchCV(estimator = DecisionTreeClassifier(), param_grid = gridParam, scoring = myScoring, cv = 10, n_jobs = -1, return_train_score = True)

# The search only sees the training split; the test split stays untouched.
clfGridSearchCV.fit(X_train1, y_train1)

print(f'meilleur score : {clfGridSearchCV.best_score_}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

## Apply GridSearchCV to SVC

In [None]:
# Grid explored exhaustively: every combination of the listed values is tried.
gridParam = {'C' : [0.001, 0.01, 0.1, 1, 10],
             'gamma' : [0.001, 0.01, 0.1, 1],
             'kernel' : ['linear', 'rbf']
            }

myScoring = 'accuracy'

# `iid` is not passed: the argument was removed in scikit-learn 0.24, where it
# makes the constructor raise TypeError.
clfGridSearchCV = GridSearchCV(estimator = SVC(), param_grid = gridParam, scoring = myScoring, cv = 5, n_jobs = 1, return_train_score = True)

# The search only sees the training split; the test split stays untouched.
clfGridSearchCV.fit(X_train1, y_train1)

print(f'meilleur score : {clfGridSearchCV.best_score_}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

## Do a gridsearch taking the previous parameters

In [None]:
# Estimators compared in the final search, keyed by display name.
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'SVM' : SVC()
}

# Candidate values per estimator (same keys as `classifiers`).
# Each entry is a LIST of one-parameter grids: GridSearchCV explores the grids
# of a list one after the other, so each parameter is varied on its own while
# the others keep their default value.
# 'auto' is not listed for max_features: for a classifier it was an alias of
# 'sqrt', and scikit-learn 1.3 rejects it.
params = {
    'RandomForestClassifier' : [
        {'n_estimators' : [4, 6, 9]},
        {'max_features' : ['log2', 'sqrt']},
        {'criterion' : ['entropy', 'gini']},
        {'max_depth' : [2, 3, 5, 10]},
        {'min_samples_split' : [2, 3, 5]},
        {'min_samples_leaf' : [1, 5, 8]}
    ],
    'DecisionTreeClassifier' : [
        {'max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        {'criterion' : ['gini', 'entropy']},
        {'min_samples_leaf' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
    ],
    'SVM' : [
        {'C': [1, 0.001]},
        {'gamma': [0.001]},
        {'kernel': ['linear']}
    ]
}

In [None]:
class Result:
    """Outcome of one grid search.

    Holds the classifier display name, the best score reached and the object
    stored as `parameters`.
    """

    def __init__(self, name, score, parameters):
        self.name, self.score, self.parameters = name, score, parameters

    def __repr__(self):
        # Same text as the repr of the (name, score, parameters) tuple.
        fields = (self.name, self.score, self.parameters)
        return repr(fields)

results = []
myScoring = 'accuracy'

# One grid search per classifier, each on the training split only.
for key, value in classifiers.items():
    # `iid` is not passed: the argument was removed in scikit-learn 0.24, where
    # it makes the constructor raise TypeError.
    clfGridSearchCV = GridSearchCV(estimator = value, param_grid = params[key], scoring = myScoring, cv = 10, n_jobs = 1)
    clfGridSearchCV.fit(X_train1, y_train1)
    # The third field receives the best *estimator* (the fitted model), not only its parameter dictionary.
    result = Result(key, clfGridSearchCV.best_score_, clfGridSearchCV.best_estimator_)
    results.append(result)

# Best cross-validated score first.
results = sorted(results, key = lambda result: result.score, reverse = True)

print(f'Le meilleur resultat :')
print(f'\tClassifier : {results[0].name} | score : {results[0].score} | parameters :\n\t\t{results[0].parameters}')

print(f'Tous les résultats :')
for result in results:
    print(f'\n\tClassifier : {result.name} | score : {result.score} | parameters :\n\t\t{result.parameters}')

## Save the best learned model

In [None]:
filename = 'modeles/attemp1.sav'

# results[0].parameters holds the best fitted estimator of the search.
# The `with` block guarantees the file is flushed and closed, even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(results[0].parameters, model_file)

## Reload the best model to test it with y_test

In [None]:
filename = 'modeles/attemp1.sav'

# NOTE: unpickling executes code stored in the file; only load model files
# produced by this notebook or another trusted source.
# The `with` block closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    clfLoaded = pickle.load(model_file)
print(f'Modèle chargé :\n{clfLoaded}\n')

# Final evaluation on the held-out test split, which the grid searches never saw.
result = clfLoaded.predict(X_test1)

print(f'Accuracy : {accuracy_score(result, y_test1)}\n')
print(f'Matrice de confusion :\n{confusion_matrix(y_test1, result)}\n')
print(f'Classification report :\n{classification_report(y_test1, result)}')