<div align="center"><h1> Projet Data Science </h1></div>
<div align="center"><h2> Classification d'assertions selon leur valeurs de véracité ( automatic fact-checking ) </h2></div>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Imports
    </h1>
</div>

In [687]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import warnings
import nltk
import pickle
import unicodedata
import inflect
import re
import time

from enum import Enum
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample

warnings.filterwarnings("ignore", category = FutureWarning)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Downloads
    </h1>
</div>

In [None]:
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
    
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Declarations
    </h1>
</div>

In [659]:
# Mapping ratings
class NormalizedRatings(Enum):
    FALSE = 1
    MIXTURE = 2
    TRUE = 3
    OTHER = -1

_normalization_dictionary = {  # type: Dict[str, Dict[str,NormalizedRatings]]
    "politifact": {  # type: Dict[str,NormalizedRatings]
        'incorrect': NormalizedRatings.FALSE,
        'pants-fire': NormalizedRatings.FALSE,
        'pants on fire': NormalizedRatings.FALSE,
        'pants on fire!': NormalizedRatings.FALSE,
        'false': NormalizedRatings.FALSE,
        'mostly correct': NormalizedRatings.MIXTURE,
        'mostly false': NormalizedRatings.MIXTURE,
        'barely true': NormalizedRatings.MIXTURE,
        'half true': NormalizedRatings.MIXTURE,
        'half-true': NormalizedRatings.MIXTURE,
        'mostly true': NormalizedRatings.MIXTURE,
        'true': NormalizedRatings.TRUE,
        'correct': NormalizedRatings.TRUE
    },
    "snopes": {  # type: Dict[str,NormalizedRatings]
        'false': NormalizedRatings.FALSE,
        'legend': NormalizedRatings.FALSE,
        'mixture': NormalizedRatings.MIXTURE,
        'mixture:': NormalizedRatings.MIXTURE,
        'true': NormalizedRatings.TRUE,
        'mostly false': NormalizedRatings.MIXTURE,
        'mostly true': NormalizedRatings.MIXTURE,
        'partly true': NormalizedRatings.MIXTURE,
        'MIXTURE OF TRUE AND FALSE INFORMATION': NormalizedRatings.MIXTURE,
        'MIXTURE OF TRUE AND FALSE INFORMATION:': NormalizedRatings.MIXTURE,
        'MIXTURE OF ACCURATE AND  INACCURATE INFORMATION': NormalizedRatings.MIXTURE
    },
    "africacheck": {  # type: Dict[str,NormalizedRatings]
        'incorrect': NormalizedRatings.FALSE,
        'mostly-correct': NormalizedRatings.MIXTURE,
        'correct': NormalizedRatings.TRUE
    },
    "factscan": {  # type: Dict[str,NormalizedRatings]
        'false': NormalizedRatings.FALSE,
        'true': NormalizedRatings.TRUE,
        'Misleading': NormalizedRatings.OTHER
    },
    "truthorfiction": {  # type: Dict[str,NormalizedRatings]
        'fiction': NormalizedRatings.FALSE,
        'truth': NormalizedRatings.TRUE,
        'truth & fiction': NormalizedRatings.MIXTURE,
        'mostly fiction': NormalizedRatings.MIXTURE,
        'truth & misleading': NormalizedRatings.MIXTURE,
        'mostly truth': NormalizedRatings.MIXTURE
    },
    "checkyourfact": {  # type: Dict[str,NormalizedRatings]
        'False': NormalizedRatings.FALSE,
        'True': NormalizedRatings.TRUE,
        'Mostly True': NormalizedRatings.MIXTURE,
        'true/false': NormalizedRatings.MIXTURE,
        'truth & misleading': NormalizedRatings.MIXTURE,
        'mostly truth': NormalizedRatings.MIXTURE,
        'misleading': NormalizedRatings.MIXTURE
    },
    "factcheck_aap": {
        "True": NormalizedRatings.TRUE,
        "False": NormalizedRatings.FALSE,
        "Mostly True": NormalizedRatings.MIXTURE,
        "Mostly False": NormalizedRatings.MIXTURE,
        "Somewhat True": NormalizedRatings.MIXTURE,
        "Somewhat False": NormalizedRatings.MIXTURE
    },
    "factuel_afp_fr": {
        'Faux': NormalizedRatings.FALSE,
        'Totalement faux': NormalizedRatings.FALSE,
        'Démenti': NormalizedRatings.FALSE,
        "C'est une oeuvre de fiction": NormalizedRatings.FALSE,
        'Vrai': NormalizedRatings.TRUE,
        'Totalement Vrai': NormalizedRatings.TRUE,
        'Plutôt vrai': NormalizedRatings.MIXTURE,
        'Trompeur': NormalizedRatings.MIXTURE,
        'trompeur': NormalizedRatings.MIXTURE,
        'Plutôt faux': NormalizedRatings.MIXTURE,
        'Presque': NormalizedRatings.MIXTURE,
        'Mélangé': NormalizedRatings.MIXTURE,
        'Mélange': NormalizedRatings.MIXTURE,
        'Inexact': NormalizedRatings.MIXTURE,
        'Incertain': NormalizedRatings.MIXTURE,
        'Imprécis': NormalizedRatings.MIXTURE,
        'Exagéré': NormalizedRatings.MIXTURE,
        'Douteux': NormalizedRatings.MIXTURE,
    },
    "factcheck_afp": {
        'False': NormalizedRatings.FALSE,
        'Fake': NormalizedRatings.FALSE,
        'Mixed': NormalizedRatings.MIXTURE,
        'Hoax': NormalizedRatings.FALSE,
        'Falso': NormalizedRatings.FALSE,
        'APRIL FOOL': NormalizedRatings.FALSE
    },
    "fullfact": {
        'Correct': NormalizedRatings.TRUE,
        'Incorrect': NormalizedRatings.FALSE,
        'Not quite': NormalizedRatings.MIXTURE
    }
}

def _standardize_name(original_name: str):
    return original_name.strip().lower().replace("!", "").replace(":", "").replace("-", " ")

def normalize(source_name, original_name) -> NormalizedRatings:
    '''
    Generate a normalized rating from the original ratings on each respective site
    :param original_name:
    :return normalized_rating: NormalizedRating
    '''
    try:
        source = _normalization_dictionary[source_name]
        normalized_value = source[_standardize_name(original_name)]
    except KeyError:
        normalized_value = NormalizedRatings.OTHER
    return normalized_value

# Cleaning the text
def remove_non_ascii(words):
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word) 
    return new_words

def remove_punctuation(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    #words = replace_numbers(words)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    return words

def clean_text(text):
    tokens = word_tokenize(text)
    tokens = normalize(tokens)
    text="".join([" "+i for i in tokens]).strip()
    return text

# Creating csv
def combine_csv():
    pol1 = pd.read_csv('datasets/pol1.csv', sep = ',')
    pol2 = pd.read_csv('datasets/pol2.csv', sep = ',')
    sno1 = pd.read_csv('datasets/sno1.csv', sep = ',')
    sno2 = pd.read_csv('datasets/sno2.csv', sep = ',')
    reste = pd.read_csv('datasets/reste.csv', sep = ',')

    pol = pol1.append(pol2, sort = False).reset_index(drop = True)
    sno = sno1.append(sno2, sort = False).reset_index(drop = True)
    polsno = pol.append(sno, sort = False).reset_index(drop = True)
    claimskg = polsno.append(reste, sort = False).reset_index(drop = True)

    polsno.to_csv('datasets/polsno.csv', sep = ',', index = False)
    claimskg.to_csv('datasets/claimskg.csv', sep = ',', index = False)

## Up-sampling & Down-sampling

In [None]:
def UpSample(df):
    print("Class before up-sample")
    display(pd.DataFrame(df['RatingName'].value_counts()))
    dfUpSampling_majority = df[df.RatingName == df['RatingName'].value_counts().index.tolist()[0]]
    dfUpSampling_minority = df[df.RatingName == df['RatingName'].value_counts().index.tolist()[-1]]

    dfUpSampling_minority_upsampled = resample(dfUpSampling_minority, replace=True, n_samples=df['RatingName'].value_counts().tolist()[0], random_state=123) 

    df_upsampled = pd.concat([dfUpSampling_majority, dfUpSampling_minority_upsampled])
    print("Class after up-sample")
    display(pd.DataFrame(df_upsampled['RatingName'].value_counts()))
    #index recheck

def DownSample(df):
    print("Class before down-sample")
    display(pd.DataFrame(df['RatingName'].value_counts()))
    dfDownSampling_majority = df[df.RatingName == df['RatingName'].value_counts().index.tolist()[0]]
    dfDownSampling_minority = df[df.RatingName == df['RatingName'].value_counts().index.tolist()[-1]]

    dfDownSampling_majority_Downsampled = resample(dfDownSampling_majority, replace=False, n_samples=df['RatingName'].value_counts().tolist()[-1], random_state=123) 

    df_Downsampled = pd.concat([dfDownSampling_majority_Downsampled, dfDownSampling_minority])
    print("Class after down-sample")
    display(pd.DataFrame(df_Downsampled['RatingName'].value_counts()))
    #index recheck

<div class="alert alert-block alert-info" align="center">
    <h1>
        Loading the dataset
    </h1>
</div>

In [677]:
labels = ["ID", "ClaimReviewAuthor", "ClaimReviewAuthorName", "ClaimReviewAuthorURL",
          "ClaimReviewClaimReviewed", "ClaimReviewDatePublished", "ClaimReviewSource", "ClaimReviewURL",
          "CreativeWorkAuthorName", "CreativeWorkAuthorSameAs", "CreativeWorkDatePublished", "ExtraBody",
          "ExtraEntitiesAuthor", "ExtraEntitiesBody", "ExtraEntitiesClaimReviewClaimReviewed", "ExtraEntitiesKeywords",
          "ExtraReferedLinks", "ExtraTags", "ExtraTitle", "RatingAlternateName",
          "RatingBestRating", "RatingRatingValue", "RatingWorstRating"]

labelsClaimsKG = ["ID", "Text", "Date", "TruthRating", "RatingName", "Author", "Headline",
                  "NamedEntitiesClaim", "NamedEntitiesArticle", "Keywords", "Source", "SourceURL", "Link", "Language"]

df = pd.read_csv('datasets/ClaimsKGTest.csv', sep = ',', names = labelsClaimsKG, skiprows = 1, nrows = 1000)
#df = df.sample(frac = 1).reset_index(drop = True)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Printing informations
    </h1>
</div>

## Shape

In [661]:
print(f'Shape :\n{df.shape}')

## Informations

In [642]:
print(f'Informations :')
df.info()

## Description

In [643]:
print(f'Description :')
display(df.describe())

## Printing some lines

In [667]:
print(f'Printing some lines :')
display(df.head())

Printing some lines :


Unnamed: 0,ID,Text,Date,TruthRating,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,SourceURL,Link,Language
0,http://data.gesis.org/claimskg/claim_review/1f...,An Oklahoma teacher was fired for praying when...,Unknown,1,FALSE,Unknown,Teacher Fired for Praying in School During Okl...,"Craigslist,Jerry. “Moore,Match.com,Moore, Ok...",,ASP Article,snopes,http://www.snopes.com,https://www.snopes.com/fact-check/storm-warning/,English
1,http://data.gesis.org/claimskg/claim_review/79...,Senator Ted Cruz claimed flooding in Texas was...,Unknown,1,FALSE,Unknown,Ted Cruz: The Texas Floods Were Caused by Nati...,"Native American,Rain Dances,Ted Cruz,Twitter,f...","Native American,Ted Cruz,Texas,rain dances","ASP Article, Not Necessarily The News",snopes,http://www.snopes.com,https://www.snopes.com/fact-check/rain-schtick/,English
2,http://data.gesis.org/claimskg/claim_review/4b...,There has been a rise in deaths from heart dis...,Unknown,-1,OTHER,Unknown,,,,"Home,Health,How healthy are we?",fullfact,https://fullfact.org/,https://fullfact.org/health/heart-disease-deat...,English
3,http://data.gesis.org/claimskg/claim_review/64...,A photograph shows seal hugging a beluga whale.,Unknown,1,FALSE,Unknown,Does This Photograph Show a Seal Hugging a Bel...,"Facebook,Getty Images,beluga whale,to show,wor...",beluga whale,"beluga whale, fauxtography, seal",snopes,http://www.snopes.com,https://www.snopes.com/fact-check/whale-hugs-s...,English
4,http://data.gesis.org/claimskg/claim_review/a4...,Obama Sex Video,Unknown,-1,OTHER,Unknown,Obama Sex Video Virus,"Barack Obama,CNET Networks,CNET Networks, Inc,...",,"ASP Article, Virus Hoaxes & Realities",snopes,http://www.snopes.com,https://www.snopes.com/fact-check/obama-sex-vi...,English


## Affichage d'informations sur toutes les colonnes

In [645]:
for column in df.columns:
      print(f'Nombre de valeurs nulles pour {column} :\n{df[column].isnull().value_counts()}\n')

## Affichage des colonnes vides

In [646]:
array = []
for column in df.columns:
    value = False
    value = df[column].isnull().any()
    if value:
        array.append(column)
print(f'Nombre de colonnes vides : {len(array)}\nLes colonnes vide sont :\n{array}')

## Description de toutes les colonnes

In [647]:
for column in df.columns:
    display(df[column].describe())

## Get a series of unique values in each column of the dataframe

In [648]:
for column in df.columns:
    uniqueValues = df[column].unique()
    print(f'Number of unique elements in column {column} : {len(uniqueValues)}, values & type :\n{uniqueValues}\n')

Number of unique elements in column ID : 1000, values & type :
['http://data.gesis.org/claimskg/claim_review/becd1657-d104-5a3e-94de-61f26b8e3f6e'
 'http://data.gesis.org/claimskg/claim_review/8a8a0f17-b1ec-5f86-8ad4-4bb0f1dd5cf5'
 'http://data.gesis.org/claimskg/claim_review/c215a999-c7cf-5c9f-bada-bcddfccd9a56'
 'http://data.gesis.org/claimskg/claim_review/1eff01a2-4c7d-5a9c-928a-1fa9a888ddeb'
 'http://data.gesis.org/claimskg/claim_review/0b7ea833-4928-5405-b0e0-12b2b31482b6'
 'http://data.gesis.org/claimskg/claim_review/a18e6db1-5ab3-5cbd-b913-155ba758039d'
 'http://data.gesis.org/claimskg/claim_review/b46c9e5a-3d3e-5292-aec5-65e9aa4c1f16'
 'http://data.gesis.org/claimskg/claim_review/b1f0ccba-1f2f-5e3f-b2a3-827a4f21dc1d'
 'http://data.gesis.org/claimskg/claim_review/78b5ce3e-a8db-52a2-8c4e-5e1a71d651d3'
 'http://data.gesis.org/claimskg/claim_review/9c52e9fc-3231-52b6-944c-94ca5f80abd5'
 'http://data.gesis.org/claimskg/claim_review/ad4543c1-9a8b-59b6-b68f-0c5f2a8ab3ee'
 'http://data

Number of unique elements in column Headline : 988, values & type :
['Did Malia Obama Cash a $1.2 Million Check?' 'High Diver Saved By Cross'
 "Moran says drilling off Virginia's coast will net only $40 million for U.S. over 10 years"
 'Health care advocacy group blasts insurers for CEO pay packages'
 'Ted Cruz: Vets Should Sell Cookies for Funding, Like Girl Scouts'
 'Did Airline Passengers Mistake a Hijacking for a ‘Candid Camera’ Stunt?'
 "Sen. Bill Nelson says offshore drilling won't pay for Florida schools"
 'Luck runs out for student on lottery claim'
 'Barack Obama ad says manufacturing jobs fared worse in Massachusetts under Romney'
 'Panda Express 15th Anniversary Offer'
 'Is there Planned Parenthood funding in the bill that stops a government shutdown?'
 'Americans United for Life says 1,270 babies died after attempted abortions in 2010'
 'Does the U.S. admit more legal immigrants than the rest of the world combined?'
 'Newt Gingrich accuses Obama administration of cutting de

## Affichage du nombre des différents TruthRating

In [649]:
print(f'La colonne TruthRating contient :')
print(f'{df[df["TruthRating"]==-1]["ID"].count()} Other')
print(f'{df[df["TruthRating"]==1]["ID"].count()} False')
print(f'{df[df["TruthRating"]==2]["ID"].count()} Mixture')
print(f'{df[df["TruthRating"]==3]["ID"].count()} True')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Visualization
    </h1>
</div>

In [650]:
chart = sns.countplot(x = 'Source', data = df)
plt.setp(chart.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.show()

In [651]:
chart = sns.catplot(x = 'Source', col = 'RatingName', kind = 'count', data = df)
for ax in chart.axes.ravel():
    plt.setp(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.show()

In [652]:
chart = sns.catplot('Source', data = df, hue = 'RatingName', kind = 'count')
for ax in chart.axes.ravel():
    plt.setp(ax.get_xticklabels(), rotation = 45, horizontalalignment = 'right')
plt.show()

In [653]:
sns.heatmap(df.isnull(), cbar = False)
plt.show()

<div class="alert alert-block alert-info" align="center">
    <h1>
        Pre-processing
    </h1>
</div>

<div align="center">
    <h1>
        General pre-processing
    </h1>
</div>

## Remove unnecessary columns

In [678]:
df = df.drop(['ID', 'Date', 'TruthRating', 'SourceURL', 'Link', 'Language'], axis = 1)
display(df.head())

## Remove unnecessary rows

In [679]:
# Deleting claims with OTHER RatingName
df = df[df.RatingName != 'OTHER']

display(df.head())

## Replacing "Unknown" & NaN by "Inconnue"

In [680]:
for column in df.columns:
    df[column].replace(to_replace = 'Unknown', value = 'Inconnue', inplace = True)
    df[column].replace(np.NaN, 'Inconnue', inplace = True)
    
display(df.head())

## Text cleaning

In [681]:
for column in df.columns:
    if(column != 'RatingName'):
        df[column] = df[column].apply(lambda x: clean_text(x))

display(df.head())
display(df.describe())

## To do the treatments

In [705]:
classLabelEncoder = LabelEncoder()

tfidfVectorizer = TfidfVectorizer()

tfidfTransformer1 = TfidfTransformer()
tfidfTransformer2 = TfidfTransformer(use_idf = False)

countVectorizer = CountVectorizer()

standardScaler = StandardScaler()
minMaxScaler = MinMaxScaler()

# How to use
#df = pd.DataFrame(standardScaler.fit_transform(df), columns = ['Name'])

<div align="center">
    <h1>
        TRUE vs FALSE
    </h1>
</div>

In [714]:
dfTvsF = df.copy()

# Suppression de MIXTURE
dfTvsF = dfTvsF[dfTvsF.RatingName != 'MIXTURE']

# Splitting the datafram
dfTvsFHeadlineText = dfTvsF["Headline"] + " " + dfTvsF["Text"]
dfTvsFRatingName = dfTvsF['RatingName']
dfTvsFAuthor = dfTvsF['Author']
dfTvsFNamedEntitiesClaim = dfTvsF['NamedEntitiesClaim']
dfTvsFNamedEntitiesArticle = dfTvsF['NamedEntitiesArticle']
dfTvsFKeywords = dfTvsF['Keywords']
dfTvsFSource = dfTvsF['Source']

<div align="center">
    <h2>
        First attemp
    </h2>
</div>

## Make a copy of every column

In [715]:
dfTvsFHeadlineText1 = dfTvsFHeadlineText.copy()
dfTvsFAuthor1 = dfTvsFAuthor.copy()
dfTvsFNamedEntitiesClaim1 = dfTvsFNamedEntitiesClaim.copy()
dfTvsFNamedEntitiesArticle1 = dfTvsFNamedEntitiesArticle.copy()
dfTvsFKeywords1 = dfTvsFKeywords.copy()
dfTvsFSource1 = dfTvsFSource.copy()
dfTvsFRatingName1 = dfTvsFRatingName.copy()

## Transform data

In [716]:
dfTvsFHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFHeadlineText1), columns = ['HeadlineText'])

dfTvsFAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFAuthor1), columns = ['Author'])

dfTvsFNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFNamedEntitiesClaim1), columns = ['NamedEntitiesClaim'])

dfTvsFNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFNamedEntitiesArticle1), columns = ['NamedEntitiesArticle'])

dfTvsFKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFKeywords1), columns = ['Keywords'])

dfTvsFSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFSource1), columns = ['Source'])

dfTvsFRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFRatingName1), columns = ['RatingName'])

dfTvsF1 = pd.concat([dfTvsFHeadlineText1, dfTvsFAuthor1, dfTvsFNamedEntitiesClaim1, dfTvsFNamedEntitiesArticle1, dfTvsFKeywords1, dfTvsFSource1, dfTvsFRatingName1], axis = 1)

display(dfTvsF1.head())

## Saving the transformed data

In [717]:
dfTvsF1.to_csv('attemps/tf1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        Second attemp
    </h2>
</div>

## Make a copy of every column

In [718]:
dfTvsFHeadlineText2 = dfTvsFHeadlineText.copy()
dfTvsFAuthor2 = dfTvsFAuthor.copy()
dfTvsFNamedEntitiesClaim2 = dfTvsFNamedEntitiesClaim.copy()
dfTvsFNamedEntitiesArticle2 = dfTvsFNamedEntitiesArticle.copy()
dfTvsFKeywords2 = dfTvsFKeywords.copy()
dfTvsFSource2 = dfTvsFSource.copy()
dfTvsFRatingName2 = dfTvsFRatingName.copy()

## Transform data

In [719]:
dfTvsFHeadlineText2 = pd.DataFrame(tfidfVectorizer.fit_transform(dfTvsFHeadlineText2).toarray(), columns = tfidfVectorizer.get_feature_names())
dfTvsFHeadlineText2 = pd.DataFrame(minMaxScaler.fit_transform(dfTvsFHeadlineText2), columns = tfidfVectorizer.get_feature_names())

dfTvsFSource2 = pd.get_dummies(dfTvsFSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

dfTvsFRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFRatingName2), columns = ['RatingName'])

# Pas encore traité
dfTvsFAuthor2 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFAuthor2), columns = ['Author'])
dfTvsFAuthor2 = pd.DataFrame(minMaxScaler.fit_transform(dfTvsFAuthor2), columns = ['Author'])

dfTvsFNamedEntitiesClaim2 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFNamedEntitiesClaim2), columns = ['NamedEntitiesClaim'])
dfTvsFNamedEntitiesClaim2 = pd.DataFrame(minMaxScaler.fit_transform(dfTvsFNamedEntitiesClaim2), columns = ['NamedEntitiesClaim'])

dfTvsFNamedEntitiesArticle2 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])
dfTvsFNamedEntitiesArticle2 = pd.DataFrame(minMaxScaler.fit_transform(dfTvsFNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])

dfTvsFKeywords2 = pd.DataFrame(classLabelEncoder.fit_transform(dfTvsFKeywords2), columns = ['Keywords'])
dfTvsFKeywords2 = pd.DataFrame(minMaxScaler.fit_transform(dfTvsFKeywords2), columns = ['Keywords'])

dfTvsF2 = pd.concat([dfTvsFHeadlineText2, dfTvsFSource2, dfTvsFRatingName2], axis = 1)

display(dfTvsF2)

Unnamed: 0,05,10,100,1000,10000,100000,100k,101st,102000,106000,...,zimmerman,zipper,zippered,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,1
486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0


## Saving the transformed data

In [721]:
dfTvsF2.to_csv('attemps/tf2.csv', sep = ';', index = False)

<div align="center">
    <h1>
        TRUE/FALSE vs MIXTURE
    </h1>
</div>

In [722]:
dfTFvsM = df.copy()

# Rename TRUE FALSE BY NONMIXTURE
dfTFvsM['RatingName'].replace(to_replace = 'TRUE', value = 'NONMIXTURE', inplace = True)
dfTFvsM['RatingName'].replace(to_replace = 'FALSE', value = 'NONMIXTURE', inplace = True)

# Splitting the datafram
dfTFvsMHeadline = dfTFvsM['Headline']
dfTFvsMText = dfTFvsM['Text']
dfTFvsMRatingName = dfTFvsM['RatingName']
dfTFvsMAuthor = dfTFvsM['Author']
dfTFvsMNamedEntitiesClaim = dfTFvsM['NamedEntitiesClaim']
dfTFvsMNamedEntitiesArticle = dfTFvsM['NamedEntitiesArticle']
dfTFvsMKeywords = dfTFvsM['Keywords']
dfTFvsMSource = dfTFvsM['Source']

<div align="center">
    <h2>
        First attemp
    </h2>
</div>

## Make a copy of every column

In [723]:
dfTFvsMHeadline1 = dfTFvsMHeadline.copy()
dfTFvsMText1 = dfTFvsMText.copy()
dfTFvsMAuthor1 = dfTFvsMAuthor.copy()
dfTFvsMNamedEntitiesClaim1 = dfTFvsMNamedEntitiesClaim.copy()
dfTFvsMNamedEntitiesArticle1 = dfTFvsMNamedEntitiesArticle.copy()
dfTFvsMKeywords1 = dfTFvsMKeywords.copy()
dfTFvsMSource1 = dfTFvsMSource.copy()
dfTFvsMRatingName1 = dfTFvsMRatingName.copy()

## Transform data

In [724]:
dfTFvsMHeadline1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMHeadline1))
dfTFvsMText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMText1))
dfTFvsMAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMAuthor1))
dfTFvsMNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMNamedEntitiesClaim1))
dfTFvsMNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMNamedEntitiesArticle1))
dfTFvsMKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMKeywords1))
dfTFvsMSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMSource1))
dfTFvsMRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfTFvsMRatingName1))

#TODO HOW TO PUT NAMES
dfTFvsM1 = pd.concat([dfTFvsMHeadline1, dfTFvsMText1, dfTFvsMAuthor1, dfTFvsMNamedEntitiesClaim1, dfTFvsMNamedEntitiesArticle1, dfTFvsMKeywords1, dfTFvsMSource1, dfTFvsMRatingName1], axis = 1)

display(dfTFvsM1.head())

## Saving the transformed data

In [725]:
dfTFvsM1.to_csv('attemps/tfm1.csv', sep = ';', index = False)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Classification
    </h1>
</div>

<div align="center">
    <h1>
        Preparing attemp 1 data for classification
    </h1>
</div>

## Reading the transformed data for the classification

In [727]:
dfClassification1 = pd.read_csv('attemps/tf2.csv', sep = ';')
display(dfClassification1.head())

Unnamed: 0,05,10,100,1000,10000,100000,100k,101st,102000,106000,...,zimmerman,zipper,zippered,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0


## Define the learning variables and the variable to predict

In [728]:
array1 = dfClassification1.values
X1 = array1[:,0:3447]
y1 = array1[:,3447]

## Cut the data set into a test set and a learning set

In [729]:
myTrainSize = 0.3 # 30% du jeu de données pour le test
myTestSize = 1 - myTrainSize # 70% du jeu de données pour l'entraînement
seed = 30

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, train_size = myTrainSize, random_state = seed, test_size = myTestSize)

<div align="center">
    <h1>
        Testing the first classifier on attemp 1
    </h1>
</div>

## GaussianNB classifier

In [730]:
clfGaussianNB = GaussianNB()

clfGaussianNB.fit(X_train1, y_train1)

resultGaussianNB = clfGaussianNB.predict(X_test1)

print(f'accuracy : {accuracy_score(resultGaussianNB, y_test1)}')

accuracy : 0.6491228070175439


## Display the confusion matrix and the classification report

In [731]:
print (f'Matrice de confusion :\n{confusion_matrix(y_test1, resultGaussianNB)}')
print (f'Classification report :\n{classification_report(y_test1, resultGaussianNB)}')

Matrice de confusion :
[[193  64]
 [ 56  29]]
Classification report :
              precision    recall  f1-score   support

         0.0       0.78      0.75      0.76       257
         1.0       0.31      0.34      0.33        85

    accuracy                           0.65       342
   macro avg       0.54      0.55      0.54       342
weighted avg       0.66      0.65      0.65       342



## Cross validate with 10 splits (Kfold)

In [732]:
seed = 7
myKFold = KFold(n_splits = 10, shuffle = True, random_state = seed)

## Apply the GaussianNB classifier and give the different accuracy for the 10 evaluations

In [733]:
clfGaussianNB = GaussianNB()

myScoring = 'accuracy'

score = cross_val_score(clfGaussianNB, X1, y1, cv = myKFold, scoring = myScoring)

print(f'Les différentes accuracy pour les 10 évaluations sont :\n{score}')
print(f'Accuracy moyenne : {score.mean()} | Standard deviation : {score.std()}')

Les différentes accuracy pour les 10 évaluations sont :
[0.53061224 0.59183673 0.59183673 0.69387755 0.73469388 0.67346939
 0.57142857 0.67346939 0.625      0.72916667]
Accuracy moyenne : 0.6415391156462585 | Standard deviation : 0.06600700606012395


<div align="center">
    <h1>
        Testing several classifiers
    </h1>
</div>

In [734]:
models = []

models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma = 'auto')))
models.append(('RFO', RandomForestClassifier()))
#models.append(('LR', LogisticRegression()))
#models.append(('LSVC', LinearSVC(max_iter = 3000)))
#models.append(('DTR', DecisionTreeRegressor()))

## Without shuffle

In [None]:
seed = 7
myScoring = 'accuracy'
scores = []
names = []

for name, model in models:
    myKFold = KFold(n_splits = 10, random_state = seed)
    startTime = time.time()
    score = cross_val_score(model, X1, y1, cv = myKFold, scoring = myScoring)
    endTime = time.time()
    scores.append(score)
    names.append(name)
    print(f'{name}\t({score.mean()}\t| {score.std()} \t| Time : {endTime - startTime})\n')

## Displaying results of the different classifiers

In [None]:
fig = plt.figure()
fig.suptitle('Comparaison des algorithmes')
ax = fig.add_subplot(111)
plt.boxplot(scores)
ax.set_xticklabels(names)
plt.show()

## With shuffle

In [None]:
scores = []
names = []

for name, model in models:
    myKFold = KFold(n_splits = 10, shuffle = True, random_state = seed)
    startTime = time.time()
    score = cross_val_score(model, X1, y1, cv = myKFold, scoring = myScoring)
    endTime = time.time()
    scores.append(score)
    names.append(name)
    print(f'{name}\t({score.mean()}\t| {score.std()} | \tTime : {endTime - startTime})\n')

## Displaying results of the different classifiers

In [None]:
fig = plt.figure()
fig.suptitle('Comparaison des algorithmes')
ax = fig.add_subplot(111)
plt.boxplot(scores)
ax.set_xticklabels(names)
plt.show()

## Apply GridSearchCV to RandomForestClassifier

In [None]:
gridParam = {'n_estimators': [4, 6, 9], 
             'max_features': ['log2', 'sqrt','auto'], 
             'criterion': ['entropy', 'gini'], 
             'max_depth': [2, 3, 5, 10], 
             'min_samples_split': [2, 3, 5], 
             'min_samples_leaf': [1, 5, 8]
            }

myScoring = 'accuracy'

clfGridSearchCV = GridSearchCV(estimator = RandomForestClassifier(), param_grid = gridParam, scoring = myScoring, cv = 5, n_jobs = -1, iid = True, return_train_score = True)

clfGridSearchCV.fit(X_train1, y_train1)

print(f'meilleur score : {clfGridSearchCV.best_score_}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

## Apply GridSearchCV to DecisionTreeClassifier

In [None]:
gridParam = {'max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
             'criterion' : ['gini', 'entropy'], 
             'min_samples_leaf' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
            }

myScoring = 'accuracy'

clfGridSearchCV = GridSearchCV(estimator = DecisionTreeClassifier(), param_grid = gridParam, scoring = myScoring, cv = 10, n_jobs = -1, iid = True, return_train_score = True)

clfGridSearchCV.fit(X_train1, y_train1)  

print(f'meilleur score : {clfGridSearchCV.best_score_}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

## Apply GridSearchCV to SVC

In [None]:
gridParam = {'C' : [0.001, 0.01, 0.1, 1, 10], 
             'gamma' : [0.001, 0.01, 0.1, 1], 
             'kernel' : ['linear', 'rbf']
            }

myScoring = 'accuracy'

clfGridSearchCV = GridSearchCV(estimator = SVC(), param_grid = gridParam, scoring = myScoring, cv = 5, n_jobs = 1, iid = True, return_train_score = True)

clfGridSearchCV.fit(X_train1, y_train1)

print(f'meilleur score : {clfGridSearchCV.best_score_}')
print(f'meilleurs paramètres :\n{clfGridSearchCV.best_params_}')
print(f'meilleur estimateur :\n{clfGridSearchCV.best_estimator_}')

## Do a gridsearch taking the previous parameters

In [None]:
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'SVM' : SVC()
}

params = {
    'RandomForestClassifier' : [
        {'n_estimators' : [9]}, 
        {'max_features' : ['auto']}, 
        {'criterion' : ['gini']}, 
        {'max_depth' : [10]}, 
        {'min_samples_split' : [3]}, 
        {'min_samples_leaf' : [1]}
    ], 
    'DecisionTreeClassifier' : [
        {'max_depth' : [2]}, 
        {'criterion' : ['entropy']}, 
        {'min_samples_leaf' : [7]}
    ],
    'SVM' : [
        {'C': [0.1]}, 
        {'gamma': [0.001]}, 
        {'kernel': ['linear']}
    ]
}

In [None]:
class Result:
    def __init__(self, name, score, parameters):
        self.name = name
        self.score = score
        self.parameters = parameters
    def __repr__(self):
        return repr((self.name, self.score, self.parameters))

results = []
myScoring = 'accuracy'

for key, value in classifiers.items():
    clfGridSearchCV = GridSearchCV(estimator = value, param_grid = params[key], scoring = myScoring, cv = 10, n_jobs = 1, iid = True)
    clfGridSearchCV.fit(X_train1, y_train1)
    result = Result(key, clfGridSearchCV.best_score_, clfGridSearchCV.best_estimator_)
    results.append(result)

results = sorted(results, key = lambda result: result.score, reverse = True)

print(f'Le meilleur resultat :')
print(f'\tClassifier : {results[0].name} | score : {results[0].score} | parameters :\n\t\t{results[0].parameters}')

print(f'Tous les résultats :')
for result in results:
    print(f'\n\tClassifier : {result.name} | score : {result.score} | parameters :\n\t\t{result.parameters}')

## Save the best learned model

In [None]:
filename = 'modeles/best.sav'
pickle.dump(results[0].parameters, open(filename, 'wb'))

## Reload the best model to test it with y_test

In [None]:
filename = 'modeles/best.sav'
clfLoaded = pickle.load(open(filename, 'rb'))
print(f'Modèle chargé :\n{clfLoaded}\n')

result = clfLoaded.predict(X_test1)

print(f'Accuracy : {accuracy_score(result, y_test1)}\n')
print(f'Matrice de confusion :\n{confusion_matrix(y_test1, result)}\n')
print(f'Classification report :\n{classification_report(y_test1, result)}')

In [None]:
dfTvsFUpSampling = dfTvsF.copy()
dfTvsFUpSampling['RatingName'].value_counts()

In [None]:
dfTvsFUpSampling_majority = dfTvsFUpSampling[dfTvsFUpSampling.RatingName == 'FALSE']
dfTvsFUpSampling_minority = dfTvsFUpSampling[dfTvsFUpSampling.RatingName == 'TRUE']
 
dfTvsFUpSampling_minority_upsampled = resample(dfTvsFUpSampling_minority, replace=True, n_samples=358, random_state=123) 

dfTvsF_upsampled = pd.concat([dfTvsFUpSampling_majority, dfTvsFUpSampling_minority_upsampled])
 
dfTvsF_upsampled['RatingName'].value_counts()
