<div align="center"><h1> Projet Data Science </h1></div>
<div align="center"><h2> Classification d'assertions selon leurs valeurs de véracité (automatic fact-checking) </h2></div>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Executing the basics
    </h1>
</div>

In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import warnings
import nltk
import pickle
import unicodedata
import inflect
import re
import time
import contractions

from enum import Enum
from functools import reduce
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

# Hide FutureWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category = FutureWarning)

# Look up each NLTK resource locally and download its package only when the
# lookup fails.
for resource_path, package_name in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/wordnet', 'wordnet'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Cleaning the text
def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Tokens are NFKD-normalised first, so an accented letter decomposes into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    drops whatever cannot be represented.

    Returns a new list with one (possibly empty) string per input token.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lower-cased form of every token."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Every character that is neither a word character (``\\w``) nor
    whitespace is removed; a token that becomes the empty string is omitted
    from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out all-digit tokens in words, leaving other tokens unchanged.

    A token counts as a number when ``str.isdigit()`` is true for it; such
    tokens are converted with ``inflect``'s ``number_to_words``.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Return the tokens that are not NLTK English stop words.

    The stop-word list is loaded once per call and held in a set, so each
    membership test is O(1) rather than re-reading the corpus list and
    scanning it linearly for every token.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def separate_letter_number(words):
    """Split every token into alternating digit and non-digit runs.

    For example ``'abc123'`` becomes ``'abc', '123'``. The pieces of all
    tokens are returned as one flat list, in order. An empty input list
    yields an empty list, and an empty-string token contributes nothing.
    """
    pieces = []
    for word in words:
        # Raw string: '\d' / '\D' are regex classes, not Python string escapes.
        pieces.extend(re.findall(r'\d+|\D+', word))
    return pieces

def replace_contractions(text):
    """Return ``text`` with English contractions expanded.

    Delegates to ``contractions.fix``, the same call ``clean_text`` applies
    before tokenising.
    """
    return contractions.fix(text)

def normalize(words):
    """Run a token list through the full cleaning pipeline.

    The steps run in this fixed order, each receiving the previous step's
    output: ASCII folding, lower-casing, letter/digit splitting, number
    spelling, punctuation removal, stop-word removal.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        separate_letter_number,
        replace_numbers,
        remove_punctuation,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

def clean_text(text):
    """Clean a raw string and return its tokens joined by single spaces.

    Contractions are expanded, the result is tokenised with NLTK's
    ``word_tokenize``, the tokens are normalised, and the surviving tokens
    are joined back together with surrounding whitespace stripped.
    """
    expanded = contractions.fix(text)
    cleaned_tokens = normalize(word_tokenize(expanded))
    return " ".join(cleaned_tokens).strip()

# Up-sampling & Down-sampling
def sampling(df_func, *args, **kwargs):
    """Balance the two extreme ``RatingName`` classes of ``df_func``.

    Only the rows of the most frequent and the least frequent ``RatingName``
    values are kept; rows of any other class are not part of the result.

    Keyword arguments:
        sample: ``'up'`` draws from the minority class with replacement until
            it matches the majority size, then shuffles the combined frame.
            ``'down'`` draws from the majority class without replacement down
            to the minority size; the result is not shuffled.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, or ``None`` (after printing
        a hint) when ``sample`` is missing or not one of ``'up'`` / ``'down'``.

    Positional ``*args`` are accepted but unused.
    """
    sampling_type = kwargs.get("sample", None)
    if sampling_type not in ['up', 'down']:
        print('Please select somthing in [\'up\', \'down\']')
        return None

    # Count once; value_counts() is sorted by descending frequency.
    class_counts = df_func['RatingName'].value_counts()
    majority_label = class_counts.index[0]
    minority_label = class_counts.index[-1]

    majority = df_func[df_func['RatingName'] == majority_label].reset_index(drop = True)
    minority = df_func[df_func['RatingName'] == minority_label].reset_index(drop = True)

    if sampling_type == 'up':
        upsampled = resample(minority, replace = True, n_samples = int(class_counts.iloc[0]), random_state = 123)
        # Seed the shuffle too, so the up-sampled frame is identical on every run.
        return pd.concat([majority, upsampled]).sample(frac = 1, random_state = 123).reset_index(drop = True)

    downsampled = resample(majority, replace = False, n_samples = int(class_counts.iloc[-1]), random_state = 123)
    return pd.concat([downsampled, minority]).reset_index(drop = True)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Reading data
    </h1>
</div>

In [2]:
# Load the comma-separated claims dataset.
df = pd.read_csv('datasets/generated.csv', sep=',')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Pre-processing
    </h1>
</div>

## Remove unnecessary columns

In [3]:
# Discard the columns that are not used by the rest of the notebook.
df = df.drop(columns=['ID', 'Date', 'TruthRating', 'SourceURL', 'Link', 'Language'])
display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,Says Marco Rubio 'skipped 18 defense votes inc...,MIXTURE,Keep the Promise 1 PAC,Pro-Ted Cruz PAC says Marco Rubio skipped 18 d...,"American Enterprise Institute,Ash Carter,Barac...","ISIS,Kurds,Marco Rubio","Military,Voting Record",politifact
1,Does The US Have Trade Deficits With ‘Almost A...,OTHER,Unknown,Does The US Have Trade Deficits With ‘Almost A...,"Alan Deardorff,Brazil,Bureau of Economic Analy...",,,checkyourfact
2,"'There is great disparity, tremendous disparit...",MIXTURE,Janet Adkins,Traditional schools get much more capital fund...,"Education, Florida,Erik Fresen,Florida Departm...",,"Education,State Budget,Taxes",politifact
3,Target.com or Target blocked Israel from acces...,MIXTURE,Unknown,Target Blocks Israel from E-Commerce Access?,"Chabin,Forever 21,Israel,Jerusalem,Old Navy,Ta...","Israel,Target.com","israel, onlysimchas, target, target.com",snopes
4,"Each refugee in Canada receives $3,874 every m...",MIXTURE,Unknown,"Do ‘Illegal’ Refugees Receive $3,874 Per Month...","British Columbia,Canadian Council for Refugees...",,,snopes


## Remove unnecessary rows

In [4]:
# Drop the claims rated OTHER or MIXTURE
df = df[~df.RatingName.isin(['OTHER', 'MIXTURE'])]

display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
5,U.S. tariffs on China are 'not hurting anybody...,False,Peter Navarro,Donald Trump's tariffs on China don't hurt Ame...,"Boise State University,CNN,Caterpillar,Coca-Co...",,"Taxes,Trade",politifact
7,Chick-fil-A provided free food to motorists st...,True,Unknown,Did Chick-fil-A Give Free Food to Motorists St...,"ABC,ABC News,AL.com,Birmingham, Alabama,Chick-...","Birmingham, Alabama,Chick-fil-A",,snopes
8,'State Department officials actually directed ...,False,Mike Pence,Mike Pence wrong on Haiti contracts steered to...,"ABC,ABC News,Bill Clinton,Breitbart website,Ce...",,Ethics,politifact
9,Says Barack 'Obama admits he’s coming for our ...,False,National Rifle Association,"Barack Obama coming after guns, 'under the rad...","Barack Obama,Brady Campaign to Prevent Gun Vio...",,Guns,politifact
11,"'In Mexico, they don't have birth certificates...",False,Pete Gallego,Rep. Pete Gallego says Mexicans have one natio...,"Bolivia,Federal Electoral Institute,Haiti,Leo ...",,Immigration,politifact


## Replacing "Unknown" & NaN with "Inconnue"

In [5]:
# Replace the 'Unknown' placeholder and every missing value with 'Inconnue'.
# The result is assigned back to df instead of calling an inplace method on
# df[column]: that is chained assignment, which does not update df under
# pandas Copy-on-Write. fillna also avoids the np.NaN alias, which NumPy 2.0
# removed.
df = df.replace(to_replace = 'Unknown', value = 'Inconnue').fillna('Inconnue')

display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
5,U.S. tariffs on China are 'not hurting anybody...,False,Peter Navarro,Donald Trump's tariffs on China don't hurt Ame...,"Boise State University,CNN,Caterpillar,Coca-Co...",Inconnue,"Taxes,Trade",politifact
7,Chick-fil-A provided free food to motorists st...,True,Inconnue,Did Chick-fil-A Give Free Food to Motorists St...,"ABC,ABC News,AL.com,Birmingham, Alabama,Chick-...","Birmingham, Alabama,Chick-fil-A",Inconnue,snopes
8,'State Department officials actually directed ...,False,Mike Pence,Mike Pence wrong on Haiti contracts steered to...,"ABC,ABC News,Bill Clinton,Breitbart website,Ce...",Inconnue,Ethics,politifact
9,Says Barack 'Obama admits he’s coming for our ...,False,National Rifle Association,"Barack Obama coming after guns, 'under the rad...","Barack Obama,Brady Campaign to Prevent Gun Vio...",Inconnue,Guns,politifact
11,"'In Mexico, they don't have birth certificates...",False,Pete Gallego,Rep. Pete Gallego says Mexicans have one natio...,"Bolivia,Federal Electoral Institute,Haiti,Leo ...",Inconnue,Immigration,politifact


## Cleaning

In [6]:
# Clean every value of every column on a copy, leaving df untouched.
# RatingName is cleaned too, so its labels end up lower-cased.
dfClean = df.copy()
for column in dfClean.columns:
    dfClean[column] = dfClean[column].apply(clean_text)

display(dfClean.head(), dfClean.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
5,yous tariffs china hurting anybody united states,False,peter navarro,donald trump tariffs china hurt americans top ...,boise state university cnn caterpillar cocacol...,inconnue,taxes trade,politifact
7,chickfila provided free food motorists strande...,True,inconnue,chickfila give free food motorists stranded sn...,abc abc news alcom birmingham alabama chickfil...,birmingham alabama chickfila,inconnue,snopes
8,state department officials actually directed c...,False,mike pence,mike pence wrong haiti contracts steered clint...,abc abc news bill clinton breitbart website ce...,inconnue,ethics,politifact
9,says barack obama admits coming guns telling s...,False,national rifle association,barack obama coming guns radar nra says,barack obama brady campaign prevent gun violen...,inconnue,guns,politifact
11,mexico birth certificates registration cards v...,False,pete gallego,rep pete gallego says mexicans one national id...,bolivia federal electoral institute haiti leo ...,inconnue,immigration,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,495,495,495,495,495,495,495,495
unique,495,2,169,495,495,301,352,4
top,innercity students fall back succeed whereas l...,false,inconnue,hiker finds creepy shrine missing persons new ...,aclu allen mcneill american civil liberties un...,inconnue,inconnue,snopes
freq,1,357,263,1,1,165,33,241


# Upsampling

In [7]:
# Grow the minority class up to the majority size.
dfCleanUpsample = sampling(dfClean, sample='up')
display(dfCleanUpsample.head(), dfCleanUpsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,says federal gas tax raised since one thousand...,True,george voinovich,sen george voinovich suggests twentyfive cent ...,barack obama bill clinton congressional budget...,federal gas tax gas tax,infrastructure jobs taxes transportation,politifact
1,private healthcare plans must conform governme...,True,chain email,health care exchange government way regulating...,kaiser family foundation argues batpeople chai...,inconnue,health care regulation,politifact
2,following two thousand and nine session texas ...,True,heidi group,activists say planned parenthood nation premie...,american life league guttmacher institute joe ...,joe straus planned parenthood texas house spea...,abortion,politifact
3,two thousand and seven film golden compass bas...,True,inconnue,golden compass,amber spyglass atheist big story bill donohue ...,golden compass,asp article,snopes
4,palin supports aerial hunting wolves wildlife,True,defenders wildlife action fund,palin supports aerial shooting reason,alaska alaska department fish game defenders w...,inconnue,environment,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,714,714,714,714,714,714,714,714
unique,486,2,163,486,486,296,346,4
top,says state budget includes spending commercial...,false,inconnue,state rep david simpson says state budget incl...,action figure david simpson friday night light...,inconnue,inconnue,politifact
freq,11,357,351,11,11,265,45,363


# Downsampling

In [8]:
# Shrink the majority class down to the minority size.
dfCleanDownsample = sampling(dfClean, sample='down')
display(dfCleanDownsample.head(), dfCleanDownsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,photograph shows cow crusher device designed l...,False,inconnue,image cow crusher device designed crush cows,antifa fistulated cow meat livestock australia...,inconnue,cattle crush cow crush cows facebook farming,snopes
1,expectant mother produces growing cascade guff...,False,inconnue,pregnant bus rider,doublemint twins dunlop rubber liniment internet,inconnue,asp article gallery gruesome,snopes
2,scientist missing twenty years found living se...,False,inconnue,scientist missing twenty years found living in...,cottage grove minnesota facebook gary sandford...,inconnue,necessarily news,snopes
3,sa whites really killed like flies steve hofme...,False,inconnue,sa whites really killed like flies steve hofme...,abc africa check afrikaans akismet angie motsh...,steve hofmeyr,crime government murder police race,africacheck
4,body homeless man found behind jamaica fast fo...,False,inconnue,body homeless man found fast food restaurant b...,bob marley bob marley elvis presley hawaii jam...,bob marley jamaica,bob marley empire news necessarily news richmo...,snopes


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,276,276,276,276,276,276,276,276
unique,276,2,119,276,276,165,210,4
top,latinos seventeen percent country population h...,true,inconnue,rep michele bachmann claims obama trip india c...,adweek breitbartcom cnn daily caller fox news ...,inconnue,inconnue,politifact
freq,1,138,131,1,1,96,22,145


At this point we have df, dfClean, dfCleanUpsample and dfCleanDownsample

<div class="alert alert-block alert-info" align="center">
    <h1>
        Encodage
    </h1>
</div>

In [9]:
# Shared encoder / vectoriser / scaler instances; the cells below re-fit them
# each time they are used.
classLabelEncoder = LabelEncoder()

tfidfVectorizer = TfidfVectorizer()
countVectorizer = CountVectorizer()

# Two TF-IDF transformers: default settings, and IDF weighting switched off.
tfidfTransformer1 = TfidfTransformer()
tfidfTransformer2 = TfidfTransformer(use_idf=False)

standardScaler = StandardScaler()
minMaxScaler = MinMaxScaler()

# Usage example:
#   df = pd.DataFrame(standardScaler.fit_transform(df), columns=['Name'])

# Splitting the dataframe

In [10]:
def _split_columns(frame):
    """Return the model inputs of ``frame`` as separate Series.

    The headline and the claim text are joined with a single space into one
    text Series; the other columns are returned unchanged, in the order
    (HeadlineText, RatingName, Author, NamedEntitiesClaim,
    NamedEntitiesArticle, Keywords, Source).
    """
    return (
        frame["Headline"] + " " + frame["Text"],
        frame['RatingName'],
        frame['Author'],
        frame['NamedEntitiesClaim'],
        frame['NamedEntitiesArticle'],
        frame['Keywords'],
        frame['Source'],
    )

# df
(dfHeadlineText, dfRatingName, dfAuthor, dfNamedEntitiesClaim,
 dfNamedEntitiesArticle, dfKeywords, dfSource) = _split_columns(df)

# dfClean
(dfCleanHeadlineText, dfCleanRatingName, dfCleanAuthor, dfCleanNamedEntitiesClaim,
 dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource) = _split_columns(dfClean)

# dfCleanUpsample
(dfCleanUpsampleHeadlineText, dfCleanUpsampleRatingName, dfCleanUpsampleAuthor,
 dfCleanUpsampleNamedEntitiesClaim, dfCleanUpsampleNamedEntitiesArticle,
 dfCleanUpsampleKeywords, dfCleanUpsampleSource) = _split_columns(dfCleanUpsample)

# dfCleanDownsample
(dfCleanDownsampleHeadlineText, dfCleanDownsampleRatingName, dfCleanDownsampleAuthor,
 dfCleanDownsampleNamedEntitiesClaim, dfCleanDownsampleNamedEntitiesArticle,
 dfCleanDownsampleKeywords, dfCleanDownsampleSource) = _split_columns(dfCleanDownsample)

<div align="center">
    <h2>
        TF 1
    </h2>
</div>

## Make a copy of every column

In [11]:
# Independent copies of the raw columns for the "TF 1" encoding.
(dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1, dfNamedEntitiesArticle1,
 dfKeywords1, dfSource1, dfRatingName1) = (
    column.copy()
    for column in (
        dfHeadlineText, dfAuthor, dfNamedEntitiesClaim, dfNamedEntitiesArticle,
        dfKeywords, dfSource, dfRatingName,
    )
)

## Transform data

In [12]:
# Label-encode each feature column with the shared LabelEncoder, then
# standardise the resulting integer codes. Columns are processed one after
# another in the order listed.
(dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1,
 dfNamedEntitiesArticle1, dfKeywords1, dfSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(values), columns = [name])
        ),
        columns = [name],
    )
    for name, values in (
        ('HeadlineText', dfHeadlineText1),
        ('Author', dfAuthor1),
        ('NamedEntitiesClaim', dfNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfNamedEntitiesArticle1),
        ('Keywords', dfKeywords1),
        ('Source', dfSource1),
    )
]

# The target is label-encoded only, not scaled.
dfRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName1), columns = ['RatingName'])

df1 = pd.concat(
    [dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1, dfNamedEntitiesArticle1,
     dfKeywords1, dfSource1, dfRatingName1],
    axis = 1,
)

display(df1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,-0.664829,1.385080,0.440887,-0.149119,0.733593,-0.864115,0
1,-1.091719,-0.180520,-1.385643,-1.393092,0.390171,0.853704,1
2,0.314919,1.060145,-1.378645,-0.149119,-0.023956,-0.864115,0
3,-1.553600,1.178303,0.000000,-0.149119,0.167957,-0.864115,0
4,0.804793,1.355541,0.447885,-0.149119,0.349768,-0.864115,0
...,...,...,...,...,...,...,...
490,0.916764,1.887254,0.601845,-0.149119,-0.539089,-0.864115,0
491,-0.090977,-0.180520,1.315661,1.877579,0.935607,0.853704,0
492,1.385643,0.823827,-1.119712,-0.149119,0.218460,-0.864115,1
493,-1.301665,-1.539343,0.433888,-0.149119,0.188158,-0.864115,1


## Saving the transformed data

In [13]:
# Write the encoded frame as a semicolon-separated file, without the index.
df1.to_csv('attemps/tf1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned 1
    </h2>
</div>

## Make a copy of every column

In [14]:
# Independent copies of the cleaned columns for the "TF cleaned 1" encoding.
(dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1,
 dfCleanNamedEntitiesArticle1, dfCleanKeywords1, dfCleanSource1,
 dfCleanRatingName1) = (
    column.copy()
    for column in (
        dfCleanHeadlineText, dfCleanAuthor, dfCleanNamedEntitiesClaim,
        dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource,
        dfCleanRatingName,
    )
)

## Transform data

In [15]:
# Label-encode each cleaned feature column with the shared LabelEncoder, then
# standardise the resulting integer codes. Columns are processed one after
# another in the order listed.
(dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1,
 dfCleanNamedEntitiesArticle1, dfCleanKeywords1, dfCleanSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(values), columns = [name])
        ),
        columns = [name],
    )
    for name, values in (
        ('HeadlineText', dfCleanHeadlineText1),
        ('Author', dfCleanAuthor1),
        ('NamedEntitiesClaim', dfCleanNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfCleanNamedEntitiesArticle1),
        ('Keywords', dfCleanKeywords1),
        ('Source', dfCleanSource1),
    )
]

# The target is label-encoded only, not scaled.
dfCleanRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName1), columns = ['RatingName'])

dfClean1 = pd.concat(
    [dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1,
     dfCleanNamedEntitiesArticle1, dfCleanKeywords1, dfCleanSource1,
     dfCleanRatingName1],
    axis = 1,
)

display(dfClean1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,-1.000743,1.386030,0.167957,0.052302,1.543219,-0.864115,0
1,-1.350652,-0.181725,-1.693564,-1.494608,1.050054,0.853704,1
2,0.055986,1.060647,-1.686566,0.052302,0.499986,-0.864115,0
3,-1.630580,1.178968,-0.286926,0.052302,0.803472,-0.864115,0
4,0.734811,1.356450,0.174955,0.052302,1.012118,-0.864115,0
...,...,...,...,...,...,...,...
490,0.888771,1.888895,0.328915,0.052302,-0.201826,-0.864115,0
491,-0.496872,-0.181725,1.056728,-0.116451,-1.444222,0.853704,0
492,1.490616,0.824005,-1.434631,0.052302,0.860375,-0.864115,1
493,-1.343654,-1.512837,0.160959,0.052302,0.831924,-0.864115,1


## Saving the transformed data

In [16]:
# Write the encoded frame as a semicolon-separated file, without the index.
dfClean1.to_csv('attemps/tfclean1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned upsampled 1
    </h2>
</div>

## Make a copy of every column

In [17]:
# Independent copies of the up-sampled columns for the
# "TF cleaned upsampled 1" encoding.
(dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1,
 dfCleanUpsampleNamedEntitiesClaim1, dfCleanUpsampleNamedEntitiesArticle1,
 dfCleanUpsampleKeywords1, dfCleanUpsampleSource1,
 dfCleanUpsampleRatingName1) = (
    column.copy()
    for column in (
        dfCleanUpsampleHeadlineText, dfCleanUpsampleAuthor,
        dfCleanUpsampleNamedEntitiesClaim, dfCleanUpsampleNamedEntitiesArticle,
        dfCleanUpsampleKeywords, dfCleanUpsampleSource,
        dfCleanUpsampleRatingName,
    )
)

## Transform data

In [18]:
# Label-encode each up-sampled feature column with the shared LabelEncoder,
# then standardise the resulting integer codes. Columns are processed one
# after another in the order listed.
(dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1,
 dfCleanUpsampleNamedEntitiesClaim1, dfCleanUpsampleNamedEntitiesArticle1,
 dfCleanUpsampleKeywords1, dfCleanUpsampleSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(values), columns = [name])
        ),
        columns = [name],
    )
    for name, values in (
        ('HeadlineText', dfCleanUpsampleHeadlineText1),
        ('Author', dfCleanUpsampleAuthor1),
        ('NamedEntitiesClaim', dfCleanUpsampleNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfCleanUpsampleNamedEntitiesArticle1),
        ('Keywords', dfCleanUpsampleKeywords1),
        ('Source', dfCleanUpsampleSource1),
    )
]

# The target is label-encoded only, not scaled.
dfCleanUpsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName1), columns = ['RatingName'])

dfCleanUpsample1 = pd.concat(
    [dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1,
     dfCleanUpsampleNamedEntitiesClaim1, dfCleanUpsampleNamedEntitiesArticle1,
     dfCleanUpsampleKeywords1, dfCleanUpsampleSource1,
     dfCleanUpsampleRatingName1],
    axis = 1,
)

display(dfCleanUpsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,0.985476,-0.461180,-0.319941,-0.481392,1.034654,-0.815284,1
1,-0.534228,-1.312038,1.358444,0.065382,0.888500,-0.815284,1
2,-1.703769,-0.278853,-0.917333,0.331380,-1.713052,-0.815284,1
3,-0.667289,-0.126914,-1.009786,-0.185838,-1.459717,0.917195,1
4,0.362187,-0.916997,-1.130687,0.065382,0.489011,-0.815284,1
...,...,...,...,...,...,...,...
709,-1.549698,-1.038548,1.692698,0.065382,-0.767919,-0.815284,1
710,-0.429179,-0.126914,0.419686,0.065382,-0.738688,0.917195,1
711,-0.765335,-0.126914,1.180649,-0.363170,-1.410999,0.917195,1
712,1.195573,-0.126914,0.903288,0.065382,-1.011510,0.917195,0


## Saving the transformed data

In [19]:
# Write the encoded frame as a semicolon-separated file, without the index.
dfCleanUpsample1.to_csv('attemps/tfcleanupsample1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned downsampled 1
    </h2>
</div>

## Make a copy of every column

In [20]:
# Independent copies of the down-sampled columns for the
# "TF cleaned downsampled 1" encoding.
(dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1,
 dfCleanDownsampleNamedEntitiesClaim1, dfCleanDownsampleNamedEntitiesArticle1,
 dfCleanDownsampleKeywords1, dfCleanDownsampleSource1,
 dfCleanDownsampleRatingName1) = (
    column.copy()
    for column in (
        dfCleanDownsampleHeadlineText, dfCleanDownsampleAuthor,
        dfCleanDownsampleNamedEntitiesClaim, dfCleanDownsampleNamedEntitiesArticle,
        dfCleanDownsampleKeywords, dfCleanDownsampleSource,
        dfCleanDownsampleRatingName,
    )
)

## Transform data

In [21]:
# Label-encode each down-sampled feature column with the shared LabelEncoder,
# then standardise the resulting integer codes. Columns are processed one
# after another in the order listed.
(dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1,
 dfCleanDownsampleNamedEntitiesClaim1, dfCleanDownsampleNamedEntitiesArticle1,
 dfCleanDownsampleKeywords1, dfCleanDownsampleSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(values), columns = [name])
        ),
        columns = [name],
    )
    for name, values in (
        ('HeadlineText', dfCleanDownsampleHeadlineText1),
        ('Author', dfCleanDownsampleAuthor1),
        ('NamedEntitiesClaim', dfCleanDownsampleNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfCleanDownsampleNamedEntitiesArticle1),
        ('Keywords', dfCleanDownsampleKeywords1),
        ('Source', dfCleanDownsampleSource1),
    )
]

# The target is label-encoded only, not scaled.
dfCleanDownsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName1), columns = ['RatingName'])

dfCleanDownsample1 = pd.concat(
    [dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1,
     dfCleanDownsampleNamedEntitiesClaim1, dfCleanDownsampleNamedEntitiesArticle1,
     dfCleanDownsampleKeywords1, dfCleanDownsampleSource1,
     dfCleanDownsampleRatingName1],
    axis = 1,
)

display(dfCleanDownsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,-0.382811,-0.160535,-0.759346,0.133260,-0.675574,0.881727,0
1,0.395362,-0.160535,0.897409,0.133260,-1.319924,0.881727,0
2,0.847204,-0.160535,0.709141,0.133260,1.257473,0.881727,0
3,0.797000,-0.160535,-1.663031,1.697242,-0.240639,-2.429247,0
4,-1.537519,-0.160535,0.094134,-1.508922,-0.981640,0.881727,0
...,...,...,...,...,...,...,...
271,-1.512417,0.249341,-0.194543,0.133260,0.145971,-0.773760,1
272,0.257299,-0.160535,-0.960165,-1.039727,-0.949423,0.881727,1
273,0.985267,-1.308189,-0.018827,0.133260,0.290949,-0.773760,1
274,1.474763,0.782181,-1.424558,0.133260,0.806429,-0.773760,1


## Saving the transformed data

In [22]:
# Write the encoded frame as a semicolon-separated file, without the index.
dfCleanDownsample1.to_csv('attemps/tfcleandownsample1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF2
    </h2>
</div>

## Make a copy of every column

In [23]:
# Independent copies of the raw columns for the "TF2" encoding.
(dfHeadlineText2, dfAuthor2, dfNamedEntitiesClaim2, dfNamedEntitiesArticle2,
 dfKeywords2, dfSource2, dfRatingName2) = (
    column.copy()
    for column in (
        dfHeadlineText, dfAuthor, dfNamedEntitiesClaim, dfNamedEntitiesArticle,
        dfKeywords, dfSource, dfRatingName,
    )
)

## Transform data

In [24]:
# scikit-learn 1.2 removed get_feature_names(); use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
getFeatureNames = getattr(tfidfVectorizer, 'get_feature_names_out', None) or tfidfVectorizer.get_feature_names

# TF-IDF vectorise the combined headline + claim text (one column per term),
# then standardise every term column. The column labels are reused from the
# first frame instead of asking the vectoriser a second time.
dfHeadlineText2 = pd.DataFrame(tfidfVectorizer.fit_transform(dfHeadlineText2).toarray(), columns = getFeatureNames())
dfHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText2), columns = dfHeadlineText2.columns)

# One-hot encode the source. dtype is pinned to uint8 so the indicator
# columns are 0/1 on every pandas version (pandas >= 2.0 defaults to bool).
# The index is reset so the rows line up positionally with the other frames.
dfSource2 = pd.get_dummies(dfSource2, columns = ['Source'], prefix = 'Source', dtype = np.uint8).reset_index(drop = True)

# The target is label-encoded only.
dfRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName2), columns = ['RatingName'])

df2 = pd.concat([dfHeadlineText2, dfSource2, dfRatingName2], axis = 1)

display(df2)

Unnamed: 0,00,000,10,100,104,106,11,115,12,13,...,youth,yvette,zambian,zero,zone,Source_africacheck,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,1,0,0,0
1,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,0,1,0,1
2,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,1,0,0,0
3,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,1,0,0,0
4,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,1,0,0,0
491,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,0,1,0,0
492,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,1,0,0,1
493,-0.044992,-0.174046,-0.088078,-0.044992,-0.044992,-0.044992,-0.085543,-0.044992,-0.044992,-0.102472,...,-0.044992,-0.044992,-0.044992,-0.063670,-0.044992,0,1,0,0,1


## Saving the transformed data

In [25]:
# Persist the assembled TF2 frame as a ';'-separated CSV, leaving out the row index.
df2.to_csv(
    path_or_buf = 'attemps/tf2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned 2
    </h2>
</div>

## Make a copy of every column

In [26]:
# Take one copy per cleaned column; the cells below reassign only the `...2` names.
(
    dfCleanHeadlineText2,
    dfCleanAuthor2,
    dfCleanNamedEntitiesClaim2,
    dfCleanNamedEntitiesArticle2,
    dfCleanKeywords2,
    dfCleanSource2,
    dfCleanRatingName2,
) = (
    cleanColumn.copy()
    for cleanColumn in (
        dfCleanHeadlineText,
        dfCleanAuthor,
        dfCleanNamedEntitiesClaim,
        dfCleanNamedEntitiesArticle,
        dfCleanKeywords,
        dfCleanSource,
        dfCleanRatingName,
    )
)

## Transform data

In [27]:
# Vectorise the cleaned headlines with the shared tfidfVectorizer: one column per term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old accessor only on older releases.
# The positional argument is evaluated first, so the vectorizer is fitted before its
# feature names are read.
dfCleanHeadlineText2 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfCleanHeadlineText2).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
# Rescale every term column with the shared standardScaler; the column labels are taken
# from the frame being scaled, so no second call to the vectorizer is needed.
dfCleanHeadlineText2 = pd.DataFrame(
    standardScaler.fit_transform(dfCleanHeadlineText2),
    columns = dfCleanHeadlineText2.columns,
)

# One indicator column per distinct 'Source' value; the index is reset so it lines up
# with the freshly built frames in the concat below.
dfCleanSource2 = pd.get_dummies(dfCleanSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the target labels through the shared classLabelEncoder.
dfCleanRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName2), columns = ['RatingName'])

# Column-wise assembly: text features, source indicators, then the target column last.
dfClean2 = pd.concat([dfCleanHeadlineText2, dfCleanSource2, dfCleanRatingName2], axis = 1)

display(dfClean2)

Unnamed: 0,abbott,abduction,abdul,abe,abele,able,abortion,abrams,abruin,absurd,...,youth,yvette,zambian,zero,zone,Source_africacheck,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,1,0,0,0
1,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,0,1,0,1
2,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,1,0,0,0
3,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,1,0,0,0
4,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,1,0,0,0
491,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,0,1,0,0
492,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,1,0,0,1
493,-0.063621,-0.044992,-0.044992,-0.044992,-0.044992,-0.044992,-0.062887,-0.044992,-0.044992,-0.044992,...,-0.044992,-0.044992,-0.044992,-0.190654,-0.044992,0,1,0,0,1


## Saving the transformed data

In [28]:
# Persist the assembled "cleaned" frame as a ';'-separated CSV, leaving out the row index.
dfClean2.to_csv(
    path_or_buf = 'attemps/tfclean2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned upsampled 2
    </h2>
</div>

## Make a copy of every column

In [29]:
# Take one copy per cleaned/upsampled column; the cells below reassign only the `...2` names.
(
    dfCleanUpsampleHeadlineText2,
    dfCleanUpsampleAuthor2,
    dfCleanUpsampleNamedEntitiesClaim2,
    dfCleanUpsampleNamedEntitiesArticle2,
    dfCleanUpsampleKeywords2,
    dfCleanUpsampleSource2,
    dfCleanUpsampleRatingName2,
) = (
    upsampledColumn.copy()
    for upsampledColumn in (
        dfCleanUpsampleHeadlineText,
        dfCleanUpsampleAuthor,
        dfCleanUpsampleNamedEntitiesClaim,
        dfCleanUpsampleNamedEntitiesArticle,
        dfCleanUpsampleKeywords,
        dfCleanUpsampleSource,
        dfCleanUpsampleRatingName,
    )
)

## Transform data

In [30]:
# Vectorise the cleaned/upsampled headlines with the shared tfidfVectorizer: one column per term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old accessor only on older releases.
# The positional argument is evaluated first, so the vectorizer is fitted before its
# feature names are read.
dfCleanUpsampleHeadlineText2 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfCleanUpsampleHeadlineText2).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
# Rescale every term column with the shared standardScaler; the column labels are taken
# from the frame being scaled, so no second call to the vectorizer is needed.
dfCleanUpsampleHeadlineText2 = pd.DataFrame(
    standardScaler.fit_transform(dfCleanUpsampleHeadlineText2),
    columns = dfCleanUpsampleHeadlineText2.columns,
)

# One indicator column per distinct 'Source' value; the index is reset so it lines up
# with the freshly built frames in the concat below.
dfCleanUpsampleSource2 = pd.get_dummies(dfCleanUpsampleSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the target labels through the shared classLabelEncoder.
dfCleanUpsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName2), columns = ['RatingName'])

# Column-wise assembly: text features, source indicators, then the target column last.
dfCleanUpsample2 = pd.concat([dfCleanUpsampleHeadlineText2, dfCleanUpsampleSource2, dfCleanUpsampleRatingName2], axis = 1)

display(dfCleanUpsample2)

Unnamed: 0,abbott,abduction,abdul,abe,abele,able,abortion,abrams,abruin,absurd,...,yous,youth,zambian,zero,zone,Source_africacheck,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,1,0,0,1
1,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,1,0,0,1
2,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,11.692356,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,1,0,0,1
3,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,0,1,0,1
4,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,1,0,0,1
710,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,0,1,0,1
711,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,0,1,0,1
712,-0.064928,-0.03745,-0.03745,-0.03745,-0.03745,-0.03745,-0.098368,-0.03745,-0.053,-0.03745,...,-0.252009,-0.064957,-0.03745,-0.187969,-0.03745,0,0,1,0,0


## Saving the transformed data

In [31]:
# Persist the assembled "cleaned upsampled" frame as a ';'-separated CSV, leaving out the row index.
dfCleanUpsample2.to_csv(
    path_or_buf = 'attemps/tfcleanupsample2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned downsampled 2
    </h2>
</div>

## Make a copy of every column

In [32]:
# Take one copy per cleaned/downsampled column; the cells below reassign only the `...2` names.
(
    dfCleanDownsampleHeadlineText2,
    dfCleanDownsampleAuthor2,
    dfCleanDownsampleNamedEntitiesClaim2,
    dfCleanDownsampleNamedEntitiesArticle2,
    dfCleanDownsampleKeywords2,
    dfCleanDownsampleSource2,
    dfCleanDownsampleRatingName2,
) = (
    downsampledColumn.copy()
    for downsampledColumn in (
        dfCleanDownsampleHeadlineText,
        dfCleanDownsampleAuthor,
        dfCleanDownsampleNamedEntitiesClaim,
        dfCleanDownsampleNamedEntitiesArticle,
        dfCleanDownsampleKeywords,
        dfCleanDownsampleSource,
        dfCleanDownsampleRatingName,
    )
)

## Transform data

In [33]:
# Vectorise the cleaned/downsampled headlines with the shared tfidfVectorizer: one column per term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old accessor only on older releases.
# The positional argument is evaluated first, so the vectorizer is fitted before its
# feature names are read.
dfCleanDownsampleHeadlineText2 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfCleanDownsampleHeadlineText2).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
# Rescale every term column with the shared standardScaler; the column labels are taken
# from the frame being scaled, so no second call to the vectorizer is needed.
dfCleanDownsampleHeadlineText2 = pd.DataFrame(
    standardScaler.fit_transform(dfCleanDownsampleHeadlineText2),
    columns = dfCleanDownsampleHeadlineText2.columns,
)

# One indicator column per distinct 'Source' value; the index is reset so it lines up
# with the freshly built frames in the concat below.
dfCleanDownsampleSource2 = pd.get_dummies(dfCleanDownsampleSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the target labels through the shared classLabelEncoder.
dfCleanDownsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName2), columns = ['RatingName'])

# Not processed yet.
# NOTE(review): none of the frames built in this section is included in the concat at the
# end of the cell. They do refit the shared classLabelEncoder, minMaxScaler and
# tfidfVectorizer, so after this cell those objects hold the fit of the last column
# handled here (Keywords / NamedEntitiesClaim), not the RatingName / headline fits above.
dfCleanDownsampleAuthor2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleAuthor2), columns = ['Author'])
dfCleanDownsampleAuthor2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleAuthor2), columns = ['Author'])

dfCleanDownsampleNamedEntitiesClaim2 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfCleanDownsampleNamedEntitiesClaim2).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
dfCleanDownsampleNamedEntitiesClaim2 = pd.DataFrame(
    standardScaler.fit_transform(dfCleanDownsampleNamedEntitiesClaim2),
    columns = dfCleanDownsampleNamedEntitiesClaim2.columns,
)

dfCleanDownsampleNamedEntitiesArticle2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])
dfCleanDownsampleNamedEntitiesArticle2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])

dfCleanDownsampleKeywords2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleKeywords2), columns = ['Keywords'])
dfCleanDownsampleKeywords2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleKeywords2), columns = ['Keywords'])

# Column-wise assembly: text features, source indicators, then the target column last.
dfCleanDownsample2 = pd.concat([dfCleanDownsampleHeadlineText2, dfCleanDownsampleSource2, dfCleanDownsampleRatingName2], axis = 1)

display(dfCleanDownsample2)

Unnamed: 0,abbott,abe,able,abortion,abruin,abuse,academy,accept,accepted,accepts,...,youn,yous,youth,yvette,zero,Source_africacheck,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,0,1,0,0
1,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,0,1,0,0
2,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,0,1,0,0
3,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,1,0,0,0,0
4,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,1,0,0,1
272,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,0,1,0,1
273,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,1,0,0,1
274,-0.085331,-0.060302,-0.060302,-0.084286,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,-0.060302,...,-0.060302,-0.268617,-0.060302,-0.060302,-0.20186,0,1,0,0,1


## Saving the transformed data

In [34]:
# Persist the assembled "cleaned downsampled" frame as a ';'-separated CSV, leaving out the row index.
dfCleanDownsample2.to_csv(
    path_or_buf = 'attemps/tfcleandownsample2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF3
    </h2>
</div>

## Make a copy of every column

In [35]:
# Take one copy per original column; the cells below reassign only the `...3` names.
(
    dfHeadlineText3,
    dfAuthor3,
    dfNamedEntitiesClaim3,
    dfNamedEntitiesArticle3,
    dfKeywords3,
    dfSource3,
    dfRatingName3,
) = (
    originalColumn.copy()
    for originalColumn in (
        dfHeadlineText,
        dfAuthor,
        dfNamedEntitiesClaim,
        dfNamedEntitiesArticle,
        dfKeywords,
        dfSource,
        dfRatingName,
    )
)

## Transform data

In [36]:
# Vectorise the headlines with the shared tfidfVectorizer: one column per term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old accessor only on older releases.
# The positional argument is evaluated first, so the vectorizer is fitted before its
# feature names are read.
dfHeadlineText3 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfHeadlineText3).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
# Second pass through tfidfTransformer1, which is configured outside this cell.
# NOTE(review): its input here is the vectorizer output above, which presumably already
# holds TF-IDF weights rather than raw counts — confirm the double weighting is intended.
dfHeadlineText3 = pd.DataFrame(
    tfidfTransformer1.fit_transform(dfHeadlineText3).toarray(),
    columns = dfHeadlineText3.columns,
)
# Rescale every term column with the shared standardScaler; the column labels are taken
# from the frame being scaled, so no further call to the vectorizer is needed.
dfHeadlineText3 = pd.DataFrame(
    standardScaler.fit_transform(dfHeadlineText3),
    columns = dfHeadlineText3.columns,
)

# One indicator column per distinct 'Source' value; the index is reset so it lines up
# with the freshly built frames in the concat below.
dfSource3 = pd.get_dummies(dfSource3, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the target labels through the shared classLabelEncoder.
dfRatingName3 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName3), columns = ['RatingName'])

# Column-wise assembly: text features, source indicators, then the target column last.
df3 = pd.concat([dfHeadlineText3, dfSource3, dfRatingName3], axis = 1)

display(df3)

Unnamed: 0,00,000,10,100,104,106,11,115,12,13,...,youth,yvette,zambian,zero,zone,Source_africacheck,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,1,0,0,0
1,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,0,1,0,1
2,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,1,0,0,0
3,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,1,0,0,0
4,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,1,0,0,0
491,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,0,1,0,0
492,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,1,0,0,1
493,-0.044992,-0.174764,-0.08789,-0.044992,-0.044992,-0.044992,-0.08509,-0.044992,-0.044992,-0.101567,...,-0.044992,-0.044992,-0.044992,-0.063693,-0.044992,0,1,0,0,1


## Saving the transformed data

In [37]:
# Persist the assembled TF3 frame as a ';'-separated CSV, leaving out the row index.
df3.to_csv(
    path_or_buf = 'attemps/tf3.csv',
    sep = ';',
    index = False,
)