<div align="center"><h1> Projet Data Science </h1></div>
<div align="center"><h2> Classification d'assertions selon leurs valeurs de véracité (automatic fact-checking) </h2></div>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Executing the basics
    </h1>
</div>

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import inflect
import re
#import contractions

#from functools import reduce
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Cleaning the text
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalised, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list holding one ASCII-only string per input token, in the
        same order (a token may become an empty string).
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list with every token converted to lower case.

    Args:
        words: iterable of string tokens.

    Returns:
        List of lower-cased tokens, in the same order as the input.
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Remove punctuation from every token and drop tokens left empty.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is deleted from each token.

    Args:
        words: iterable of string tokens.

    Returns:
        List of the stripped tokens that are not the empty string.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Spell out the tokens that consist only of digits.

    Tokens for which ``str.isdigit()`` is true are passed to
    ``inflect``'s ``number_to_words``; every other token is kept as is.

    Args:
        words: iterable of string tokens.

    Returns:
        List of tokens, in the same order, with digit tokens replaced by
        their textual form.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Drop the tokens that are English stop words.

    Args:
        words: iterable of string tokens (compared as-is, so callers
            should lower-case them first).

    Returns:
        List of the tokens that are not in NLTK's English stop-word list,
        in their original order.
    """
    # Load the stop-word list once per call instead of once per token, and
    # use a set so that each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    return [token for token in words if token not in english_stopwords]

def separate_letter_number(words):
    """Split every token into its digit runs and non-digit runs.

    For example ``'abc123'`` becomes ``'abc'`` and ``'123'``.

    Args:
        words: iterable of string tokens.

    Returns:
        A flat list with all the pieces, in order. An empty input gives an
        empty list.
    """
    pieces = []
    for token in words:
        # Raw string: '\d' in a normal string literal is an invalid escape.
        # extend() flattens directly, so no functools.reduce is needed
        # (it is not imported in this notebook) and empty input is handled.
        pieces.extend(re.findall(r'\d+|\D+', token))
    return pieces

def replace_contractions(text):
    """Placeholder for contraction expansion; currently a pass-through.

    The ``contractions`` import is commented out in this notebook, so no
    expansion is performed. The text is returned unchanged (instead of
    ``None``) so that the function can be used in a text pipeline without
    discarding its input.

    Args:
        text: the string to process.

    Returns:
        ``text`` unchanged.
    """
    return text

def normalize(words):
    """Run the token list through the cleaning steps, in order.

    Steps: non-ASCII removal, lower-casing, number spelling, punctuation
    removal, stop-word removal. ``separate_letter_number`` is not part of
    the pipeline.

    Args:
        words: list of string tokens.

    Returns:
        The list of tokens produced by the last step.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        replace_numbers,
        remove_punctuation,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

def clean_text(text):
    """Tokenise ``text``, normalise the tokens and join them back together.

    Args:
        text: the raw string to clean.

    Returns:
        The normalised tokens separated by single spaces, with leading and
        trailing whitespace stripped.
    """
    cleaned_tokens = normalize(word_tokenize(text))
    return " ".join(cleaned_tokens).strip()

# Up-sampling & Down-sampling
def sampling(df_func, *args, **kwargs):
    """Balance the two extreme classes of ``RatingName`` by resampling.

    The majority class is the first entry of
    ``df_func['RatingName'].value_counts()`` and the minority class is the
    last one. Only the rows of these two classes appear in the result, so
    the function is meant for a binary ``RatingName`` column.

    Args:
        df_func: DataFrame with a ``RatingName`` column.
        *args: accepted for compatibility; ignored.
        **kwargs: ``sample`` must be ``'up'`` or ``'down'``.
            ``'up'`` resamples the minority class with replacement up to
            the majority count, then shuffles the combined rows.
            ``'down'`` samples the majority class without replacement down
            to the minority count (rows are not shuffled).

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: if ``sample`` is missing or is neither ``'up'`` nor
            ``'down'`` (previously a message was printed and ``None`` was
            returned, which only failed later at the call site).
    """
    sampling_type = kwargs.get("sample", None)
    if sampling_type not in ('up', 'down'):
        raise ValueError(
            "sample must be 'up' or 'down', got {!r}".format(sampling_type)
        )

    # Count the classes once; the first/last entries give the two classes.
    class_counts = df_func['RatingName'].value_counts()
    majority_label = class_counts.index[0]
    minority_label = class_counts.index[-1]
    majority_size = int(class_counts.iloc[0])
    minority_size = int(class_counts.iloc[-1])

    majority = df_func[df_func['RatingName'] == majority_label].reset_index(drop=True)
    minority = df_func[df_func['RatingName'] == minority_label].reset_index(drop=True)

    if sampling_type == 'up':
        upsampled = resample(minority, replace=True, n_samples=majority_size, random_state=123)
        # Seed the shuffle as well, otherwise the row order changes on every run
        # even though resample() itself is seeded.
        return (
            pd.concat([majority, upsampled])
            .sample(frac=1, random_state=123)
            .reset_index(drop=True)
        )

    downsampled = resample(majority, replace=False, n_samples=minority_size, random_state=123)
    return pd.concat([downsampled, minority]).reset_index(drop=True)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Reading data
    </h1>
</div>

In [2]:
# Load the claims dataset from a semicolon-separated CSV file.
df = pd.read_csv(filepath_or_buffer='datasets/generated.csv', sep=';')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Pre-processing
    </h1>
</div>

## Remove unnecessary columns

In [3]:
# Discard the columns that are not used as features or as the target.
df = df.drop(columns=['ID', 'Date', 'TruthRating', 'SourceURL', 'Link', 'Language'])
display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,Malia Obama cashed a $1.2 million tax refund c...,OTHER,Unknown,Did Malia Obama Cash a $1.2 Million Check?,"Facebook,Fan Fiction,Junk News,Malia Obama,Sno...",Malia Obama,,truthorfiction
1,High diver is saved from jumping into a draine...,OTHER,Unknown,High Diver Saved By Cross,"Cincinnati Post,Islam,Scripture lesson,Univers...",shadow on the wall,ASP Article,snopes
2,'And the revenue generated by drilling off Vir...,MIXTURE,Jim Moran,Moran says drilling off Virginia's coast will ...,"Alaska,American Petroleum Institute,Atlantic O...",,"Energy,State Finances",politifact
3,Health insurance companies pay CEOs $24 millio...,MIXTURE,Health Care for America Now,Health care advocacy group blasts insurers for...,"Aetna,Assurant,Bloomberg News,Cigna,Coventry H...",,"Corporations,Health Care",politifact
4,Ted Cruz said that veterans should start selli...,FALSE,Unknown,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes


## Remove unnecessary rows

In [4]:
# Deleting claims with OTHER é MIXTURE RatingName
# Drop the claims rated OTHER (claims with any other RatingName are kept).
df = df.loc[df['RatingName'] != 'OTHER']

display(df.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,215,215,215,215,214,135,206,215
unique,215,3,110,215,214,129,180,5
top,'22 veterans take their own lives each day.',MIXTURE,Unknown,Sherrod Brown says elderly account for two-thi...,"Afghanistan,Agence France Press,Agence France ...",Donald Trump,ASP Article,politifact
freq,1,96,84,1,1,3,7,130


## Replacing "Unknown" & NaN by "Inconnue"

In [5]:
# Replace the 'Unknown' placeholder and the missing values (NaN) with
# 'Inconnue' in every column.
# A new frame is built instead of calling replace(inplace=True) on a column
# selected from df: that chained in-place call triggers SettingWithCopyWarning
# and leaves df unchanged when pandas Copy-on-Write is enabled.
# fillna() also avoids np.NaN, an alias that was removed in NumPy 2.0.
df = df.replace(to_replace='Unknown', value='Inconnue').fillna('Inconnue')

display(df.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,215,215,215,215,215,215,215,215
unique,215,3,110,215,215,130,181,5
top,'22 veterans take their own lives each day.',MIXTURE,Inconnue,Sherrod Brown says elderly account for two-thi...,"Afghanistan,Agence France Press,Agence France ...",Inconnue,Inconnue,politifact
freq,1,96,84,1,1,80,9,130


# Rename TRUE and FALSE to nonmixture

In [6]:
# Merge the TRUE and FALSE ratings into a single 'nonmixture' class.
# assign() builds a new frame, which avoids the chained
# df['RatingName'].replace(..., inplace=True) call (SettingWithCopyWarning,
# and a no-op when pandas Copy-on-Write is enabled).
df = df.assign(RatingName=df['RatingName'].replace(['TRUE', 'FALSE'], 'nonmixture'))

display(df.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,215,215,215,215,215,215,215,215
unique,215,2,110,215,215,130,181,5
top,'22 veterans take their own lives each day.',nonmixture,Inconnue,Sherrod Brown says elderly account for two-thi...,"Afghanistan,Agence France Press,Agence France ...",Inconnue,Inconnue,politifact
freq,1,119,84,1,1,80,9,130


## Cleaning

In [7]:
# Apply clean_text to every cell of every column (RatingName included),
# working on a copy so that df keeps the raw text.
dfClean = df.copy()
for column in dfClean.columns:
    dfClean[column] = dfClean[column].apply(clean_text)

display(dfClean.head())
display(dfClean.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
2,revenue generated drilling virginia coast fort...,mixture,jim moran,moran says drilling virginia coast net forty m...,alaska american petroleum institute atlantic o...,inconnue,energy state finances,politifact
3,health insurance companies pay ceos twentyfour...,mixture,health care america,health care advocacy group blasts insurers ceo...,aetna assurant bloomberg news cigna coventry h...,inconnue,corporations health care,politifact
4,ted cruz said veterans start selling cookies o...,nonmixture,inconnue,ted cruz vets sell cookies funding like girl s...,david nelson james morrison john scalzi republ...,ted cruz,asp article necessarily news,snopes
5,passengers airliner diverted cuba thought hija...,mixture,inconnue,airline passengers mistake hijacking candid ca...,allen funt associated press cbs candid camera ...,allen funt candid camera cuba,broadcast legends radio tv television,snopes
6,oil money federal leases used clean mess damag...,mixture,bill nelson,sen bill nelson says offshore drilling wo nt p...,alabama bill nelson dfla florida panhandle gul...,inconnue,environment,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,215,215,215,215,215,215,215,215
unique,215,2,110,215,215,130,181,5
top,pepper tax finally voted pepper laid cops clos...,nonmixture,inconnue,petition save nea,facebook hiv hiv virus new york city new york ...,inconnue,inconnue,politifact
freq,1,119,84,1,1,80,9,130


# Upsampling

In [8]:
# Balance the classes by up-sampling the minority class of dfClean.
dfCleanUpsample = sampling(dfClean, sample='up')
display(dfCleanUpsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,238,238,238,238,238,238,238,238
unique,191,2,95,191,191,115,160,5
top,louisiana film incentives program big actually...,nonmixture,inconnue,sen bill nelson says offshore drilling wo nt p...,barack obama cspan cbo congressional budget of...,inconnue,inconnue,politifact
freq,4,119,87,4,4,90,9,150


# Downsampling

In [9]:
# Balance the classes by down-sampling the majority class of dfClean.
dfCleanDownsample = sampling(dfClean, sample='down')
display(dfCleanDownsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,192,192,192,192,192,192,192,192
unique,192,2,103,192,192,119,165,5
top,says special counsel robert mueller brought de...,nonmixture,inconnue,evidence support rumors hate muslims chant tru...,alabama bill nelson dfla florida panhandle gul...,inconnue,asp article,politifact
freq,1,96,70,1,1,68,7,121


At this point we have four dataframes: df, dfClean, dfCleanUpsample and dfCleanDownsample.

<div class="alert alert-block alert-info" align="center">
    <h1>
        Encodage
    </h1>
</div>

In [10]:
# Shared encoder / scaler instances; the cells below refit them on each
# column they transform.

# Categorical label -> integer code
classLabelEncoder = LabelEncoder()

# Text -> sparse term matrices
tfidfVectorizer = TfidfVectorizer()
countVectorizer = CountVectorizer()

# Count matrix -> tf-idf (with idf weighting) or tf only (use_idf=False)
tfidfTransformer1 = TfidfTransformer()
tfidfTransformer2 = TfidfTransformer(use_idf=False)

# Numeric scalers
standardScaler = StandardScaler()
minMaxScaler = MinMaxScaler()

# Usage example:
#   df = pd.DataFrame(standardScaler.fit_transform(df), columns = ['Name'])

# Splitting the dataframe

In [11]:
# df
# For each of the four frames: build the combined "Headline Text" Series,
# then pull out the target and the remaining feature columns as Series.

# df
dfHeadlineText = df["Headline"] + " " + df["Text"]
(dfRatingName, dfAuthor, dfNamedEntitiesClaim,
 dfNamedEntitiesArticle, dfKeywords, dfSource) = (
    df[name]
    for name in ('RatingName', 'Author', 'NamedEntitiesClaim',
                 'NamedEntitiesArticle', 'Keywords', 'Source')
)

# dfClean
dfCleanHeadlineText = dfClean["Headline"] + " " + dfClean["Text"]
(dfCleanRatingName, dfCleanAuthor, dfCleanNamedEntitiesClaim,
 dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource) = (
    dfClean[name]
    for name in ('RatingName', 'Author', 'NamedEntitiesClaim',
                 'NamedEntitiesArticle', 'Keywords', 'Source')
)

# dfCleanUpsample
dfCleanUpsampleHeadlineText = dfCleanUpsample["Headline"] + " " + dfCleanUpsample["Text"]
(dfCleanUpsampleRatingName, dfCleanUpsampleAuthor, dfCleanUpsampleNamedEntitiesClaim,
 dfCleanUpsampleNamedEntitiesArticle, dfCleanUpsampleKeywords, dfCleanUpsampleSource) = (
    dfCleanUpsample[name]
    for name in ('RatingName', 'Author', 'NamedEntitiesClaim',
                 'NamedEntitiesArticle', 'Keywords', 'Source')
)

# dfCleanDownsample
dfCleanDownsampleHeadlineText = dfCleanDownsample["Headline"] + " " + dfCleanDownsample["Text"]
(dfCleanDownsampleRatingName, dfCleanDownsampleAuthor, dfCleanDownsampleNamedEntitiesClaim,
 dfCleanDownsampleNamedEntitiesArticle, dfCleanDownsampleKeywords, dfCleanDownsampleSource) = (
    dfCleanDownsample[name]
    for name in ('RatingName', 'Author', 'NamedEntitiesClaim',
                 'NamedEntitiesArticle', 'Keywords', 'Source')
)

<div align="center">
    <h2>
        TF 1
    </h2>
</div>

## Make a copy of every column

In [12]:
# Take a copy of each column Series of df under a '1'-suffixed name.
(
    dfHeadlineText1,
    dfAuthor1,
    dfNamedEntitiesClaim1,
    dfNamedEntitiesArticle1,
    dfKeywords1,
    dfSource1,
    dfRatingName1,
) = (
    series.copy()
    for series in (
        dfHeadlineText,
        dfAuthor,
        dfNamedEntitiesClaim,
        dfNamedEntitiesArticle,
        dfKeywords,
        dfSource,
        dfRatingName,
    )
)

## Transform data

In [13]:
# Feature columns: label-encode each Series to integer codes, then pass the
# codes through the standard scaler. The shared encoder and scaler are
# refitted on every column, in the order listed below.
(dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1,
 dfNamedEntitiesArticle1, dfKeywords1, dfSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(series), columns=[name])
        ),
        columns=[name],
    )
    for name, series in (
        ('HeadlineText', dfHeadlineText1),
        ('Author', dfAuthor1),
        ('NamedEntitiesClaim', dfNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfNamedEntitiesArticle1),
        ('Keywords', dfKeywords1),
        ('Source', dfSource1),
    )
]

# Target column: label-encoded only, not scaled.
dfRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName1), columns=['RatingName'])

# Side-by-side concatenation; every part has a fresh 0..n-1 index.
df1 = pd.concat(
    [dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1, dfNamedEntitiesArticle1,
     dfKeywords1, dfSource1, dfRatingName1],
    axis=1,
)

display(df1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,0.386695,0.349389,-0.563930,-0.219939,0.026963,-0.602474,0
1,-0.209460,-0.087578,-0.918400,-0.219939,-0.734253,-0.602474,0
2,1.288982,0.071319,1.079522,1.462988,-1.534506,1.037171,1
3,-0.886175,0.071319,-0.435031,-1.704875,-1.183175,1.037171,0
4,1.143972,-1.517651,-0.660603,-0.219939,0.065999,-0.602474,0
...,...,...,...,...,...,...,...
210,1.627340,0.071319,1.707901,1.792974,0.495403,2.676817,1
211,-1.466217,0.230216,0.048337,-0.219939,-0.968473,-0.602474,0
212,-1.369543,-1.438203,-1.546778,-0.813914,0.163591,-0.602474,0
213,-1.256757,-1.358754,-1.192308,1.858971,-1.339322,-0.602474,1


## Saving the transformed data

In [14]:
# Persist the encoded frame as a semicolon-separated CSV, without the index.
df1.to_csv(path_or_buf='attemps/tfm1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned 1
    </h2>
</div>

## Make a copy of every column

In [15]:
# Take a copy of each column Series of dfClean under a '1'-suffixed name.
(
    dfCleanHeadlineText1,
    dfCleanAuthor1,
    dfCleanNamedEntitiesClaim1,
    dfCleanNamedEntitiesArticle1,
    dfCleanKeywords1,
    dfCleanSource1,
    dfCleanRatingName1,
) = (
    series.copy()
    for series in (
        dfCleanHeadlineText,
        dfCleanAuthor,
        dfCleanNamedEntitiesClaim,
        dfCleanNamedEntitiesArticle,
        dfCleanKeywords,
        dfCleanSource,
        dfCleanRatingName,
    )
)

## Transform data

In [16]:
# Feature columns: label-encode each Series to integer codes, then pass the
# codes through the standard scaler. The shared encoder and scaler are
# refitted on every column, in the order listed below.
(dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1,
 dfCleanNamedEntitiesArticle1, dfCleanKeywords1, dfCleanSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(series), columns=[name])
        ),
        columns=[name],
    )
    for name, series in (
        ('HeadlineText', dfCleanHeadlineText1),
        ('Author', dfCleanAuthor1),
        ('NamedEntitiesClaim', dfCleanNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfCleanNamedEntitiesArticle1),
        ('Keywords', dfCleanKeywords1),
        ('Source', dfCleanSource1),
    )
]

# Target column: label-encoded only, not scaled.
dfCleanRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName1), columns=['RatingName'])

# Side-by-side concatenation; every part has a fresh 0..n-1 index.
dfClean1 = pd.concat(
    [dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1,
     dfCleanNamedEntitiesArticle1, dfCleanKeywords1, dfCleanSource1, dfCleanRatingName1],
    axis=1,
)

display(dfClean1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,0.241684,0.349389,-0.966736,-0.158398,0.352228,-0.602474,0
1,-0.418919,-0.087578,-1.385656,-0.158398,-0.444121,-0.602474,0
2,1.433992,0.071319,0.692828,1.811483,-1.278391,1.037171,1
3,-1.611227,0.071319,-0.853951,-1.794401,-0.975020,1.037171,0
4,1.256757,-1.517651,-1.063410,-0.158398,0.390149,-0.602474,0
...,...,...,...,...,...,...,...
210,-1.643452,0.071319,1.724013,2.212137,1.015852,2.676817,1
211,-1.530666,0.230216,-0.322245,-0.158398,-0.690610,-0.602474,0
212,0.789501,-1.438203,1.611227,-0.759379,0.541835,-0.602474,0
213,-1.305094,-1.358754,-1.530666,-2.028116,-1.695526,-0.602474,1


## Saving the transformed data

In [17]:
# Persist the encoded frame as a semicolon-separated CSV, without the index.
dfClean1.to_csv(path_or_buf='attemps/tfmclean1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned upsampled 1
    </h2>
</div>

## Make a copy of every column

In [18]:
# Take a copy of each column Series of dfCleanUpsample under a '1'-suffixed name.
(
    dfCleanUpsampleHeadlineText1,
    dfCleanUpsampleAuthor1,
    dfCleanUpsampleNamedEntitiesClaim1,
    dfCleanUpsampleNamedEntitiesArticle1,
    dfCleanUpsampleKeywords1,
    dfCleanUpsampleSource1,
    dfCleanUpsampleRatingName1,
) = (
    series.copy()
    for series in (
        dfCleanUpsampleHeadlineText,
        dfCleanUpsampleAuthor,
        dfCleanUpsampleNamedEntitiesClaim,
        dfCleanUpsampleNamedEntitiesArticle,
        dfCleanUpsampleKeywords,
        dfCleanUpsampleSource,
        dfCleanUpsampleRatingName,
    )
)

## Transform data

In [19]:
# Feature columns: label-encode each Series to integer codes, then pass the
# codes through the standard scaler. The shared encoder and scaler are
# refitted on every column, in the order listed below.
(dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1, dfCleanUpsampleNamedEntitiesClaim1,
 dfCleanUpsampleNamedEntitiesArticle1, dfCleanUpsampleKeywords1, dfCleanUpsampleSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(series), columns=[name])
        ),
        columns=[name],
    )
    for name, series in (
        ('HeadlineText', dfCleanUpsampleHeadlineText1),
        ('Author', dfCleanUpsampleAuthor1),
        ('NamedEntitiesClaim', dfCleanUpsampleNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfCleanUpsampleNamedEntitiesArticle1),
        ('Keywords', dfCleanUpsampleKeywords1),
        ('Source', dfCleanUpsampleSource1),
    )
]

# Target column: label-encoded only, not scaled.
dfCleanUpsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName1), columns=['RatingName'])

# Side-by-side concatenation; every part has a fresh 0..n-1 index.
dfCleanUpsample1 = pd.concat(
    [dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1, dfCleanUpsampleNamedEntitiesClaim1,
     dfCleanUpsampleNamedEntitiesArticle1, dfCleanUpsampleKeywords1, dfCleanUpsampleSource1,
     dfCleanUpsampleRatingName1],
    axis=1,
)

display(dfCleanUpsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,0.649640,0.068817,-1.650093,-0.115344,-1.238207,1.104967,1
1,0.779928,2.021802,-0.000076,-0.037796,0.859409,-0.580816,0
2,1.598879,2.066188,0.634546,0.078525,0.487956,-0.580816,1
3,0.835765,-1.085220,0.018056,-1.549980,1.318263,-0.580816,1
4,0.128489,0.068817,0.888395,1.086648,1.165311,1.104967,0
...,...,...,...,...,...,...,...
233,0.966053,-1.884168,-0.670962,-0.115344,0.487956,-0.580816,0
234,0.817153,0.512677,0.235641,-0.115344,-0.757503,-0.580816,0
235,-1.174388,-1.307150,0.108716,0.815230,-0.189399,-0.580816,1
236,0.779928,2.021802,-0.000076,-0.037796,0.859409,-0.580816,0


## Saving the transformed data

In [20]:
# Persist the encoded frame as a semicolon-separated CSV, without the index.
dfCleanUpsample1.to_csv(path_or_buf='attemps/tfmcleanupsample1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned downsampled 1
    </h2>
</div>

## Make a copy of every column

In [21]:
# Take a copy of each column Series of dfCleanDownsample under a '1'-suffixed name.
(
    dfCleanDownsampleHeadlineText1,
    dfCleanDownsampleAuthor1,
    dfCleanDownsampleNamedEntitiesClaim1,
    dfCleanDownsampleNamedEntitiesArticle1,
    dfCleanDownsampleKeywords1,
    dfCleanDownsampleSource1,
    dfCleanDownsampleRatingName1,
) = (
    series.copy()
    for series in (
        dfCleanDownsampleHeadlineText,
        dfCleanDownsampleAuthor,
        dfCleanDownsampleNamedEntitiesClaim,
        dfCleanDownsampleNamedEntitiesArticle,
        dfCleanDownsampleKeywords,
        dfCleanDownsampleSource,
        dfCleanDownsampleRatingName,
    )
)

## Transform data

In [22]:
# Feature columns: label-encode each Series to integer codes, then pass the
# codes through the standard scaler. The shared encoder and scaler are
# refitted on every column, in the order listed below.
(dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1, dfCleanDownsampleNamedEntitiesClaim1,
 dfCleanDownsampleNamedEntitiesArticle1, dfCleanDownsampleKeywords1, dfCleanDownsampleSource1) = [
    pd.DataFrame(
        standardScaler.fit_transform(
            pd.DataFrame(classLabelEncoder.fit_transform(series), columns=[name])
        ),
        columns=[name],
    )
    for name, series in (
        ('HeadlineText', dfCleanDownsampleHeadlineText1),
        ('Author', dfCleanDownsampleAuthor1),
        ('NamedEntitiesClaim', dfCleanDownsampleNamedEntitiesClaim1),
        ('NamedEntitiesArticle', dfCleanDownsampleNamedEntitiesArticle1),
        ('Keywords', dfCleanDownsampleKeywords1),
        ('Source', dfCleanDownsampleSource1),
    )
]

# Target column: label-encoded only, not scaled.
dfCleanDownsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName1), columns=['RatingName'])

# Side-by-side concatenation; every part has a fresh 0..n-1 index.
dfCleanDownsample1 = pd.concat(
    [dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1, dfCleanDownsampleNamedEntitiesClaim1,
     dfCleanDownsampleNamedEntitiesArticle1, dfCleanDownsampleKeywords1, dfCleanDownsampleSource1,
     dfCleanDownsampleRatingName1],
    axis=1,
)

display(dfCleanDownsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,-1.578714,-1.841973,-0.983313,-0.163674,-1.674983,-0.550035,1
1,-0.135318,-0.353580,0.532252,-0.163674,0.249618,-0.550035,1
2,0.748761,0.142552,1.326119,-0.163674,1.672150,1.126262,1
3,-0.027064,0.142552,1.362204,0.411621,1.253758,1.126262,1
4,1.326119,0.142552,-1.073525,1.634122,0.458814,1.126262,1
...,...,...,...,...,...,...,...
187,1.524586,1.961700,0.658549,-0.163674,-0.880039,-0.550035,0
188,0.622464,-1.221809,-1.019398,0.950960,-0.566245,-0.550035,0
189,-1.524586,0.307929,-0.315743,-0.163674,-0.712682,-0.550035,0
190,0.838973,-1.428531,1.614798,-0.738969,0.542493,-0.550035,0


## Saving the transformed data

In [23]:
# Persist the encoded frame as a semicolon-separated CSV, without the index.
dfCleanDownsample1.to_csv(path_or_buf='attemps/tfmcleandownsample1.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF2
    </h2>
</div>

## Make a copy of every column

In [24]:
# Take a copy of each column Series of df under a '2'-suffixed name.
(
    dfHeadlineText2,
    dfAuthor2,
    dfNamedEntitiesClaim2,
    dfNamedEntitiesArticle2,
    dfKeywords2,
    dfSource2,
    dfRatingName2,
) = (
    series.copy()
    for series in (
        dfHeadlineText,
        dfAuthor,
        dfNamedEntitiesClaim,
        dfNamedEntitiesArticle,
        dfKeywords,
        dfSource,
        dfRatingName,
    )
)

## Transform data

In [25]:
# TF-IDF encode the combined headline/text: one column per vocabulary term.
_tfidf_matrix = tfidfVectorizer.fit_transform(dfHeadlineText2).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it and fall
# back to the old accessor otherwise.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    _tfidf_terms = list(tfidfVectorizer.get_feature_names_out())
else:
    _tfidf_terms = list(tfidfVectorizer.get_feature_names())

dfHeadlineText2 = pd.DataFrame(_tfidf_matrix, columns=_tfidf_terms)
dfHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText2), columns=_tfidf_terms)

# One-hot encode the source. dtype is pinned to uint8 so the columns hold 0/1
# (as in the recorded output) on pandas versions where the default is bool.
dfSource2 = pd.get_dummies(dfSource2, columns=['Source'], prefix='Source', dtype=np.uint8).reset_index(drop=True)

# Target column: label-encoded only.
dfRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName2), columns=['RatingName'])

df2 = pd.concat([dfHeadlineText2, dfSource2, dfRatingName2], axis=1)

display(df2)

Unnamed: 0,000,10,100,106,11,115,12,120,14,142,...,you,your,yucca,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.241060,9.503480,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,1,0,0,0
1,-0.241060,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,1,0,0,0
2,-0.241060,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,0,1,0,1
3,-0.241060,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,0,1,0,0
4,-0.241060,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,-0.241060,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,0,0,1,1
211,4.724125,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,1,0,0,0
212,-0.241060,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,1,0,0,0
213,-0.241060,-0.118131,-0.068359,-0.068359,-0.095489,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.170525,-0.095476,-0.068359,-0.068359,0,0,1,0,0,1


## Saving the transformed data

In [26]:
# Persist the encoded frame as a semicolon-separated CSV, without the index.
df2.to_csv(path_or_buf='attemps/tfm2.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned 2
    </h2>
</div>

## Make a copy of every column

In [27]:
# Take a copy of each column Series of dfClean under a '2'-suffixed name.
(
    dfCleanHeadlineText2,
    dfCleanAuthor2,
    dfCleanNamedEntitiesClaim2,
    dfCleanNamedEntitiesArticle2,
    dfCleanKeywords2,
    dfCleanSource2,
    dfCleanRatingName2,
) = (
    series.copy()
    for series in (
        dfCleanHeadlineText,
        dfCleanAuthor,
        dfCleanNamedEntitiesClaim,
        dfCleanNamedEntitiesArticle,
        dfCleanKeywords,
        dfCleanSource,
        dfCleanRatingName,
    )
)

## Transform data

In [28]:
# TF-IDF encode the combined headline/text: one column per vocabulary term.
_tfidf_matrix = tfidfVectorizer.fit_transform(dfCleanHeadlineText2).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it and fall
# back to the old accessor otherwise.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    _tfidf_terms = list(tfidfVectorizer.get_feature_names_out())
else:
    _tfidf_terms = list(tfidfVectorizer.get_feature_names())

dfCleanHeadlineText2 = pd.DataFrame(_tfidf_matrix, columns=_tfidf_terms)
dfCleanHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanHeadlineText2), columns=_tfidf_terms)

# One-hot encode the source. dtype is pinned to uint8 so the columns hold 0/1
# (as in the recorded output) on pandas versions where the default is bool.
dfCleanSource2 = pd.get_dummies(dfCleanSource2, columns=['Source'], prefix='Source', dtype=np.uint8).reset_index(drop=True)

# Target column: label-encoded only.
dfCleanRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName2), columns=['RatingName'])

dfClean2 = pd.concat([dfCleanHeadlineText2, dfCleanSource2, dfCleanRatingName2], axis=1)

display(dfClean2)

Unnamed: 0,1000,106000,1270,14,15th,16,16000,16b,1729000000,1940s,...,york,yost,yucca,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,1,0,0,0
1,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,1,0,0,0
2,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,0,1,0,1
3,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,0,1,0,0
4,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,0,0,1,1
211,14.628739,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,1,0,0,0
212,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,1,0,0,0
213,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.114732,-0.068359,-0.068359,-0.068359,0,0,1,0,0,1


## Saving the transformed data

In [29]:
# Persist the encoded frame as a semicolon-separated CSV, without the index.
dfClean2.to_csv(path_or_buf='attemps/tfmclean2.csv', sep=';', index=False)

<div align="center">
    <h2>
        TF cleaned upsampled 2
    </h2>
</div>

## Make a copy of every column

In [30]:
# Take a copy of each column Series of dfCleanUpsample under a '2'-suffixed name.
(
    dfCleanUpsampleHeadlineText2,
    dfCleanUpsampleAuthor2,
    dfCleanUpsampleNamedEntitiesClaim2,
    dfCleanUpsampleNamedEntitiesArticle2,
    dfCleanUpsampleKeywords2,
    dfCleanUpsampleSource2,
    dfCleanUpsampleRatingName2,
) = (
    series.copy()
    for series in (
        dfCleanUpsampleHeadlineText,
        dfCleanUpsampleAuthor,
        dfCleanUpsampleNamedEntitiesClaim,
        dfCleanUpsampleNamedEntitiesArticle,
        dfCleanUpsampleKeywords,
        dfCleanUpsampleSource,
        dfCleanUpsampleRatingName,
    )
)

## Transform data

In [31]:
# TF-IDF encode the combined headline/text: one column per vocabulary term.
_tfidf_matrix = tfidfVectorizer.fit_transform(dfCleanUpsampleHeadlineText2).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it and fall
# back to the old accessor otherwise.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    _tfidf_terms = list(tfidfVectorizer.get_feature_names_out())
else:
    _tfidf_terms = list(tfidfVectorizer.get_feature_names())

dfCleanUpsampleHeadlineText2 = pd.DataFrame(_tfidf_matrix, columns=_tfidf_terms)
dfCleanUpsampleHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleHeadlineText2), columns=_tfidf_terms)

# One-hot encode the source. dtype is pinned to uint8 so the columns hold 0/1
# (as in the recorded output) on pandas versions where the default is bool.
dfCleanUpsampleSource2 = pd.get_dummies(dfCleanUpsampleSource2, columns=['Source'], prefix='Source', dtype=np.uint8).reset_index(drop=True)

# Target column: label-encoded only.
dfCleanUpsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName2), columns=['RatingName'])

dfCleanUpsample2 = pd.concat([dfCleanUpsampleHeadlineText2, dfCleanUpsampleSource2, dfCleanUpsampleRatingName2], axis=1)

display(dfCleanUpsample2)

Unnamed: 0,106000,1270,15th,16,16000,16b,1729000000,1940s,200,20072008,...,years,york,yucca,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.064957,-0.064957,-0.064957,-0.092057,-0.130744,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,0,1,0,1
1,-0.064957,-0.064957,-0.064957,-0.092057,7.648529,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,1,0,0,0
2,-0.064957,-0.064957,-0.064957,-0.092057,-0.130744,-0.092057,15.394804,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,1,0,0,1
3,-0.064957,-0.064957,-0.064957,-0.092057,-0.130744,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,1,0,0,1
4,-0.064957,-0.064957,-0.064957,-0.092057,-0.130744,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,-0.064957,-0.064957,-0.064957,-0.092057,-0.130744,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,1,0,0,0
234,-0.064957,-0.064957,-0.064957,-0.092057,-0.130744,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,1,0,0,0
235,-0.064957,-0.064957,-0.064957,-0.092057,-0.130744,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,1,0,0,1
236,-0.064957,-0.064957,-0.064957,-0.092057,7.648529,-0.092057,-0.064957,-0.064957,-0.064957,-0.064957,...,-0.230684,-0.109005,-0.064957,-0.064957,0,0,1,0,0,0


## Saving the transformed data

In [32]:
# Write dfCleanUpsample2 to disk as a semicolon-separated CSV without the row index.
dfCleanUpsample2.to_csv(path_or_buf = 'attemps/tfmcleanupsample2.csv', index = False, sep = ';')

<div align="center">
    <h2>
        TF cleaned downsampled 2
    </h2>
</div>

## Make a copy of every column

In [33]:
# Bind a .copy() of every cleaned/downsampled column object to its "...2" name,
# which is what the transform cell of this section works on.
(
    dfCleanDownsampleHeadlineText2,
    dfCleanDownsampleAuthor2,
    dfCleanDownsampleNamedEntitiesClaim2,
    dfCleanDownsampleNamedEntitiesArticle2,
    dfCleanDownsampleKeywords2,
    dfCleanDownsampleSource2,
    dfCleanDownsampleRatingName2,
) = (
    original.copy()
    for original in (
        dfCleanDownsampleHeadlineText,
        dfCleanDownsampleAuthor,
        dfCleanDownsampleNamedEntitiesClaim,
        dfCleanDownsampleNamedEntitiesArticle,
        dfCleanDownsampleKeywords,
        dfCleanDownsampleSource,
        dfCleanDownsampleRatingName,
    )
)

## Transform data

In [34]:
# Headline text: vectorise with tfidfVectorizer, then scale every term column
# with standardScaler (both objects are defined earlier in the notebook).
# The column names are read through get_feature_names_out() when the
# vectorizer provides it: get_feature_names() was removed in scikit-learn 1.2
# and is only kept here as a fallback for older installations.
dfCleanDownsampleHeadlineText2 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfCleanDownsampleHeadlineText2).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
# Re-attach the term names after scaling; they are taken from the frame being
# scaled, so they do not depend on the vectorizer's current fitted state.
dfCleanDownsampleHeadlineText2 = pd.DataFrame(
    standardScaler.fit_transform(dfCleanDownsampleHeadlineText2),
    columns = dfCleanDownsampleHeadlineText2.columns,
)

# Source: one-hot encode into Source_* columns. The index is reset so the rows
# line up with the freshly built frames when concatenating column-wise below.
dfCleanDownsampleSource2 = pd.get_dummies(dfCleanDownsampleSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# RatingName: encode with classLabelEncoder into a single 'RatingName' column.
dfCleanDownsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName2), columns = ['RatingName'])

# Not processed yet: the frames built in this part are NOT included in the
# concat at the end of the cell.
# NOTE(review): these statements re-fit the shared classLabelEncoder,
# minMaxScaler, tfidfVectorizer and standardScaler objects, so after this cell
# their fitted state no longer corresponds to RatingName / HeadlineText.
dfCleanDownsampleAuthor2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleAuthor2), columns = ['Author'])
dfCleanDownsampleAuthor2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleAuthor2), columns = ['Author'])

dfCleanDownsampleNamedEntitiesClaim2 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfCleanDownsampleNamedEntitiesClaim2).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
dfCleanDownsampleNamedEntitiesClaim2 = pd.DataFrame(
    standardScaler.fit_transform(dfCleanDownsampleNamedEntitiesClaim2),
    columns = dfCleanDownsampleNamedEntitiesClaim2.columns,
)

dfCleanDownsampleNamedEntitiesArticle2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])
dfCleanDownsampleNamedEntitiesArticle2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])

dfCleanDownsampleKeywords2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleKeywords2), columns = ['Keywords'])
dfCleanDownsampleKeywords2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleKeywords2), columns = ['Keywords'])

# Final frame: headline term columns, Source dummies, then the encoded RatingName.
dfCleanDownsample2 = pd.concat([dfCleanDownsampleHeadlineText2, dfCleanDownsampleSource2, dfCleanDownsampleRatingName2], axis = 1)

display(dfCleanDownsample2)

Unnamed: 0,1000,106000,1270,14,15th,16,16000,16b,1729000000,1940s,...,york,yost,yucca,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.072357,-0.072357,13.820275,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,1,0,0,1
1,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,1,0,0,1
2,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,0,1,0,1
3,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,0,1,0,1
4,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,1,0,0,0
188,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,1,0,0,0
189,13.820275,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,1,0,0,0
190,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,-0.072357,...,-0.121348,-0.072357,-0.072357,-0.072357,0,0,1,0,0,0


## Saving the transformed data

In [35]:
# Write dfCleanDownsample2 to disk as a semicolon-separated CSV without the row index.
dfCleanDownsample2.to_csv(path_or_buf = 'attemps/tfmcleandownsample2.csv', index = False, sep = ';')

<div align="center">
    <h2>
        TF3
    </h2>
</div>

## Make a copy of every column

In [36]:
# Bind a .copy() of every column object to its "...3" name, which is what the
# transform cell of this section works on.
(
    dfHeadlineText3,
    dfAuthor3,
    dfNamedEntitiesClaim3,
    dfNamedEntitiesArticle3,
    dfKeywords3,
    dfSource3,
    dfRatingName3,
) = (
    original.copy()
    for original in (
        dfHeadlineText,
        dfAuthor,
        dfNamedEntitiesClaim,
        dfNamedEntitiesArticle,
        dfKeywords,
        dfSource,
        dfRatingName,
    )
)

## Transform data

In [37]:
# Headline text, three passes: tfidfVectorizer -> tfidfTransformer1 ->
# standardScaler (all three objects are defined earlier in the notebook).
# The column names are read through get_feature_names_out() when the
# vectorizer provides it: get_feature_names() was removed in scikit-learn 1.2
# and is only kept here as a fallback for older installations.
dfHeadlineText3 = pd.DataFrame(
    tfidfVectorizer.fit_transform(dfHeadlineText3).toarray(),
    columns = (
        tfidfVectorizer.get_feature_names_out()
        if hasattr(tfidfVectorizer, 'get_feature_names_out')
        else tfidfVectorizer.get_feature_names()
    ),
)
# Second pass through tfidfTransformer1, applied on top of the vectorizer
# output; the term names are carried over from the frame being transformed.
dfHeadlineText3 = pd.DataFrame(
    tfidfTransformer1.fit_transform(dfHeadlineText3).toarray(),
    columns = dfHeadlineText3.columns,
)
# Scale every term column, again keeping the same term names.
dfHeadlineText3 = pd.DataFrame(
    standardScaler.fit_transform(dfHeadlineText3),
    columns = dfHeadlineText3.columns,
)

# Source: one-hot encode into Source_* columns. The index is reset so the rows
# line up with the freshly built frames when concatenating column-wise below.
dfSource3 = pd.get_dummies(dfSource3, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# RatingName: encode with classLabelEncoder into a single 'RatingName' column.
dfRatingName3 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName3), columns = ['RatingName'])

# Final frame: headline term columns, Source dummies, then the encoded RatingName.
df3 = pd.concat([dfHeadlineText3, dfSource3, dfRatingName3], axis = 1)

display(df3)

Unnamed: 0,000,10,100,106,11,115,12,120,14,142,...,you,your,yucca,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.240663,9.294578,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,1,0,0,0
1,-0.240663,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,1,0,0,0
2,-0.240663,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,0,1,0,1
3,-0.240663,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,0,1,0,0
4,-0.240663,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,-0.240663,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,0,0,1,1
211,4.818027,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,1,0,0,0
212,-0.240663,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,1,0,0,0
213,-0.240663,-0.117908,-0.068359,-0.068359,-0.095188,-0.068359,-0.068359,-0.068359,-0.068359,-0.068359,...,-0.167005,-0.095093,-0.068359,-0.068359,0,0,1,0,0,1


## Saving the transformed data

In [38]:
# Write df3 to disk as a semicolon-separated CSV without the row index.
df3.to_csv(path_or_buf = 'attemps/tfm3.csv', index = False, sep = ';')