<div align="center"><h1> Data Science Project </h1></div>
<div align="center"><h2> Classification of assertions according to their veracity values ( automatic fact-checking ) </h2></div>
<h2>Group members</h2>
<ul>
    <li>Meriem AMERAOUI</li>
    <li>Dounia BELABIOD</li>
    <li>Jihene BOUHLEL</li>
    <li>Bahaa Eddine NIL</li>
</ul>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Basics
    </h1>
</div>

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import inflect
import re
import nltk
import contractions

from functools import reduce
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Cleaning the text
def remove_non_ascii(words):
    """Return the tokens with every non-ASCII character removed.

    Each token is NFKD-normalized before the ASCII round-trip, so an accented
    letter is decomposed and only its ASCII base letter survives.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list with every token converted to lowercase."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from every token and drop the tokens that end up empty.

    Anything that is neither a word character nor whitespace is removed.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Spell out the tokens made only of digits; leave every other token untouched."""
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Return the tokens that are not English stopwords.

    The NLTK stopword list is loaded once per call and turned into a set, so
    each membership test is a constant-time lookup instead of a fresh corpus
    read followed by a list scan.
    """
    english_stopwords = set(stopwords.words('english'))
    return [token for token in words if token not in english_stopwords]

def separate_letter_number(words):
    """Split every token into alternating runs of digits and non-digits.

    For example ``'abc123'`` becomes ``'abc', '123'``. The pieces of all tokens
    are returned as one flat list; an empty input yields an empty list.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return [piece for token in words for piece in re.findall(r'\d+|\D+', token)]

def replace_contractions(text):
    """Return ``text`` with its contractions expanded by ``contractions.fix``."""
    return contractions.fix(text)

def normalize(words):
    """Run the token list through the cleaning steps, in order.

    Steps: ASCII folding, lowercasing, spelling out numbers, punctuation
    removal, stopword removal. ``separate_letter_number`` is currently not part
    of the pipeline.
    """
    cleaning_steps = (
        remove_non_ascii,
        to_lowercase,
        replace_numbers,
        remove_punctuation,
        remove_stopwords,
    )
    for step in cleaning_steps:
        words = step(words)
    return words

def clean_text(text):
    """Tokenize ``text``, normalize the tokens and join them back with single spaces.

    Contraction expansion, stemming and lemmatization are not applied, so no
    stemmer or lemmatizer object is created here.
    """
    tokens = normalize(word_tokenize(text))
    return " ".join(tokens).strip()

# Up-sampling & Down-sampling
def sampling(df_func, *args, **kwargs):
    """Balance the most and least frequent ``RatingName`` classes of a DataFrame.

    Parameters
    ----------
    df_func : pandas.DataFrame
        Frame with a ``RatingName`` column.
    *args
        Ignored; kept so existing calls keep working.
    **kwargs
        ``sample`` selects the strategy: ``'up'`` resamples the least frequent
        class with replacement up to the size of the most frequent one,
        ``'down'`` resamples the most frequent class without replacement down
        to the size of the least frequent one.

    Returns
    -------
    pandas.DataFrame
        The balanced rows of the most and least frequent classes, with a fresh
        index. Rows of any other class are not part of the result.

    Raises
    ------
    ValueError
        If ``sample`` is missing or is neither ``'up'`` nor ``'down'``.
    """
    sampling_type = kwargs.get("sample", None)
    if sampling_type not in ('up', 'down'):
        # Fail loudly instead of printing and implicitly returning None.
        raise ValueError("Please select something in ['up', 'down']")

    # value_counts() sorts by frequency: first entry = majority, last = minority.
    class_counts = df_func['RatingName'].value_counts()
    majority_label, minority_label = class_counts.index[0], class_counts.index[-1]
    majority_size, minority_size = int(class_counts.iloc[0]), int(class_counts.iloc[-1])

    majority = df_func[df_func['RatingName'] == majority_label].reset_index(drop = True)
    minority = df_func[df_func['RatingName'] == minority_label].reset_index(drop = True)

    if sampling_type == 'up':
        upsampled = resample(minority, replace = True, n_samples = majority_size, random_state = 123)
        # Seed the shuffle as well so the result is reproducible on re-run.
        return (pd.concat([majority, upsampled])
                .sample(frac = 1, random_state = 123)
                .reset_index(drop = True))

    downsampled = resample(majority, replace = False, n_samples = minority_size, random_state = 123)
    return pd.concat([downsampled, minority]).reset_index(drop = True)

# Download each NLTK resource only when it cannot be found locally.
for resource_path, package_id in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Reading data
    </h1>
</div>

In [2]:
# Load the raw dataset; the file uses ';' as field separator.
df = pd.read_csv('datasets/generated.csv', sep = ';')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Pre-processing
    </h1>
</div>

## Remove unnecessary columns

In [3]:
# Keep only the columns used by the encodings below.
df = df.drop(
    columns = ['ID', 'Date', 'TruthRating', 'SourceURL', 'Link', 'Language']
)
display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,Malia Obama cashed a $1.2 million tax refund c...,OTHER,Unknown,Did Malia Obama Cash a $1.2 Million Check?,"Facebook,Fan Fiction,Junk News,Malia Obama,Sno...",Malia Obama,,truthorfiction
1,High diver is saved from jumping into a draine...,OTHER,Unknown,High Diver Saved By Cross,"Cincinnati Post,Islam,Scripture lesson,Univers...",shadow on the wall,ASP Article,snopes
2,'And the revenue generated by drilling off Vir...,MIXTURE,Jim Moran,Moran says drilling off Virginia's coast will ...,"Alaska,American Petroleum Institute,Atlantic O...",,"Energy,State Finances",politifact
3,Health insurance companies pay CEOs $24 millio...,MIXTURE,Health Care for America Now,Health care advocacy group blasts insurers for...,"Aetna,Assurant,Bloomberg News,Cigna,Coventry H...",,"Corporations,Health Care",politifact
4,Ted Cruz said that veterans should start selli...,FALSE,Unknown,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes


## Remove unnecessary rows

In [4]:
# Drop the claims rated OTHER. MIXTURE claims are kept: only 'OTHER' is filtered out here.
df = df[df.RatingName != 'OTHER']

display(df.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,840,840,840,840,839,523,801,840
unique,840,3,339,840,839,470,581,5
top,One of the 'main functions' of the Department ...,FALSE,Unknown,Florida Democrats say Donald Trump denied over...,"Donald J. Trump,Electoral College,Eric Trump,O...",Donald Trump,ASP Article,politifact
freq,1,358,345,1,1,12,31,493


## Replacing "Unknown" & NaN with "Inconnue"

In [5]:
# Use one placeholder ('Inconnue') for both the literal 'Unknown' and missing values.
# A new frame is built instead of calling replace(..., inplace = True) on df[column]:
# that form mutates a column selection of a filtered frame, which triggers
# chained-assignment warnings and has no effect under pandas copy-on-write.
# fillna also avoids np.NaN, an alias that NumPy 2.0 removed.
df = df.replace(to_replace = 'Unknown', value = 'Inconnue').fillna('Inconnue')

display(df.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,840,840,840,840,840,840,840,840
unique,840,3,339,840,840,471,582,5
top,One of the 'main functions' of the Department ...,FALSE,Inconnue,Florida Democrats say Donald Trump denied over...,"Donald J. Trump,Electoral College,Eric Trump,O...",Inconnue,Inconnue,politifact
freq,1,358,345,1,1,317,39,493


## Rename TRUE and FALSE to NONMIXTURE

In [6]:
# Merge the TRUE and FALSE ratings into a single 'nonmixture' class.
# assign() returns a new frame, so the result does not depend on an in-place
# replace on a column selection (which has no effect under pandas copy-on-write).
df = df.assign(RatingName = df['RatingName'].replace(['TRUE', 'FALSE'], 'nonmixture'))

display(df.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,840,840,840,840,840,840,840,840
unique,840,2,339,840,840,471,582,5
top,One of the 'main functions' of the Department ...,nonmixture,Inconnue,Florida Democrats say Donald Trump denied over...,"Donald J. Trump,Electoral College,Eric Trump,O...",Inconnue,Inconnue,politifact
freq,1,488,345,1,1,317,39,493


## Cleaning

In [7]:
# Apply the text-cleaning pipeline to every value of every column; df itself is left untouched.
dfClean = df.apply(lambda column: column.apply(clean_text))

display(dfClean.head())
display(dfClean.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
2,revenue generated drilling virginia coast fort...,mixture,jim moran,moran says drilling virginia coast net forty m...,alaska american petroleum institute atlantic o...,inconnue,energy state finances,politifact
3,health insurance companies pay ceos twentyfour...,mixture,health care america,health care advocacy group blasts insurers ceo...,aetna assurant bloomberg news cigna coventry h...,inconnue,corporations health care,politifact
4,ted cruz said veterans start selling cookies o...,nonmixture,inconnue,ted cruz vets sell cookies funding like girl s...,david nelson james morrison john scalzi republ...,ted cruz,asp article necessarily news,snopes
5,passengers airliner diverted cuba thought hija...,mixture,inconnue,airline passengers mistake hijacking candid ca...,allen funt associated press cbs candid camera ...,allen funt candid camera cuba,broadcast legends radio tv television,snopes
6,oil money federal leases used clean mess damag...,mixture,bill nelson,sen bill nelson says offshore drilling wo nt p...,alabama bill nelson dfla florida panhandle gul...,inconnue,environment,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,840,840,840,840,840,840,840,840
unique,840,2,339,840,840,467,582,5
top,austin 1000 city employees make sixfigure sala...,nonmixture,inconnue,rep virginia foxx warns mismatch skills availa...,brett kavanaugh christine blasey ford facebook...,inconnue,inconnue,politifact
freq,1,488,345,1,1,317,39,493


## Upsampling

In [8]:
# Balance the classes by resampling the least frequent class with replacement.
dfCleanUpsample = sampling(dfClean, sample = 'up')
display(dfCleanUpsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,976,976,976,976,976,976,976,976
unique,745,2,297,745,745,428,524,5
top,gov romney plan would cut taxes folks top,mixture,inconnue,barack obama says mitt romney tax plan would c...,alternative minimum tax barack obama bush tax ...,inconnue,inconnue,politifact
freq,8,488,368,8,8,372,46,606


## Downsampling

In [9]:
# Balance the classes by resampling the most frequent class without replacement.
dfCleanDownsample = sampling(dfClean, sample = 'down')
display(dfCleanDownsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,704,704,704,704,704,704,704,704
unique,704,2,308,704,704,392,505,5
top,dozens artists including justin timberlake jay...,mixture,inconnue,rep virginia foxx warns mismatch skills availa...,ann coulter barbara ehrenreich domestic violen...,inconnue,inconnue,politifact
freq,1,352,273,1,1,269,35,429


At this point we have df, dfClean, dfCleanUpsample and dfCleanDownsample

<div class="alert alert-block alert-info" align="center">
    <h1>
        Encoding
    </h1>
</div>

In [10]:
# Estimator instances shared by the encoding cells below. classLabelEncoder, tfidfVectorizer and
# standardScaler are re-fit by each of those cells.
# NOTE(review): the two TfidfTransformers, countVectorizer and minMaxScaler are not used in the
# cells reviewed here; confirm they are needed further down.
classLabelEncoder = LabelEncoder()

tfidfVectorizer = TfidfVectorizer()

tfidfTransformer1 = TfidfTransformer()
# Same transformer with IDF weighting switched off.
tfidfTransformer2 = TfidfTransformer(use_idf = False)

countVectorizer = CountVectorizer()

standardScaler = StandardScaler()
minMaxScaler = MinMaxScaler()

## Splitting the dataframe

In [11]:
# For each of the four frames, pull out one Series per feature. Headline and Text are merged
# into a single text column, separated by a space.

# df: columns before text cleaning
dfHeadlineText = df["Headline"] + " " + df["Text"]
dfRatingName = df['RatingName']
dfAuthor = df['Author']
dfNamedEntitiesClaim = df['NamedEntitiesClaim']
dfNamedEntitiesArticle = df['NamedEntitiesArticle']
dfKeywords = df['Keywords']
dfSource = df['Source']

# dfClean: columns after text cleaning
dfCleanHeadlineText = dfClean["Headline"] + " " + dfClean["Text"]
dfCleanRatingName = dfClean['RatingName']
dfCleanAuthor = dfClean['Author']
dfCleanNamedEntitiesClaim = dfClean['NamedEntitiesClaim']
dfCleanNamedEntitiesArticle = dfClean['NamedEntitiesArticle']
dfCleanKeywords = dfClean['Keywords']
dfCleanSource = dfClean['Source']

# dfCleanUpsample: cleaned columns, classes balanced by up-sampling
dfCleanUpsampleHeadlineText = dfCleanUpsample["Headline"] + " " + dfCleanUpsample["Text"]
dfCleanUpsampleRatingName = dfCleanUpsample['RatingName']
dfCleanUpsampleAuthor = dfCleanUpsample['Author']
dfCleanUpsampleNamedEntitiesClaim = dfCleanUpsample['NamedEntitiesClaim']
dfCleanUpsampleNamedEntitiesArticle = dfCleanUpsample['NamedEntitiesArticle']
dfCleanUpsampleKeywords = dfCleanUpsample['Keywords']
dfCleanUpsampleSource = dfCleanUpsample['Source']

# dfCleanDownsample: cleaned columns, classes balanced by down-sampling
dfCleanDownsampleHeadlineText = dfCleanDownsample["Headline"] + " " + dfCleanDownsample["Text"]
dfCleanDownsampleRatingName = dfCleanDownsample['RatingName']
dfCleanDownsampleAuthor = dfCleanDownsample['Author']
dfCleanDownsampleNamedEntitiesClaim = dfCleanDownsample['NamedEntitiesClaim']
dfCleanDownsampleNamedEntitiesArticle = dfCleanDownsample['NamedEntitiesArticle']
dfCleanDownsampleKeywords = dfCleanDownsample['Keywords']
dfCleanDownsampleSource = dfCleanDownsample['Source']

<div align="center">
    <h2>
        TF 1
    </h2>
</div>

## Make a copy of every column

In [12]:
# Work on copies so the shared column Series stay available, unchanged, for the other encodings.
(dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1, dfNamedEntitiesArticle1,
 dfKeywords1, dfSource1, dfRatingName1) = (
    series.copy()
    for series in (dfHeadlineText, dfAuthor, dfNamedEntitiesClaim, dfNamedEntitiesArticle,
                   dfKeywords, dfSource, dfRatingName)
)

## Transform data

In [13]:
# Encoding 1 on the uncleaned data: every feature column is label-encoded (one integer id per
# distinct string) and that id column is then standardized. The shared classLabelEncoder and
# standardScaler are re-fit column after column, so statement order determines the state they
# hold once this cell has run.
dfHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfHeadlineText1), 
                               columns = ['HeadlineText'])
dfHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText1), 
                               columns = ['HeadlineText'])

dfAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfAuthor1), 
                         columns = ['Author'])
dfAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfAuthor1), 
                         columns = ['Author'])

dfNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfNamedEntitiesClaim1), 
                                     columns = ['NamedEntitiesClaim'])
dfNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfNamedEntitiesClaim1), 
                                     columns = ['NamedEntitiesClaim'])

dfNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfNamedEntitiesArticle1), 
                                       columns = ['NamedEntitiesArticle'])
dfNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfNamedEntitiesArticle1), 
                                       columns = ['NamedEntitiesArticle'])

dfKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfKeywords1), 
                           columns = ['Keywords'])
dfKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfKeywords1), 
                           columns = ['Keywords'])

dfSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfSource1), 
                         columns = ['Source'])
dfSource1 = pd.DataFrame(standardScaler.fit_transform(dfSource1), 
                         columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName1), 
                             columns = ['RatingName'])

# Put the features and the target side by side; each frame above was rebuilt from an array.
df1 = pd.concat([dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1, dfNamedEntitiesArticle1, 
                 dfKeywords1, dfSource1, dfRatingName1], axis = 1)

display(df1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,0.422703,0.098369,-0.682511,-0.113186,-0.006815,-0.648644,0
1,-0.175267,-0.221903,-0.954691,-0.113186,-0.701226,-0.648644,0
2,1.305225,-0.157849,1.169135,1.384441,-1.558293,1.033025,1
3,-0.962938,-0.157849,-0.571165,-1.778249,-1.145400,1.033025,0
4,1.156763,-1.464561,-0.731998,-0.113186,0.005697,-0.648644,0
...,...,...,...,...,...,...,...
835,-1.503174,-1.643914,0.220630,-0.755026,-0.476012,-0.648644,1
836,-0.674263,-1.362074,0.633024,-0.113186,-0.338381,-0.648644,1
837,0.162895,-0.157849,0.954691,-0.113186,-1.677157,1.033025,0
838,1.532041,-0.157849,-1.358836,0.147271,1.563432,1.033025,1


## Saving the transformed data

In [14]:
# Save the encoded frame as a ';'-separated file, without the index column.
# NOTE(review): the 'attemps/' directory presumably has to exist beforehand; confirm.
df1.to_csv('attemps/tfm1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned 1
    </h2>
</div>

## Make a copy of every column

In [15]:
# Work on copies so the cleaned column Series stay available, unchanged, for the other encodings.
(dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1, dfCleanNamedEntitiesArticle1,
 dfCleanKeywords1, dfCleanSource1, dfCleanRatingName1) = (
    series.copy()
    for series in (dfCleanHeadlineText, dfCleanAuthor, dfCleanNamedEntitiesClaim,
                   dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource,
                   dfCleanRatingName)
)

## Transform data

In [16]:
# Encoding 1 on the cleaned data: every feature column is label-encoded (one integer id per
# distinct string) and that id column is then standardized. The shared classLabelEncoder and
# standardScaler are re-fit column after column, so statement order determines the state they
# hold once this cell has run.
dfCleanHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanHeadlineText1), 
                                    columns = ['HeadlineText'])
dfCleanHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfCleanHeadlineText1), 
                                    columns = ['HeadlineText'])

dfCleanAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanAuthor1), 
                              columns = ['Author'])
dfCleanAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfCleanAuthor1), 
                              columns = ['Author'])

dfCleanNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanNamedEntitiesClaim1), 
                                          columns = ['NamedEntitiesClaim'])
dfCleanNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfCleanNamedEntitiesClaim1), 
                                          columns = ['NamedEntitiesClaim'])

dfCleanNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanNamedEntitiesArticle1), 
                                            columns = ['NamedEntitiesArticle'])
dfCleanNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfCleanNamedEntitiesArticle1), 
                                            columns = ['NamedEntitiesArticle'])

dfCleanKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanKeywords1), 
                                columns = ['Keywords'])
dfCleanKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfCleanKeywords1), 
                                columns = ['Keywords'])

dfCleanSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanSource1), 
                              columns = ['Source'])
dfCleanSource1 = pd.DataFrame(standardScaler.fit_transform(dfCleanSource1), 
                              columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfCleanRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName1), 
                                  columns = ['RatingName'])

# Put the features and the target side by side; each frame above was rebuilt from an array.
dfClean1 = pd.concat([dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1, dfCleanNamedEntitiesArticle1, 
                      dfCleanKeywords1, dfCleanSource1, dfCleanRatingName1], axis = 1)

display(dfClean1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,0.245374,0.097587,-1.127896,-0.005501,0.448263,-0.648644,0
1,-0.393836,-0.223172,-1.441315,-0.005501,-0.393002,-0.648644,0
2,1.367084,-0.159020,0.760866,1.767348,-1.303412,1.033025,1
3,-1.651635,-0.159020,-1.028921,-1.919800,-0.957686,1.033025,0
4,1.152639,-1.454884,-1.177383,-0.005501,0.459787,-0.648644,0
...,...,...,...,...,...,...,...
835,-1.527917,-1.634509,-0.183515,-0.703324,-0.122184,-0.648644,1
836,0.628900,-1.352242,0.224754,-0.005501,0.119824,-0.648644,1
837,-0.105160,-0.159020,0.496934,-0.005501,-1.487798,1.033025,0
838,-0.162895,-0.159020,1.602148,0.277401,0.367594,1.033025,1


## Saving the transformed data

In [17]:
# Save the encoded frame as a ';'-separated file, without the index column.
# NOTE(review): the 'attemps/' directory presumably has to exist beforehand; confirm.
dfClean1.to_csv('attemps/tfmclean1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned upsampled 1
    </h2>
</div>

## Make a copy of every column

In [18]:
# Work on copies so the up-sampled column Series stay available, unchanged, for the other encodings.
(dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1, dfCleanUpsampleNamedEntitiesClaim1,
 dfCleanUpsampleNamedEntitiesArticle1, dfCleanUpsampleKeywords1, dfCleanUpsampleSource1,
 dfCleanUpsampleRatingName1) = (
    series.copy()
    for series in (dfCleanUpsampleHeadlineText, dfCleanUpsampleAuthor,
                   dfCleanUpsampleNamedEntitiesClaim, dfCleanUpsampleNamedEntitiesArticle,
                   dfCleanUpsampleKeywords, dfCleanUpsampleSource, dfCleanUpsampleRatingName)
)

## Transform data

In [19]:
# Encoding 1 on the cleaned, up-sampled data: every feature column is label-encoded (one integer
# id per distinct string) and that id column is then standardized. The shared classLabelEncoder
# and standardScaler are re-fit column after column, so statement order determines the state
# they hold once this cell has run.
dfCleanUpsampleHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleHeadlineText1), 
                                            columns = ['HeadlineText'])
dfCleanUpsampleHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleHeadlineText1), 
                                            columns = ['HeadlineText'])

dfCleanUpsampleAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleAuthor1), 
                                      columns = ['Author'])
dfCleanUpsampleAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleAuthor1), 
                                      columns = ['Author'])

dfCleanUpsampleNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleNamedEntitiesClaim1), 
                                                  columns = ['NamedEntitiesClaim'])
dfCleanUpsampleNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleNamedEntitiesClaim1), 
                                                  columns = ['NamedEntitiesClaim'])

dfCleanUpsampleNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleNamedEntitiesArticle1), 
                                                    columns = ['NamedEntitiesArticle'])
dfCleanUpsampleNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleNamedEntitiesArticle1), 
                                                    columns = ['NamedEntitiesArticle'])

dfCleanUpsampleKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleKeywords1), 
                                        columns = ['Keywords'])
dfCleanUpsampleKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleKeywords1), 
                                        columns = ['Keywords'])

dfCleanUpsampleSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleSource1), 
                                      columns = ['Source'])
dfCleanUpsampleSource1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleSource1), 
                                      columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfCleanUpsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName1), 
                                          columns = ['RatingName'])

# Put the features and the target side by side; each frame above was rebuilt from an array.
dfCleanUpsample1 = pd.concat([dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1, dfCleanUpsampleNamedEntitiesClaim1, 
                              dfCleanUpsampleNamedEntitiesArticle1, dfCleanUpsampleKeywords1, dfCleanUpsampleSource1, 
                              dfCleanUpsampleRatingName1], axis = 1)

display(dfCleanUpsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,1.406268,1.747303,-0.807133,-0.018744,0.226444,-0.618923,0
1,0.093583,-0.129688,0.158028,-0.712800,-1.039616,1.116907,0
2,-1.092067,1.933570,1.726415,-0.018744,1.453145,-0.618923,0
3,-0.085205,0.472096,-1.503162,-2.090554,0.705317,-0.618923,0
4,-1.204986,-1.018034,0.506043,-0.018744,1.446585,-0.618923,0
...,...,...,...,...,...,...,...
971,-1.299085,1.317458,0.106986,-0.018744,-1.630007,-0.618923,0
972,-1.317905,-0.043719,1.522246,-0.018744,0.272364,-0.618923,0
973,-0.334568,-0.129688,-0.087902,-0.153412,-0.554184,1.116907,1
974,0.978116,-0.129688,0.060584,1.421164,-1.557848,1.116907,0


## Saving the transformed data

In [20]:
# Save the encoded frame as a ';'-separated file, without the index column.
# NOTE(review): the 'attemps/' directory presumably has to exist beforehand; confirm.
dfCleanUpsample1.to_csv('attemps/tfmcleanupsample1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned downsampled 1
    </h2>
</div>

## Make a copy of every column

In [21]:
# Work on copies so the down-sampled column Series stay available, unchanged, for the other encodings.
(dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1, dfCleanDownsampleNamedEntitiesClaim1,
 dfCleanDownsampleNamedEntitiesArticle1, dfCleanDownsampleKeywords1, dfCleanDownsampleSource1,
 dfCleanDownsampleRatingName1) = (
    series.copy()
    for series in (dfCleanDownsampleHeadlineText, dfCleanDownsampleAuthor,
                   dfCleanDownsampleNamedEntitiesClaim, dfCleanDownsampleNamedEntitiesArticle,
                   dfCleanDownsampleKeywords, dfCleanDownsampleSource,
                   dfCleanDownsampleRatingName)
)

## Transform data

In [22]:

# Encoding 1 on the cleaned, down-sampled data: every feature column is label-encoded (one
# integer id per distinct string) and that id column is then standardized. The shared
# classLabelEncoder and standardScaler are re-fit column after column, so statement order
# determines the state they hold once this cell has run.
dfCleanDownsampleHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleHeadlineText1), 
                                              columns = ['HeadlineText'])
dfCleanDownsampleHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleHeadlineText1), 
                                              columns = ['HeadlineText'])

dfCleanDownsampleAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleAuthor1), 
                                        columns = ['Author'])
dfCleanDownsampleAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleAuthor1), 
                                        columns = ['Author'])

dfCleanDownsampleNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleNamedEntitiesClaim1), 
                                                    columns = ['NamedEntitiesClaim'])
dfCleanDownsampleNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleNamedEntitiesClaim1), 
                                                    columns = ['NamedEntitiesClaim'])

dfCleanDownsampleNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleNamedEntitiesArticle1), 
                                                      columns = ['NamedEntitiesArticle'])
dfCleanDownsampleNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleNamedEntitiesArticle1), 
                                                      columns = ['NamedEntitiesArticle'])

dfCleanDownsampleKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleKeywords1), 
                                          columns = ['Keywords'])
dfCleanDownsampleKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleKeywords1), 
                                          columns = ['Keywords'])

dfCleanDownsampleSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleSource1), 
                                        columns = ['Source'])
dfCleanDownsampleSource1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleSource1), 
                                        columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfCleanDownsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName1), 
                                            columns = ['RatingName'])

# Put the features and the target side by side; each frame above was rebuilt from an array.
dfCleanDownsample1 = pd.concat([dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1, 
                                dfCleanDownsampleNamedEntitiesClaim1, dfCleanDownsampleNamedEntitiesArticle1, 
                                dfCleanDownsampleKeywords1, dfCleanDownsampleSource1, dfCleanDownsampleRatingName1], 
                               axis = 1)

display(dfCleanDownsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,-0.504362,-0.526022,0.706107,0.641511,-0.118945,-0.616084,1
1,-1.586895,-0.146855,1.178485,0.045226,-0.574328,1.071554,1
2,-0.794678,-0.146855,1.434356,1.924086,0.832004,1.071554,1
3,-1.021025,-1.059664,0.553568,0.045226,1.468202,-0.616084,1
4,1.291659,-0.146855,-1.675466,-2.182404,-0.051976,-3.991360,1
...,...,...,...,...,...,...,...
699,1.050549,1.678762,-0.829122,-1.867386,0.932456,-0.616084,0
700,1.158802,1.833237,0.784836,0.045226,1.528473,-0.616084,0
701,-0.740551,1.032774,-1.084993,0.045226,0.215896,-0.616084,0
702,-0.012302,-1.733738,-1.409753,0.045226,0.852094,-0.616084,0


## Saving the transformed data

In [23]:
# Save the encoded frame as a ';'-separated file, without the index column.
# NOTE(review): the 'attemps/' directory presumably has to exist beforehand; confirm.
dfCleanDownsample1.to_csv('attemps/tfmcleandownsample1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF2
    </h2>
</div>

## Make a copy of every column

In [24]:
# Work on copies so the shared column Series stay available, unchanged, for the other encodings.
(dfHeadlineText2, dfAuthor2, dfNamedEntitiesClaim2, dfNamedEntitiesArticle2,
 dfKeywords2, dfSource2, dfRatingName2) = (
    series.copy()
    for series in (dfHeadlineText, dfAuthor, dfNamedEntitiesClaim, dfNamedEntitiesArticle,
                   dfKeywords, dfSource, dfRatingName)
)

## Transform data

In [25]:
# Encoding 2 on the uncleaned data: TF-IDF of "Headline + Text" (one standardized column per
# term), one-hot encoded Source, label-encoded target.
headlineTfidf2 = tfidfVectorizer.fit_transform(dfHeadlineText2).toarray()
# get_feature_names() was removed in scikit-learn 1.2. Prefer get_feature_names_out() and fall
# back to the old name on releases that predate it.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    headlineTerms2 = list(tfidfVectorizer.get_feature_names_out())
else:
    headlineTerms2 = list(tfidfVectorizer.get_feature_names())

dfHeadlineText2 = pd.DataFrame(headlineTfidf2, columns = headlineTerms2)
dfHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText2),
                               columns = headlineTerms2)

# dfSource2 is a Series, for which get_dummies ignores `columns`, so that argument is dropped.
# dtype = int keeps the 0/1 indicators shown in the output; pandas >= 2.0 defaults to booleans.
dfSource2 = pd.get_dummies(dfSource2, prefix = 'Source', dtype = int).reset_index(drop = True)

dfRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName2), columns = ['RatingName'])

df2 = pd.concat([dfHeadlineText2, dfSource2, dfRatingName2], axis = 1)

display(df2)

Unnamed: 0,000,05,07,08,10,100,100k,101st,102,106,...,zipper,zippered,zoning,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.220509,-0.034524,-0.034524,-0.034524,7.442563,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,0
1,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,0
2,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,1
3,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,0
4,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,1
836,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,1
837,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,0
838,-0.220509,-0.034524,-0.034524,-0.034524,-0.130017,-0.093071,-0.048656,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,1


## Saving the transformed data

In [26]:
# Save the encoded frame as a ';'-separated file, without the index column.
# NOTE(review): the 'attemps/' directory presumably has to exist beforehand; confirm.
df2.to_csv('attemps/tfm2.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned 2
    </h2>
</div>

## Make a copy of every column

In [27]:
# Work on copies so the cleaned column Series stay available, unchanged, for the other encodings.
(dfCleanHeadlineText2, dfCleanAuthor2, dfCleanNamedEntitiesClaim2, dfCleanNamedEntitiesArticle2,
 dfCleanKeywords2, dfCleanSource2, dfCleanRatingName2) = (
    series.copy()
    for series in (dfCleanHeadlineText, dfCleanAuthor, dfCleanNamedEntitiesClaim,
                   dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource,
                   dfCleanRatingName)
)

## Transform data

In [28]:
# Encoding 2 on the cleaned data: TF-IDF of "Headline + Text" (one standardized column per
# term), one-hot encoded Source, label-encoded target.
cleanHeadlineTfidf2 = tfidfVectorizer.fit_transform(dfCleanHeadlineText2).toarray()
# get_feature_names() was removed in scikit-learn 1.2. Prefer get_feature_names_out() and fall
# back to the old name on releases that predate it.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    cleanHeadlineTerms2 = list(tfidfVectorizer.get_feature_names_out())
else:
    cleanHeadlineTerms2 = list(tfidfVectorizer.get_feature_names())

dfCleanHeadlineText2 = pd.DataFrame(cleanHeadlineTfidf2, columns = cleanHeadlineTerms2)
dfCleanHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanHeadlineText2),
                                    columns = cleanHeadlineTerms2)

# dfCleanSource2 is a Series, for which get_dummies ignores `columns`, so that argument is dropped.
# dtype = int keeps the 0/1 indicators shown in the output; pandas >= 2.0 defaults to booleans.
dfCleanSource2 = pd.get_dummies(dfCleanSource2, prefix = 'Source', dtype = int).reset_index(drop = True)

dfCleanRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName2), columns = ['RatingName'])

dfClean2 = pd.concat([dfCleanHeadlineText2, dfCleanSource2, dfCleanRatingName2], axis = 1)

display(dfClean2)

Unnamed: 0,05,1000,10000,100000,100k,101st,102000,105,106000,11yearold,...,zipper,zippered,zoning,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,0
1,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,0
2,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,1
3,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,0
4,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,1
836,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,1,0,0,1
837,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,0
838,-0.04709,-0.05896,-0.034524,-0.054431,-0.048634,-0.034524,-0.034524,-0.034524,-0.034524,-0.034524,...,-0.034524,-0.034524,-0.034524,-0.034524,0,0,0,1,0,1


## Saving the transformed data

In [29]:
# Write the transformed frame to disk as a semicolon-separated CSV, without the row index.
dfClean2.to_csv(path_or_buf = 'attemps/tfmclean2.csv', index = False, sep = ';')

<div align="center">
    <h2>
        TF cleaned upsampled 2
    </h2>
</div>

## Make a copy of every column

In [30]:
# Take a separate copy of every cleaned, upsampled column for the "2" variant of the transformation.
(
    dfCleanUpsampleHeadlineText2,
    dfCleanUpsampleAuthor2,
    dfCleanUpsampleNamedEntitiesClaim2,
    dfCleanUpsampleNamedEntitiesArticle2,
    dfCleanUpsampleKeywords2,
    dfCleanUpsampleSource2,
    dfCleanUpsampleRatingName2,
) = (
    column.copy()
    for column in (
        dfCleanUpsampleHeadlineText,
        dfCleanUpsampleAuthor,
        dfCleanUpsampleNamedEntitiesClaim,
        dfCleanUpsampleNamedEntitiesArticle,
        dfCleanUpsampleKeywords,
        dfCleanUpsampleSource,
        dfCleanUpsampleRatingName,
    )
)

## Transform data

In [31]:
# NOTE: this cell overwrites dfCleanUpsampleHeadlineText2, dfCleanUpsampleSource2 and
# dfCleanUpsampleRatingName2 with their encoded versions, so re-run the copy cell before
# executing it a second time.

# Encode the headlines with tfidfVectorizer.
headlineTfidf = tfidfVectorizer.fit_transform(dfCleanUpsampleHeadlineText2)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old name otherwise.
# The vocabulary is read once and reused for both frames below.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    headlineTerms = tfidfVectorizer.get_feature_names_out()
else:
    headlineTerms = tfidfVectorizer.get_feature_names()

dfCleanUpsampleHeadlineText2 = pd.DataFrame(headlineTfidf.toarray(), columns = headlineTerms)
# Rescale the term columns with standardScaler, keeping the term names as column labels.
dfCleanUpsampleHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleHeadlineText2),
                                            columns = headlineTerms)

# One-hot encode the source; the index is reset so rows line up positionally in the concat below.
dfCleanUpsampleSource2 = pd.get_dummies(dfCleanUpsampleSource2,
                                        columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the target labels with classLabelEncoder.
dfCleanUpsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName2),
                                          columns = ['RatingName'])

# Assemble features and target side by side.
dfCleanUpsample2 = pd.concat([dfCleanUpsampleHeadlineText2, dfCleanUpsampleSource2, dfCleanUpsampleRatingName2],
                             axis = 1)

display(dfCleanUpsample2)

Unnamed: 0,05,1000,10000,100000,100k,101st,102000,105,106000,11yearold,...,zipper,zippered,zoning,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,1,0,0,0
1,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,0,1,0,0
2,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,1,0,0,0
3,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,1,0,0,0
4,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,1,0,0,0
972,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,1,0,0,0
973,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,0,1,0,1
974,-0.062505,-0.077751,-0.032026,-0.066648,-0.063779,-0.032026,-0.032026,-0.055527,-0.032026,-0.055527,...,-0.032026,-0.032026,-0.045314,-0.032026,0,0,0,1,0,0


## Saving the transformed data

In [32]:
# Write the transformed frame to disk as a semicolon-separated CSV, without the row index.
dfCleanUpsample2.to_csv(path_or_buf = 'attemps/tfmcleanupsample2.csv', index = False, sep = ';')

<div align="center">
    <h2>
        TF cleaned downsampled 2
    </h2>
</div>

## Make a copy of every column

In [33]:
# Take a separate copy of every cleaned, downsampled column for the "2" variant of the transformation.
(
    dfCleanDownsampleHeadlineText2,
    dfCleanDownsampleAuthor2,
    dfCleanDownsampleNamedEntitiesClaim2,
    dfCleanDownsampleNamedEntitiesArticle2,
    dfCleanDownsampleKeywords2,
    dfCleanDownsampleSource2,
    dfCleanDownsampleRatingName2,
) = (
    column.copy()
    for column in (
        dfCleanDownsampleHeadlineText,
        dfCleanDownsampleAuthor,
        dfCleanDownsampleNamedEntitiesClaim,
        dfCleanDownsampleNamedEntitiesArticle,
        dfCleanDownsampleKeywords,
        dfCleanDownsampleSource,
        dfCleanDownsampleRatingName,
    )
)

## Transform data

In [34]:
# NOTE: this cell overwrites dfCleanDownsampleHeadlineText2, dfCleanDownsampleSource2 and
# dfCleanDownsampleRatingName2 with their encoded versions, so re-run the copy cell before
# executing it a second time.

# Encode the headlines with tfidfVectorizer.
headlineTfidf = tfidfVectorizer.fit_transform(dfCleanDownsampleHeadlineText2)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when it exists and fall back to the old name otherwise.
# The vocabulary is read once and reused for both frames below.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    headlineTerms = tfidfVectorizer.get_feature_names_out()
else:
    headlineTerms = tfidfVectorizer.get_feature_names()

dfCleanDownsampleHeadlineText2 = pd.DataFrame(headlineTfidf.toarray(), columns = headlineTerms)
# Rescale the term columns with standardScaler, keeping the term names as column labels.
dfCleanDownsampleHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleHeadlineText2),
                                              columns = headlineTerms)

# One-hot encode the source; the index is reset so rows line up positionally in the concat below.
dfCleanDownsampleSource2 = pd.get_dummies(dfCleanDownsampleSource2,
                                          columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the target labels with classLabelEncoder.
dfCleanDownsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName2),
                                            columns = ['RatingName'])

# Assemble features and target side by side.
dfCleanDownsample2 = pd.concat([dfCleanDownsampleHeadlineText2, dfCleanDownsampleSource2, dfCleanDownsampleRatingName2],
                               axis = 1)

display(dfCleanDownsample2)

Unnamed: 0,05,1000,10000,100000,100k,101st,102000,105,106000,11yearold,...,zipper,zippered,zoning,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,1,0,0,1
1,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,0,1,0,1
2,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,0,1,0,1
3,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,1,0,0,1
4,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,1,0,0,0
700,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,1,0,0,0
701,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,1,0,0,0
702,-0.051466,-0.064516,-0.037716,-0.049993,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,-0.037716,...,-0.037716,-0.037716,-0.037716,-0.037716,0,0,1,0,0,0


## Saving the transformed data

In [35]:
dfCleanDownsample2.to_csv('attemps/tfmcleandownsample2.csv', sep = ';', index = False)