<div align="center"><h1> Projet Data Science </h1></div>
<div align="center"><h2> Classification d'assertions selon leurs valeurs de véracité (automatic fact-checking) </h2></div>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Executing the basics
    </h1>
</div>

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import inflect
import re
#import contractions

#from functools import reduce
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Cleaning the text
def remove_non_ascii(words):
    """Return the tokens of ``words`` with every non-ASCII character removed.

    Each token is NFKD-normalised before being encoded to ASCII with the
    ``'ignore'`` error handler, so characters with a compatibility
    decomposition keep their ASCII base (e.g. an accented letter keeps the
    plain letter) and everything else is dropped. A token made only of
    non-ASCII characters becomes an empty string; it is not removed here.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list with one (possibly empty) ASCII-only string per input token.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list holding the lower-cased version of every token.

    Args:
        words: iterable of string tokens.

    Returns:
        List of lower-cased tokens, in the same order as the input.
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens left empty.

    Any character that is neither a word character (``\\w``) nor whitespace
    is deleted from each token.

    Args:
        words: iterable of string tokens.

    Returns:
        List of the cleaned tokens that are still non-empty, in input order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out every all-digit token using ``inflect``.

    Tokens for which ``str.isdigit()`` is true are replaced by the result of
    ``inflect.engine().number_to_words``; every other token (including mixed
    ones such as ``'15th'``) is kept unchanged.

    Args:
        words: iterable of string tokens.

    Returns:
        List with the same number of tokens as the input.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Drop every token that is an English NLTK stop word.

    The comparison is case-sensitive and exact, so callers are expected to
    lower-case the tokens first (``normalize`` does).

    Args:
        words: iterable of string tokens.

    Returns:
        List of the tokens that are not in NLTK's English stop-word list,
        in input order.
    """
    # Load the stop-word list once instead of once per token, and use a set
    # so each membership test is O(1) rather than a scan of the whole list.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def separate_letter_number(words):
    """Split every token into its runs of digits and runs of non-digits.

    For example ``'15th'`` becomes ``'15'`` and ``'th'``. The pieces of all
    tokens are returned as one flat list, in order.

    Args:
        words: iterable of string tokens.

    Returns:
        Flat list of the digit / non-digit pieces; an empty list for empty
        input (empty tokens contribute no piece).
    """
    new_words = []
    for word in words:
        # Flatten while collecting: this avoids functools.reduce, which is
        # not imported in this notebook and fails on an empty list when
        # called without an initial value.
        new_words.extend(re.findall(r'\d+|\D+', word))
    return new_words

def replace_contractions(text):
    """Placeholder for contraction expansion; currently returns ``text`` unchanged.

    The ``contractions`` package import is commented out in this notebook,
    so no expansion is performed. Returning the input (instead of ``None``)
    keeps the function safe to insert in a text pipeline.

    Args:
        text: the raw text.

    Returns:
        ``text``, unmodified.
    """
    # TODO: expand contractions once the `contractions` dependency is enabled.
    return text

def normalize(words):
    """Run the token-cleaning pipeline over a list of tokens.

    Steps, in order: strip non-ASCII characters, lower-case, spell out
    all-digit tokens, remove punctuation, remove English stop words.
    ``separate_letter_number`` is currently not part of the pipeline.

    Args:
        words: list of string tokens.

    Returns:
        The list of cleaned tokens.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        replace_numbers,
        remove_punctuation,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

def clean_text(text):
    """Tokenize ``text``, normalize the tokens and join them back with spaces.

    Contraction expansion (``contractions.fix``) is currently disabled.

    Args:
        text: raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces, without leading or
        trailing whitespace.
    """
    cleaned_tokens = normalize(word_tokenize(text))
    return " ".join(cleaned_tokens).strip()

# Up-sampling & Down-sampling
def sampling(df_func, *args, **kwargs):
    """Balance the most and least frequent ``RatingName`` classes by resampling.

    The majority class is the first entry of ``RatingName.value_counts()``
    and the minority class is the last one. Only rows of these two classes
    appear in the result.

    Args:
        df_func: DataFrame with a ``RatingName`` column.
        *args: unused; accepted for backward compatibility.
        **kwargs:
            sample: ``'up'`` to draw the minority class with replacement up
                to the majority size, ``'down'`` to draw the majority class
                without replacement down to the minority size.
            random_state: seed used for the resampling and for the shuffle
                of the upsampled frame (default ``123``).

    Returns:
        The balanced DataFrame with a fresh 0..n-1 index (shuffled for
        ``'up'``; sampled majority rows followed by the minority rows for
        ``'down'``), or ``None`` after printing a hint when ``sample`` is
        neither ``'up'`` nor ``'down'``.
    """
    sampling_type = kwargs.get("sample", None)
    random_state = kwargs.get("random_state", 123)

    if sampling_type not in ['up', 'down']:
        print('Please select somthing in [\'up\', \'down\']')
        return None

    # Count the classes once; value_counts() sorts by descending frequency.
    class_counts = df_func['RatingName'].value_counts()
    majority_size = int(class_counts.iloc[0])
    minority_size = int(class_counts.iloc[-1])
    majority = df_func[df_func['RatingName'] == class_counts.index[0]].reset_index(drop = True)
    minority = df_func[df_func['RatingName'] == class_counts.index[-1]].reset_index(drop = True)

    if sampling_type == 'up':
        upsampled = resample(minority, replace = True, n_samples = majority_size, random_state = random_state)
        # Seed the shuffle as well, so the whole operation is reproducible.
        shuffled = pd.concat([majority, upsampled]).sample(frac = 1, random_state = random_state)
        return shuffled.reset_index(drop = True)

    downsampled = resample(majority, replace = False, n_samples = minority_size, random_state = random_state)
    return pd.concat([downsampled, minority]).reset_index(drop = True)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Reading data
    </h1>
</div>

In [2]:
# Load the claims dataset; the file uses ';' as the field separator.
df = pd.read_csv('datasets/generated.csv', sep = ';')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Pre-processing
    </h1>
</div>

## Remove unnecessary columns

In [3]:
# Discard the columns that are not used in the rest of the notebook.
df = df.drop(columns = ['ID', 'Date', 'TruthRating', 'SourceURL', 'Link', 'Language'])
display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,Malia Obama cashed a $1.2 million tax refund c...,OTHER,Unknown,Did Malia Obama Cash a $1.2 Million Check?,"Facebook,Fan Fiction,Junk News,Malia Obama,Sno...",Malia Obama,,truthorfiction
1,High diver is saved from jumping into a draine...,OTHER,Unknown,High Diver Saved By Cross,"Cincinnati Post,Islam,Scripture lesson,Univers...",shadow on the wall,ASP Article,snopes
2,'And the revenue generated by drilling off Vir...,MIXTURE,Jim Moran,Moran says drilling off Virginia's coast will ...,"Alaska,American Petroleum Institute,Atlantic O...",,"Energy,State Finances",politifact
3,Health insurance companies pay CEOs $24 millio...,MIXTURE,Health Care for America Now,Health care advocacy group blasts insurers for...,"Aetna,Assurant,Bloomberg News,Cigna,Coventry H...",,"Corporations,Health Care",politifact
4,Ted Cruz said that veterans should start selli...,FALSE,Unknown,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes


## Remove unnecessary rows

In [4]:
# Remove the claims whose RatingName is OTHER or MIXTURE.
df = df[~df.RatingName.isin(['OTHER', 'MIXTURE'])]

display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
4,Ted Cruz said that veterans should start selli...,False,Unknown,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes
7,'The Georgia Lottery Corp. has only once in th...,False,Allie McCullen,Luck runs out for student on lottery claim,"Georgia Lottery,HOPE scholarship,Mega Millions...","Georgia Lottery,HOPE scholarship",Education,politifact
9,Panda Express will be celebrating its 15th ann...,False,Unknown,Panda Express 15th Anniversary Offer,"Los Angeles Times,Panda Express,Panda Inn,Wend...",Panda Express,"ASP Article, something for nothing",snopes
10,'There’s no money' for Planned Parenthood in t...,True,Tom Cole,Is there Planned Parenthood funding in the bil...,"Children’s Health Insurance Program,Fox News S...",Planned Parenthood,"Abortion,Congress,Congressional Rules,Federal ...",politifact
11,"'In 2010 alone, 1,270 infants were reported to...",False,Americans United for Life,"Americans United for Life says 1,270 babies di...","Alan Guttmacher Institute,Americans United for...",,Abortion,politifact


## Replacing "Unknown" & NaN by "Inconnue"

In [5]:
# Replace both the literal 'Unknown' and missing values by 'Inconnue' in
# every column. This is done on the whole frame and re-assigned: the former
# `df[column].replace(..., inplace = True)` is a chained assignment, which
# warns on recent pandas and has no effect under Copy-on-Write, and
# `np.NaN` no longer exists in NumPy 2.
df = df.replace(to_replace = 'Unknown', value = 'Inconnue').fillna('Inconnue')

display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
4,Ted Cruz said that veterans should start selli...,False,Inconnue,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes
7,'The Georgia Lottery Corp. has only once in th...,False,Allie McCullen,Luck runs out for student on lottery claim,"Georgia Lottery,HOPE scholarship,Mega Millions...","Georgia Lottery,HOPE scholarship",Education,politifact
9,Panda Express will be celebrating its 15th ann...,False,Inconnue,Panda Express 15th Anniversary Offer,"Los Angeles Times,Panda Express,Panda Inn,Wend...",Panda Express,"ASP Article, something for nothing",snopes
10,'There’s no money' for Planned Parenthood in t...,True,Tom Cole,Is there Planned Parenthood funding in the bil...,"Children’s Health Insurance Program,Fox News S...",Planned Parenthood,"Abortion,Congress,Congressional Rules,Federal ...",politifact
11,"'In 2010 alone, 1,270 infants were reported to...",False,Americans United for Life,"Americans United for Life says 1,270 babies di...","Alan Guttmacher Institute,Americans United for...",Inconnue,Abortion,politifact


## Cleaning

In [6]:
# Build a cleaned copy of df: every cell of every column goes through
# clean_text, while df itself is left untouched.
dfClean = df.copy()
for column_name in dfClean.columns:
    dfClean[column_name] = dfClean[column_name].apply(clean_text)

display(dfClean.head(), dfClean.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
4,ted cruz said veterans start selling cookies o...,False,inconnue,ted cruz vets sell cookies funding like girl s...,david nelson james morrison john scalzi republ...,ted cruz,asp article necessarily news,snopes
7,georgia lottery corp past sixteen years paid a...,False,allie mccullen,luck runs student lottery claim,georgia lottery hope scholarship mega millions...,georgia lottery hope scholarship,education,politifact
9,panda express celebrating 15th anniversary off...,False,inconnue,panda express 15th anniversary offer,los angeles times panda express panda inn wend...,panda express,asp article something nothing,snopes
10,money planned parenthood bill would keep gover...,True,tom cole,planned parenthood funding bill stops governme...,children health insurance program fox news sun...,planned parenthood,abortion congress congressional rules federal ...,politifact
11,two thousand and ten alone 1270 infants report...,False,americans united life,americans united life says 1270 babies died at...,alan guttmacher institute americans united lif...,inconnue,abortion,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,119,119,119,119,119,119,119,119
unique,119,2,44,119,119,76,98,5
top,parking lot carjackers placing flyers one hund...,false,inconnue,mail online claim 400000 poor whites south afr...,two thousand and seven general election art lo...,inconnue,inconnue,snopes
freq,1,86,68,1,1,42,9,60


# Upsampling

In [7]:
# Balance the cleaned frame by upsampling (see sampling with sample = 'up').
dfCleanUpsample = sampling(dfClean, sample = 'up')
display(dfCleanUpsample.head(), dfCleanUpsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,democrats said even one hearing stimulus bill ...,True,joe barton,barton correct congress getting little review ...,ritchie henry waxman house energy commerce com...,inconnue,economy stimulus,politifact
1,based information learned library books school...,True,inconnue,boy harnessed wind,african leadership academy daily show daily sh...,william kamkwamba,william kamkwamba,snopes
2,welfare recipients receive free cars rumor,False,inconnue,welfare recipients receive free cars rumorfiction,facebook rich buhler white house fake news jquery,inconnue,inconnue,truthorfiction
3,says maryellen shaughnessy accepted campaign c...,False,ohio republican party,ohio republican party flier tries tie maryelle...,cuyahoga county franklin county jimmy dimora j...,jimmy dimora maryellen shaughnessy,negative campaigning,politifact
4,fourteen years since president vice president ...,True,candy crowley,cnn candy crowley says unusual president famil...,two thousand summer games al gore athens austr...,inconnue,sports,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,172,172,172,172,172,172,172,172
unique,116,2,42,116,116,74,96,5
top,says people signed recall petitions wisconsin ...,false,inconnue,wisconsin tea party leader says democrathired ...,democrat eagle river jim holperin politifact r...,inconnue,inconnue,snopes
freq,6,86,91,6,6,68,14,80


# Downsampling

In [8]:
# Balance the cleaned frame by downsampling (see sampling with sample = 'down').
dfCleanDownsample = sampling(dfClean, sample = 'down')
display(dfCleanDownsample.head(), dfCleanDownsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,came office vaccination rate texas sixtyfive p...,False,rick perry,rick perry says texas vaccination rate rose si...,dtap politifact rick perry texas department st...,inconnue,corrections updates health care public health,politifact
1,says approximately seventy percent jobs create...,False,rick perry,perry claims texas accounted seventy percent c...,alaska colorado district columbia economic pol...,inconnue,economy elections job accomplishments workers,politifact
2,says new estimates congressional budget office...,False,frank lobiondo,frank lobiondo says health care reform law cos...,congressional budget office frank lobiondo gal...,congressional budget office,health care,politifact
3,video shows dutch politician tunahan kuzu putt...,False,inconnue,dutch politician put grilled cheese sandwich p...,tunahan kuzu twitter grilled cheese sandwich i...,tunahan kuzu,grilled cheese tunahan kuzu,snopes
4,providence port 200 miles closer europe easter...,False,james bennett,incoming economic development director says pr...,angel taveras bureau transportation statistics...,inconnue,economy infrastructure trade transportation,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,66,66,66,66,66,66,66,66
unique,66,2,32,66,66,42,55,5
top,legislation currently proposal would result su...,false,inconnue,fox news discuss war christmas networks covere...,two thousand and seven general election art lo...,inconnue,inconnue,politifact
freq,1,33,33,1,1,25,7,32


At this point we have df, dfClean, dfCleanUpsample and dfCleanDownsample.

<div class="alert alert-block alert-info" align="center">
    <h1>
        Encodage
    </h1>
</div>

In [9]:
# Encoder used to turn a column of labels into integer codes.
classLabelEncoder = LabelEncoder()

# Text vectorizers.
countVectorizer = CountVectorizer()
tfidfVectorizer = TfidfVectorizer()

# TF-IDF transformers: default settings (1) and with IDF weighting disabled (2).
tfidfTransformer1 = TfidfTransformer()
tfidfTransformer2 = TfidfTransformer(use_idf = False)

# Feature scalers.
minMaxScaler = MinMaxScaler()
standardScaler = StandardScaler()

# How to use
#df = pd.DataFrame(standardScaler.fit_transform(df), columns = ['Name'])

# Splitting the dataframe

In [10]:
# For each frame: Headline and Text are merged into one free-text Series,
# and the columns listed here are extracted as-is, in this order.
_carried_columns = ('RatingName', 'Author', 'NamedEntitiesClaim', 'NamedEntitiesArticle', 'Keywords', 'Source')

# df
dfHeadlineText = df["Headline"] + " " + df["Text"]
(dfRatingName, dfAuthor, dfNamedEntitiesClaim,
 dfNamedEntitiesArticle, dfKeywords, dfSource) = (df[name] for name in _carried_columns)

# dfClean
dfCleanHeadlineText = dfClean["Headline"] + " " + dfClean["Text"]
(dfCleanRatingName, dfCleanAuthor, dfCleanNamedEntitiesClaim,
 dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource) = (dfClean[name] for name in _carried_columns)

# dfCleanUpsample
dfCleanUpsampleHeadlineText = dfCleanUpsample["Headline"] + " " + dfCleanUpsample["Text"]
(dfCleanUpsampleRatingName, dfCleanUpsampleAuthor, dfCleanUpsampleNamedEntitiesClaim,
 dfCleanUpsampleNamedEntitiesArticle, dfCleanUpsampleKeywords, dfCleanUpsampleSource) = (
    dfCleanUpsample[name] for name in _carried_columns
)

# dfCleanDownsample
dfCleanDownsampleHeadlineText = dfCleanDownsample["Headline"] + " " + dfCleanDownsample["Text"]
(dfCleanDownsampleRatingName, dfCleanDownsampleAuthor, dfCleanDownsampleNamedEntitiesClaim,
 dfCleanDownsampleNamedEntitiesArticle, dfCleanDownsampleKeywords, dfCleanDownsampleSource) = (
    dfCleanDownsample[name] for name in _carried_columns
)

<div align="center">
    <h2>
        TF 1
    </h2>
</div>

## Make a copy of every column

In [11]:
# Take an independent copy of every column Series of df for attempt 1.
(dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1, dfNamedEntitiesArticle1,
 dfKeywords1, dfSource1, dfRatingName1) = (
    series.copy()
    for series in (dfHeadlineText, dfAuthor, dfNamedEntitiesClaim, dfNamedEntitiesArticle,
                   dfKeywords, dfSource, dfRatingName)
)

## Transform data

In [12]:
# Every feature column is label-encoded (integer code per distinct value)
# and then standardised; the target RatingName is label-encoded only.
# The shared encoder and scaler are refitted for each column, in this order.
_scaled = []
for _name, _column in [
    ('HeadlineText', dfHeadlineText1),
    ('Author', dfAuthor1),
    ('NamedEntitiesClaim', dfNamedEntitiesClaim1),
    ('NamedEntitiesArticle', dfNamedEntitiesArticle1),
    ('Keywords', dfKeywords1),
    ('Source', dfSource1),
]:
    _codes = pd.DataFrame(classLabelEncoder.fit_transform(_column), columns = [_name])
    _scaled.append(pd.DataFrame(standardScaler.fit_transform(_codes), columns = [_name]))

(dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1,
 dfNamedEntitiesArticle1, dfKeywords1, dfSource1) = _scaled

dfRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName1), columns = ['RatingName'])

# Features first, target last.
df1 = pd.concat(_scaled + [dfRatingName1], axis = 1)

display(df1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,1.164445,-0.020228,0.844223,1.477067,-1.433026,0.664691,0
1,0.291111,-2.312701,1.135334,-0.481540,-0.296403,-0.747778,0
2,0.611334,-0.020228,1.310001,0.861505,-1.176370,0.664691,0
3,0.087333,2.272245,0.582223,0.917465,-1.029709,-0.747778,1
4,-1.572001,-2.198077,-0.669556,-0.201739,-1.066374,-0.747778,0
...,...,...,...,...,...,...,...
114,1.397334,-0.020228,1.280890,-0.201739,1.463530,0.664691,0
115,-1.397334,0.438267,0.902445,-0.201739,-0.333069,-0.747778,1
116,1.018889,-1.739582,1.018889,-0.425580,-0.076412,-0.747778,0
117,1.630223,-0.020228,1.688445,1.756868,0.253576,2.077160,1


## Saving the transformed data

In [13]:
# Write df1 to disk as a ';'-separated CSV, without the index column.
df1.to_csv('attemps/tf1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned 1
    </h2>
</div>

## Make a copy of every column

In [14]:
# Take an independent copy of every column Series of dfClean for attempt 1.
(dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1, dfCleanNamedEntitiesArticle1,
 dfCleanKeywords1, dfCleanSource1, dfCleanRatingName1) = (
    series.copy()
    for series in (dfCleanHeadlineText, dfCleanAuthor, dfCleanNamedEntitiesClaim,
                   dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource, dfCleanRatingName)
)

## Transform data

In [15]:
# Every feature column is label-encoded (integer code per distinct value)
# and then standardised; the target RatingName is label-encoded only.
# The shared encoder and scaler are refitted for each column, in this order.
_scaled = []
for _name, _column in [
    ('HeadlineText', dfCleanHeadlineText1),
    ('Author', dfCleanAuthor1),
    ('NamedEntitiesClaim', dfCleanNamedEntitiesClaim1),
    ('NamedEntitiesArticle', dfCleanNamedEntitiesArticle1),
    ('Keywords', dfCleanKeywords1),
    ('Source', dfCleanSource1),
]:
    _codes = pd.DataFrame(classLabelEncoder.fit_transform(_column), columns = [_name])
    _scaled.append(pd.DataFrame(standardScaler.fit_transform(_codes), columns = [_name]))

(dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1,
 dfCleanNamedEntitiesArticle1, dfCleanKeywords1, dfCleanSource1) = _scaled

dfCleanRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName1), columns = ['RatingName'])

# Features first, target last.
dfClean1 = pd.concat(_scaled + [dfCleanRatingName1], axis = 1)

display(dfClean1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,1.280890,-0.020228,0.407556,1.708674,-1.094474,0.664691,0
1,0.058222,-2.312701,0.756889,-0.377652,0.190584,-0.747778,0
2,0.378445,-0.020228,0.931556,0.975640,-0.990280,0.664691,0
3,0.582223,2.272245,0.145556,1.088415,-1.650175,-0.747778,1
4,-1.630223,-2.198077,-1.135334,-0.095716,-1.684906,-0.747778,0
...,...,...,...,...,...,...,...
114,1.426445,-0.020228,0.902445,-0.095716,0.537897,0.664691,0
115,-1.484667,0.438267,1.222667,-0.095716,0.155853,-0.747778,1
116,1.106223,-1.739582,0.611334,-0.208490,0.433704,-0.747778,0
117,-1.688445,-0.020228,1.717556,2.103384,1.024136,2.077160,1


## Saving the transformed data

In [16]:
# Write dfClean1 to disk as a ';'-separated CSV, without the index column.
dfClean1.to_csv('attemps/tfclean1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned upsampled 1
    </h2>
</div>

## Make a copy of every column

In [17]:
# Take an independent copy of every column Series of dfCleanUpsample for attempt 1.
(dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1, dfCleanUpsampleNamedEntitiesClaim1,
 dfCleanUpsampleNamedEntitiesArticle1, dfCleanUpsampleKeywords1, dfCleanUpsampleSource1,
 dfCleanUpsampleRatingName1) = (
    series.copy()
    for series in (dfCleanUpsampleHeadlineText, dfCleanUpsampleAuthor, dfCleanUpsampleNamedEntitiesClaim,
                   dfCleanUpsampleNamedEntitiesArticle, dfCleanUpsampleKeywords, dfCleanUpsampleSource,
                   dfCleanUpsampleRatingName)
)

## Transform data

In [18]:
# Every feature column is label-encoded (integer code per distinct value)
# and then standardised; the target RatingName is label-encoded only.
# The shared encoder and scaler are refitted for each column, in this order.
_scaled = []
for _name, _column in [
    ('HeadlineText', dfCleanUpsampleHeadlineText1),
    ('Author', dfCleanUpsampleAuthor1),
    ('NamedEntitiesClaim', dfCleanUpsampleNamedEntitiesClaim1),
    ('NamedEntitiesArticle', dfCleanUpsampleNamedEntitiesArticle1),
    ('Keywords', dfCleanUpsampleKeywords1),
    ('Source', dfCleanUpsampleSource1),
]:
    _codes = pd.DataFrame(classLabelEncoder.fit_transform(_column), columns = [_name])
    _scaled.append(pd.DataFrame(standardScaler.fit_transform(_codes), columns = [_name]))

(dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1, dfCleanUpsampleNamedEntitiesClaim1,
 dfCleanUpsampleNamedEntitiesArticle1, dfCleanUpsampleKeywords1, dfCleanUpsampleSource1) = _scaled

dfCleanUpsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName1), columns = ['RatingName'])

# Features first, target last.
dfCleanUpsample1 = pd.concat(_scaled + [dfCleanUpsampleRatingName1], axis = 1)

display(dfCleanUpsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,-1.363832,0.471882,1.223262,-0.082006,0.133010,-0.711428,1
1,-1.277547,-0.020019,-1.305597,2.163273,1.543511,0.728167,1
2,1.598606,-0.020019,0.657988,-0.082006,0.958669,2.167763,0
3,0.419383,1.332710,0.360475,0.263422,1.096279,-0.711428,0
4,-1.047455,-1.372749,1.639781,-0.082006,1.302694,-0.711428,1
...,...,...,...,...,...,...,...
167,0.649476,-1.741675,1.461273,1.299704,0.408229,-0.711428,0
168,0.965852,1.947587,0.062962,1.357275,-0.589442,-0.711428,1
169,-0.932409,-0.757872,1.431521,1.299704,-0.486235,-0.711428,1
170,-1.565163,-0.020019,-0.978333,-1.751571,-1.518309,0.728167,0


## Saving the transformed data

In [19]:
# Write dfCleanUpsample1 to disk as a ';'-separated CSV, without the index column.
dfCleanUpsample1.to_csv('attemps/tfcleanupsample1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned downsampled 1
    </h2>
</div>

## Make a copy of every column

In [20]:
# Take an independent copy of every column Series of dfCleanDownsample for attempt 1.
(dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1, dfCleanDownsampleNamedEntitiesClaim1,
 dfCleanDownsampleNamedEntitiesArticle1, dfCleanDownsampleKeywords1, dfCleanDownsampleSource1,
 dfCleanDownsampleRatingName1) = (
    series.copy()
    for series in (dfCleanDownsampleHeadlineText, dfCleanDownsampleAuthor, dfCleanDownsampleNamedEntitiesClaim,
                   dfCleanDownsampleNamedEntitiesArticle, dfCleanDownsampleKeywords, dfCleanDownsampleSource,
                   dfCleanDownsampleRatingName)
)

## Transform data

In [21]:
# Every feature column is label-encoded (integer code per distinct value)
# and then standardised; the target RatingName is label-encoded only.
# The shared encoder and scaler are refitted for each column, in this order.
_scaled = []
for _name, _column in [
    ('HeadlineText', dfCleanDownsampleHeadlineText1),
    ('Author', dfCleanDownsampleAuthor1),
    ('NamedEntitiesClaim', dfCleanDownsampleNamedEntitiesClaim1),
    ('NamedEntitiesArticle', dfCleanDownsampleNamedEntitiesArticle1),
    ('Keywords', dfCleanDownsampleKeywords1),
    ('Source', dfCleanDownsampleSource1),
]:
    _codes = pd.DataFrame(classLabelEncoder.fit_transform(_column), columns = [_name])
    _scaled.append(pd.DataFrame(standardScaler.fit_transform(_codes), columns = [_name]))

(dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1, dfCleanDownsampleNamedEntitiesClaim1,
 dfCleanDownsampleNamedEntitiesArticle1, dfCleanDownsampleKeywords1, dfCleanDownsampleSource1) = _scaled

dfCleanDownsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName1), columns = ['RatingName'])

# Features first, target last.
dfCleanDownsample1 = pd.concat(_scaled + [dfCleanDownsampleRatingName1], axis = 1)

display(dfCleanDownsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,0.813632,1.468696,0.446186,-0.098447,-0.419071,-0.712677,0
1,0.393693,1.468696,-1.286064,-0.098447,-0.041907,-0.712677,0
2,-0.393693,-0.494141,0.183723,-1.026662,0.901003,-0.712677,0
3,-0.813632,-0.041178,1.338557,1.757982,0.775282,0.670755,0
4,-0.026246,0.260797,-1.076094,-0.098447,0.020954,-0.712677,0
...,...,...,...,...,...,...,...
61,-0.183723,-0.041178,-1.023602,-0.098447,-1.236260,0.670755,1
62,-0.971110,-2.155003,-1.128587,-0.098447,-0.167628,-0.712677,1
63,-1.391049,0.562771,1.233572,-0.098447,0.083814,-0.712677,1
64,-1.706003,-0.041178,1.706003,2.067387,1.026724,2.054187,1


## Saving the transformed data

In [22]:
# Write dfCleanDownsample1 to disk as a ';'-separated CSV, without the index column.
dfCleanDownsample1.to_csv('attemps/tfcleandownsample1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF2
    </h2>
</div>

## Make a copy of every column

In [23]:
# Take an independent copy of every column Series of df for attempt 2.
(dfHeadlineText2, dfAuthor2, dfNamedEntitiesClaim2, dfNamedEntitiesArticle2,
 dfKeywords2, dfSource2, dfRatingName2) = (
    series.copy()
    for series in (dfHeadlineText, dfAuthor, dfNamedEntitiesClaim, dfNamedEntitiesArticle,
                   dfKeywords, dfSource, dfRatingName)
)

## Transform data

In [24]:
# TF-IDF matrix of the merged headline + text, converted to a dense array.
_tfidf_values = tfidfVectorizer.fit_transform(dfHeadlineText2).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    _tfidf_columns = list(tfidfVectorizer.get_feature_names_out())
else:
    _tfidf_columns = tfidfVectorizer.get_feature_names()

dfHeadlineText2 = pd.DataFrame(_tfidf_values, columns = _tfidf_columns)
dfHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText2), columns = _tfidf_columns)

# One-hot encode the source; the index is reset so that it lines up with
# the freshly built (0..n-1 indexed) frames in the concat below.
dfSource2 = pd.get_dummies(dfSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

dfRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName2), columns = ['RatingName'])

# Features first, target last.
df2 = pd.concat([dfHeadlineText2, dfSource2, dfRatingName2], axis = 1)

display(df2)

Unnamed: 0,000,100,106,11,120,14,15th,16,19,1940s,...,york,you,your,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,0,1,0,0
1,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,10.862780,-0.092057,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,1,0,0,0
2,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,10.862780,-0.092057,10.862780,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,0,1,0,0
3,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,1,0,0,1
4,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,0,1,0,0
115,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,1,0,0,1
116,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,5.095025,-0.130079,-0.092057,-0.092057,0,0,1,0,0,0
117,-0.205737,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125173,-0.130079,-0.092057,-0.092057,0,0,0,0,1,1


## Saving the transformed data

In [25]:
# Write df2 to disk as a ';'-separated CSV, without the index column.
df2.to_csv('attemps/tf2.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned 2
    </h2>
</div>

## Make a copy of every column

In [26]:
# Take an independent copy of every column Series of dfClean for attempt 2.
(dfCleanHeadlineText2, dfCleanAuthor2, dfCleanNamedEntitiesClaim2, dfCleanNamedEntitiesArticle2,
 dfCleanKeywords2, dfCleanSource2, dfCleanRatingName2) = (
    series.copy()
    for series in (dfCleanHeadlineText, dfCleanAuthor, dfCleanNamedEntitiesClaim,
                   dfCleanNamedEntitiesArticle, dfCleanKeywords, dfCleanSource, dfCleanRatingName)
)

## Transform data

In [27]:
# TF-IDF matrix of the cleaned headline + text, converted to a dense array.
_tfidf_values = tfidfVectorizer.fit_transform(dfCleanHeadlineText2).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    _tfidf_columns = list(tfidfVectorizer.get_feature_names_out())
else:
    _tfidf_columns = tfidfVectorizer.get_feature_names()

dfCleanHeadlineText2 = pd.DataFrame(_tfidf_values, columns = _tfidf_columns)
dfCleanHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanHeadlineText2), columns = _tfidf_columns)

# One-hot encode the source; the index is reset so that it lines up with
# the freshly built (0..n-1 indexed) frames in the concat below.
dfCleanSource2 = pd.get_dummies(dfCleanSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

dfCleanRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName2), columns = ['RatingName'])

# Features first, target last.
dfClean2 = pd.concat([dfCleanHeadlineText2, dfCleanSource2, dfCleanRatingName2], axis = 1)

display(dfClean2)

Unnamed: 0,106000,1270,15th,1729000000,1940s,200,20072008,30000,400000,5000,...,year,years,york,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,-0.175499,-0.124602,-0.092057,0,0,0,1,0,0
1,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,4.487271,-0.124602,-0.092057,0,0,1,0,0,0
2,-0.092057,-0.092057,10.862780,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,-0.175499,-0.124602,-0.092057,0,0,0,1,0,0
3,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,-0.175499,-0.124602,-0.092057,0,0,1,0,0,1
4,-0.092057,10.862780,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,10.862780,-0.175499,-0.124602,-0.092057,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,-0.175499,-0.124602,-0.092057,0,0,0,1,0,0
115,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,-0.175499,-0.124602,-0.092057,0,0,1,0,0,1
116,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,-0.175499,4.952727,-0.092057,0,0,1,0,0,0
117,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.092057,-0.175499,-0.124602,-0.092057,0,0,0,0,1,1


## Saving the transformed data

In [28]:
# Write the assembled frame to disk as a ';'-separated CSV, leaving out the row index.
dfClean2.to_csv(
    path_or_buf = 'attemps/tfclean2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned upsampled 2
    </h2>
</div>

## Make a copy of every column

In [29]:
# Take an independent copy of each cleaned, upsampled column; the following cell
# rebinds the `...2` names while the source objects stay untouched.
(
    dfCleanUpsampleHeadlineText2,
    dfCleanUpsampleAuthor2,
    dfCleanUpsampleNamedEntitiesClaim2,
    dfCleanUpsampleNamedEntitiesArticle2,
    dfCleanUpsampleKeywords2,
    dfCleanUpsampleSource2,
    dfCleanUpsampleRatingName2,
) = (
    upsampledColumn.copy()
    for upsampledColumn in (
        dfCleanUpsampleHeadlineText,
        dfCleanUpsampleAuthor,
        dfCleanUpsampleNamedEntitiesClaim,
        dfCleanUpsampleNamedEntitiesArticle,
        dfCleanUpsampleKeywords,
        dfCleanUpsampleSource,
        dfCleanUpsampleRatingName,
    )
)

## Transform data

In [30]:
# Headline text -> one numeric column per vocabulary term (tfidfVectorizer is
# defined earlier in the notebook; presumably a TfidfVectorizer instance).
cleanUpsampleHeadlineMatrix2 = tfidfVectorizer.fit_transform(dfCleanUpsampleHeadlineText2).toarray()

# Read the vocabulary once, right after fitting. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() whenever the installed version provides it.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    cleanUpsampleHeadlineTerms2 = list(tfidfVectorizer.get_feature_names_out())
else:
    cleanUpsampleHeadlineTerms2 = tfidfVectorizer.get_feature_names()

dfCleanUpsampleHeadlineText2 = pd.DataFrame(cleanUpsampleHeadlineMatrix2, columns = cleanUpsampleHeadlineTerms2)
# Rescale every term column with the shared standardScaler.
dfCleanUpsampleHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleHeadlineText2), columns = cleanUpsampleHeadlineTerms2)

# One-hot encode the source. dtype is pinned to uint8 (the pre-2.0 pandas default)
# because pandas >= 2.0 would otherwise emit booleans, which end up as True/False
# instead of 0/1 in the saved CSV. The index is reset so the rows line up
# positionally with the freshly built frames in the concat below.
dfCleanUpsampleSource2 = pd.get_dummies(dfCleanUpsampleSource2, columns = ['Source'], prefix = 'Source', dtype = np.uint8).reset_index(drop = True)

# Encode the target labels as integers.
dfCleanUpsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName2), columns = ['RatingName'])

# Final layout: term features, then source dummies, then the target column.
dfCleanUpsample2 = pd.concat([dfCleanUpsampleHeadlineText2, dfCleanUpsampleSource2, dfCleanUpsampleRatingName2], axis = 1)

display(dfCleanUpsample2)

Unnamed: 0,1270,15th,1729000000,1940s,200,20072008,30000,400000,5000,70th,...,year,years,york,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,1,0,0,1
1,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,0,1,0,1
2,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,0,0,1,0
3,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,1,0,0,0
4,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,2.706824,-0.12805,-0.076472,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,1,0,0,0
168,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,7.505553,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,1,0,0,1
169,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,1,0,0,1
170,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.076472,-0.108465,-0.076472,-0.076472,-0.133235,...,-0.076472,-0.231640,-0.12805,-0.076472,0,0,0,1,0,0


## Saving the transformed data

In [31]:
# Write the assembled frame to disk as a ';'-separated CSV, leaving out the row index.
dfCleanUpsample2.to_csv(
    path_or_buf = 'attemps/tfcleanupsample2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned downsampled 2
    </h2>
</div>

## Make a copy of every column

In [32]:
# Take an independent copy of each cleaned, downsampled column; the following cell
# rebinds the `...2` names while the source objects stay untouched.
(
    dfCleanDownsampleHeadlineText2,
    dfCleanDownsampleAuthor2,
    dfCleanDownsampleNamedEntitiesClaim2,
    dfCleanDownsampleNamedEntitiesArticle2,
    dfCleanDownsampleKeywords2,
    dfCleanDownsampleSource2,
    dfCleanDownsampleRatingName2,
) = (
    downsampledColumn.copy()
    for downsampledColumn in (
        dfCleanDownsampleHeadlineText,
        dfCleanDownsampleAuthor,
        dfCleanDownsampleNamedEntitiesClaim,
        dfCleanDownsampleNamedEntitiesArticle,
        dfCleanDownsampleKeywords,
        dfCleanDownsampleSource,
        dfCleanDownsampleRatingName,
    )
)

## Transform data

In [33]:
# Headline text -> one numeric column per vocabulary term (tfidfVectorizer is
# defined earlier in the notebook; presumably a TfidfVectorizer instance).
cleanDownsampleHeadlineMatrix2 = tfidfVectorizer.fit_transform(dfCleanDownsampleHeadlineText2).toarray()

# Read the vocabulary once, right after fitting. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() whenever the installed version provides it.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    cleanDownsampleHeadlineTerms2 = list(tfidfVectorizer.get_feature_names_out())
else:
    cleanDownsampleHeadlineTerms2 = tfidfVectorizer.get_feature_names()

dfCleanDownsampleHeadlineText2 = pd.DataFrame(cleanDownsampleHeadlineMatrix2, columns = cleanDownsampleHeadlineTerms2)
# Rescale every term column with the shared standardScaler.
dfCleanDownsampleHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleHeadlineText2), columns = cleanDownsampleHeadlineTerms2)

# One-hot encode the source. dtype is pinned to uint8 (the pre-2.0 pandas default)
# because pandas >= 2.0 would otherwise emit booleans, which end up as True/False
# instead of 0/1 in the saved CSV. The index is reset so the rows line up
# positionally with the freshly built frames in the concat below.
dfCleanDownsampleSource2 = pd.get_dummies(dfCleanDownsampleSource2, columns = ['Source'], prefix = 'Source', dtype = np.uint8).reset_index(drop = True)

# Encode the target labels as integers.
dfCleanDownsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName2), columns = ['RatingName'])

# Not handled yet: the columns below are transformed but are NOT part of the
# concat at the end of this cell.
# NOTE(review): these statements refit the shared classLabelEncoder,
# tfidfVectorizer, standardScaler and minMaxScaler, so after this cell their
# fitted state no longer corresponds to RatingName / HeadlineText.
dfCleanDownsampleAuthor2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleAuthor2), columns = ['Author'])
dfCleanDownsampleAuthor2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleAuthor2), columns = ['Author'])

cleanDownsampleClaimEntityMatrix2 = tfidfVectorizer.fit_transform(dfCleanDownsampleNamedEntitiesClaim2).toarray()
# Same version-tolerant vocabulary lookup as for the headline terms above.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    cleanDownsampleClaimEntityTerms2 = list(tfidfVectorizer.get_feature_names_out())
else:
    cleanDownsampleClaimEntityTerms2 = tfidfVectorizer.get_feature_names()
dfCleanDownsampleNamedEntitiesClaim2 = pd.DataFrame(cleanDownsampleClaimEntityMatrix2, columns = cleanDownsampleClaimEntityTerms2)
dfCleanDownsampleNamedEntitiesClaim2 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleNamedEntitiesClaim2), columns = cleanDownsampleClaimEntityTerms2)

dfCleanDownsampleNamedEntitiesArticle2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])
dfCleanDownsampleNamedEntitiesArticle2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleNamedEntitiesArticle2), columns = ['NamedEntitiesArticle'])

dfCleanDownsampleKeywords2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleKeywords2), columns = ['Keywords'])
dfCleanDownsampleKeywords2 = pd.DataFrame(minMaxScaler.fit_transform(dfCleanDownsampleKeywords2), columns = ['Keywords'])

# Final layout: term features, then source dummies, then the target column.
dfCleanDownsample2 = pd.concat([dfCleanDownsampleHeadlineText2, dfCleanDownsampleSource2, dfCleanDownsampleRatingName2], axis = 1)

display(dfCleanDownsample2)

Unnamed: 0,106000,1729000000,200,20072008,30000,5000,70th,72b,94,991,...,yeah,years,york,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,1,0,0,0
1,-0.124035,-0.124035,-0.124035,8.062258,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,1,0,0,0
2,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,1,0,0,0
3,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,0,1,0,0
4,-0.124035,-0.124035,8.062258,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,0,1,0,1
62,-0.124035,-0.124035,-0.124035,-0.124035,8.062258,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,8.062258,-0.124035,0,0,1,0,0,1
63,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,1,0,0,1
64,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,-0.124035,...,-0.124035,-0.207682,-0.124035,-0.124035,0,0,0,0,1,1


## Saving the transformed data

In [34]:
# Write the assembled frame to disk as a ';'-separated CSV, leaving out the row index.
dfCleanDownsample2.to_csv(
    path_or_buf = 'attemps/tfcleandownsample2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF3
    </h2>
</div>

## Make a copy of every column

In [35]:
# Take an independent copy of each column; the following cell rebinds the
# `...3` names while the source objects stay untouched.
(
    dfHeadlineText3,
    dfAuthor3,
    dfNamedEntitiesClaim3,
    dfNamedEntitiesArticle3,
    dfKeywords3,
    dfSource3,
    dfRatingName3,
) = (
    sourceColumn.copy()
    for sourceColumn in (
        dfHeadlineText,
        dfAuthor,
        dfNamedEntitiesClaim,
        dfNamedEntitiesArticle,
        dfKeywords,
        dfSource,
        dfRatingName,
    )
)

## Transform data

In [36]:
# Headline text -> one numeric column per vocabulary term (tfidfVectorizer is
# defined earlier in the notebook; presumably a TfidfVectorizer instance).
headlineMatrix3 = tfidfVectorizer.fit_transform(dfHeadlineText3).toarray()

# Read the vocabulary once, right after fitting. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() whenever the installed version provides it.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    headlineTerms3 = list(tfidfVectorizer.get_feature_names_out())
else:
    headlineTerms3 = tfidfVectorizer.get_feature_names()

dfHeadlineText3 = pd.DataFrame(headlineMatrix3, columns = headlineTerms3)
# Second pass: the vectoriser output is additionally run through tfidfTransformer1
# (defined earlier in the notebook) before scaling.
# NOTE(review): this re-weights values that already came out of tfidfVectorizer -- confirm this is intended.
dfHeadlineText3 = pd.DataFrame(tfidfTransformer1.fit_transform(dfHeadlineText3).toarray(), columns = headlineTerms3)
# Rescale every term column with the shared standardScaler.
dfHeadlineText3 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText3), columns = headlineTerms3)

# One-hot encode the source. dtype is pinned to uint8 (the pre-2.0 pandas default)
# because pandas >= 2.0 would otherwise emit booleans, which end up as True/False
# instead of 0/1 in the saved CSV. The index is reset so the rows line up
# positionally with the freshly built frames in the concat below.
dfSource3 = pd.get_dummies(dfSource3, columns = ['Source'], prefix = 'Source', dtype = np.uint8).reset_index(drop = True)

# Encode the target labels as integers.
dfRatingName3 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName3), columns = ['RatingName'])

# Final layout: term features, then source dummies, then the target column.
df3 = pd.concat([dfHeadlineText3, dfSource3, dfRatingName3], axis = 1)

display(df3)

Unnamed: 0,000,100,106,11,120,14,15th,16,19,1940s,...,york,you,your,zambian,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,0,1,0,0
1,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,10.862780,-0.092057,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,1,0,0,0
2,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,10.862780,-0.092057,10.862780,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,0,1,0,0
3,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,1,0,0,1
4,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,0,1,0,0
115,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,1,0,0,1
116,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,5.234153,-0.130407,-0.092057,-0.092057,0,0,1,0,0,0
117,-0.205593,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,-0.092057,...,-0.125710,-0.130407,-0.092057,-0.092057,0,0,0,0,1,1


## Saving the transformed data

In [37]:
# Write the assembled frame to disk as a ';'-separated CSV, leaving out the row index.
df3.to_csv(
    path_or_buf = 'attemps/tf3.csv',
    sep = ';',
    index = False,
)