<div align="center"><h1> Data Science Project </h1></div>
<div align="center"><h2> Classification of assertions according to their veracity values (automatic fact-checking) </h2></div>
<h2>Group members</h2>
<ul>
    <li>Meriem AMERAOUI</li>
    <li>Dounia BELABIOD</li>
    <li>Jihene BOUHLEL</li>
    <li>Bahaa Eddine NIL</li>
</ul>

<div class="alert alert-block alert-info" align="center">
    <h1>
        Basics
    </h1>
</div>

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import inflect
import re
import nltk
import contractions

from functools import reduce
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Cleaning the text
def remove_non_ascii(words):
    """Strip every non-ASCII character from each token.

    Each token is NFKD-normalised first, so an accented letter decomposes into
    its base letter plus combining marks; encoding to ASCII with ``'ignore'``
    then discards whatever cannot be represented.

    Args:
        words: iterable of str tokens.

    Returns:
        list of str with one entry per input token. A token made only of
        non-ASCII characters becomes the empty string (it is not removed here).
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list holding the lower-cased form of every token.

    Args:
        words: iterable of str tokens.

    Returns:
        list of str, same length and order as the input.
    """
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Any character that is neither a word character (``\\w``) nor whitespace is
    removed from the token.

    Args:
        words: iterable of str tokens.

    Returns:
        list of the non-empty stripped tokens, in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out purely numeric tokens in words.

    Only tokens for which ``str.isdigit()`` is true are converted (through
    ``inflect``'s ``number_to_words``); every other token is passed through
    unchanged.

    Args:
        words: iterable of str tokens.

    Returns:
        list of str, same length and order as the input.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokens.

    The NLTK stop-word list is loaded once per call and held in a set, instead
    of being re-read and scanned linearly for every single token.

    Args:
        words: iterable of str tokens. They are compared as-is, so they should
            already be lower-cased to match the NLTK list.

    Returns:
        list of the tokens that are not in NLTK's English stop-word list, in
        their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def separate_letter_number(words):
    """Split every token into alternating runs of digits and non-digits.

    For example ``'abc123'`` becomes ``'abc', '123'``. The pieces of all tokens
    are returned as one flat list.

    Args:
        words: iterable of str tokens.

    Returns:
        Flat list of str pieces. An empty input yields an empty list (the
        previous ``reduce``-based flattening raised ``TypeError`` in that case).
    """
    # Raw string: '\d' and '\D' are regex classes, not Python string escapes.
    return [piece for word in words for piece in re.findall(r'\d+|\D+', word)]

def replace_contractions(text):
    """Expand the contractions found in ``text``.

    Delegates to ``contractions.fix``. The function used to be a stub whose
    bare ``return`` always produced ``None``.

    Args:
        text: str to process.

    Returns:
        The string returned by ``contractions.fix(text)``.
    """
    return contractions.fix(text)

def normalize(words):
    """Run the token-cleaning pipeline over a list of tokens.

    The steps run in this fixed order: strip non-ASCII characters, lower-case,
    spell out numeric tokens, remove punctuation, remove English stop words.
    ``separate_letter_number`` is currently disabled and is not part of the
    pipeline.

    Args:
        words: list of str tokens.

    Returns:
        list of cleaned str tokens.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        replace_numbers,
        remove_punctuation,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

def clean_text(text):
    """Tokenize ``text``, normalise its tokens and join them back together.

    The text is split with NLTK's ``word_tokenize``, passed through
    ``normalize`` and re-assembled with single spaces between tokens.
    Contraction expansion, stemming and lemmatization are not applied here;
    the stemmer/lemmatizer objects that used to be built on every call were
    never used and have been removed.

    Args:
        text: str to clean.

    Returns:
        str containing the cleaned tokens separated by single spaces.
    """
    tokens = normalize(word_tokenize(text))
    return " ".join(tokens).strip()

# Up-sampling & Down-sampling
def sampling(df_func, *args, **kwargs):
    """Balance the two extreme ``RatingName`` classes by up- or down-sampling.

    Only the most frequent and the least frequent ``RatingName`` values are
    kept in the result; rows of any class in between are not returned.

    Args:
        df_func: DataFrame with a ``RatingName`` column.
        *args: accepted for backward compatibility; ignored.
        **kwargs:
            sample: ``'up'`` draws the minority class with replacement until it
                matches the majority count, then shuffles the combined rows;
                ``'down'`` draws the majority class without replacement down to
                the minority count.
            random_state: seed used for the resampling and for the shuffle
                after up-sampling (default 123). The shuffle used to be
                unseeded, so up-sampled frames differed between runs.

    Returns:
        The balanced DataFrame with a fresh integer index, or ``None`` (after
        printing a hint) when ``sample`` is neither ``'up'`` nor ``'down'``.
    """
    sampling_type = kwargs.get("sample", None)
    random_state = kwargs.get("random_state", 123)

    if sampling_type not in ('up', 'down'):
        print('Please select somthing in [\'up\', \'down\']')
        return None

    # value_counts() sorts by frequency: first entry = majority, last = minority.
    class_counts = df_func['RatingName'].value_counts()
    majority_label, minority_label = class_counts.index[0], class_counts.index[-1]
    majority_size, minority_size = int(class_counts.iloc[0]), int(class_counts.iloc[-1])

    majority = df_func[df_func['RatingName'] == majority_label].reset_index(drop=True)
    minority = df_func[df_func['RatingName'] == minority_label].reset_index(drop=True)

    if sampling_type == 'up':
        upsampled = resample(minority, replace=True, n_samples=majority_size,
                             random_state=random_state)
        # Seed the shuffle too so the whole up-sampled frame is reproducible.
        return (pd.concat([majority, upsampled])
                .sample(frac=1, random_state=random_state)
                .reset_index(drop=True))

    downsampled = resample(majority, replace=False, n_samples=minority_size,
                           random_state=random_state)
    return pd.concat([downsampled, minority]).reset_index(drop=True)

# Download each required NLTK resource only when it is not already available
# locally. Each pair is (path looked up by nltk.data.find, package to download).
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

<div class="alert alert-block alert-info" align="center">
    <h1>
        Reading data
    </h1>
</div>

In [2]:
# Load the claims dataset; the file uses ';' as its field separator.
df = pd.read_csv('datasets/generated.csv', sep = ';')

<div class="alert alert-block alert-info" align="center">
    <h1>
        Pre-processing
    </h1>
</div>

## Remove unnecessary columns

In [3]:
# Discard the columns that are not used by any later step.
df = df.drop(columns=['ID', 'Date', 'TruthRating', 'SourceURL', 'Link', 'Language'])
display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,Malia Obama cashed a $1.2 million tax refund c...,OTHER,Unknown,Did Malia Obama Cash a $1.2 Million Check?,"Facebook,Fan Fiction,Junk News,Malia Obama,Sno...",Malia Obama,,truthorfiction
1,High diver is saved from jumping into a draine...,OTHER,Unknown,High Diver Saved By Cross,"Cincinnati Post,Islam,Scripture lesson,Univers...",shadow on the wall,ASP Article,snopes
2,'And the revenue generated by drilling off Vir...,MIXTURE,Jim Moran,Moran says drilling off Virginia's coast will ...,"Alaska,American Petroleum Institute,Atlantic O...",,"Energy,State Finances",politifact
3,Health insurance companies pay CEOs $24 millio...,MIXTURE,Health Care for America Now,Health care advocacy group blasts insurers for...,"Aetna,Assurant,Bloomberg News,Cigna,Coventry H...",,"Corporations,Health Care",politifact
4,Ted Cruz said that veterans should start selli...,FALSE,Unknown,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes


## Remove unnecessary rows

In [4]:
# Drop the claims whose RatingName is OTHER or MIXTURE
df = df[~df['RatingName'].isin(['OTHER', 'MIXTURE'])]

display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
4,Ted Cruz said that veterans should start selli...,False,Unknown,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes
7,'The Georgia Lottery Corp. has only once in th...,False,Allie McCullen,Luck runs out for student on lottery claim,"Georgia Lottery,HOPE scholarship,Mega Millions...","Georgia Lottery,HOPE scholarship",Education,politifact
9,Panda Express will be celebrating its 15th ann...,False,Unknown,Panda Express 15th Anniversary Offer,"Los Angeles Times,Panda Express,Panda Inn,Wend...",Panda Express,"ASP Article, something for nothing",snopes
10,'There’s no money' for Planned Parenthood in t...,True,Tom Cole,Is there Planned Parenthood funding in the bil...,"Children’s Health Insurance Program,Fox News S...",Planned Parenthood,"Abortion,Congress,Congressional Rules,Federal ...",politifact
11,"'In 2010 alone, 1,270 infants were reported to...",False,Americans United for Life,"Americans United for Life says 1,270 babies di...","Alan Guttmacher Institute,Americans United for...",,Abortion,politifact


## Replacing "Unknown" & NaN with "Inconnue"

In [5]:
# Use the single placeholder 'Inconnue' for both the literal 'Unknown' and
# missing values. This is done on the whole frame and re-assigned: the previous
# per-column `df[column].replace(..., inplace=True)` was a chained in-place
# assignment (it does not update `df` under pandas Copy-on-Write), and it
# relied on `np.NaN`, an alias that no longer exists in NumPy 2.0.
df = df.replace('Unknown', 'Inconnue').fillna('Inconnue')

display(df.head())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
4,Ted Cruz said that veterans should start selli...,False,Inconnue,Ted Cruz: Vets Should Sell Cookies for Funding...,"David Nelson,James Morrison,John Scalzi,Republ...",Ted Cruz,"ASP Article, Not Necessarily The News",snopes
7,'The Georgia Lottery Corp. has only once in th...,False,Allie McCullen,Luck runs out for student on lottery claim,"Georgia Lottery,HOPE scholarship,Mega Millions...","Georgia Lottery,HOPE scholarship",Education,politifact
9,Panda Express will be celebrating its 15th ann...,False,Inconnue,Panda Express 15th Anniversary Offer,"Los Angeles Times,Panda Express,Panda Inn,Wend...",Panda Express,"ASP Article, something for nothing",snopes
10,'There’s no money' for Planned Parenthood in t...,True,Tom Cole,Is there Planned Parenthood funding in the bil...,"Children’s Health Insurance Program,Fox News S...",Planned Parenthood,"Abortion,Congress,Congressional Rules,Federal ...",politifact
11,"'In 2010 alone, 1,270 infants were reported to...",False,Americans United for Life,"Americans United for Life says 1,270 babies di...","Alan Guttmacher Institute,Americans United for...",Inconnue,Abortion,politifact


## Cleaning

In [6]:
# Apply clean_text to every cell of every column, producing a new frame so
# that the uncleaned `df` stays available.
dfClean = df.apply(lambda columnValues: columnValues.apply(clean_text))

display(dfClean.head(), dfClean.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
4,ted cruz said veterans start selling cookies o...,False,inconnue,ted cruz vets sell cookies funding like girl s...,david nelson james morrison john scalzi republ...,ted cruz,asp article necessarily news,snopes
7,georgia lottery corp past sixteen years paid a...,False,allie mccullen,luck runs student lottery claim,georgia lottery hope scholarship mega millions...,georgia lottery hope scholarship,education,politifact
9,panda express celebrating 15th anniversary off...,False,inconnue,panda express 15th anniversary offer,los angeles times panda express panda inn wend...,panda express,asp article something nothing,snopes
10,money planned parenthood bill would keep gover...,True,tom cole,planned parenthood funding bill stops governme...,children health insurance program fox news sun...,planned parenthood,abortion congress congressional rules federal ...,politifact
11,two thousand and ten alone 1270 infants report...,False,americans united life,americans united life says 1270 babies died at...,alan guttmacher institute americans united lif...,inconnue,abortion,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,488,488,488,488,488,488,488,488
unique,488,2,156,488,488,300,358,5
top,striped mittenfish change sex turning entire b...,false,inconnue,seven hollywood liberal heroes arrested oscar ...,two thousand and nine college basketball invit...,inconnue,inconnue,snopes
freq,1,358,282,1,1,163,28,261


## Upsampling

In [7]:
# Balanced copy of dfClean obtained by re-drawing the minority class with replacement.
dfCleanUpsample = sampling(dfClean, sample='up')
display(dfCleanUpsample.head(), dfCleanUpsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,photographs show furniture floating automobile...,True,inconnue,wood art,craftsmanship museum ferrari ferrari f50 internet,inconnue,asp article stephanie cegielski,snopes
1,photograph show aftermath bombing vatican city,False,inconnue,bombing take place vatican city,islam quran ramadan twitter vatican city vigil...,vatican city,bombing fire smoke vatican,snopes
2,actor gary sinise posted witty retort hillary ...,False,inconnue,gary sinise tweets hillary clinton guns,facebook gary sinise hillary clinton twitter g...,gary sinise hillary clinton gun control,gary sinise gun control hillary clinton,snopes
3,right georgia nearly one three leaving prisons...,True,nathan deal,deal makes arresting claim georgia prisons,medicaid nathan deal politifact chew recidivism,inconnue,criminal justice,politifact
4,health insurance companies costs four percent ...,True,america health insurance plans,health insurers get small percentage overall h...,america health insurance plans centers medicar...,health insurance,health care,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,716,716,716,716,716,716,716,716
unique,481,2,154,481,481,298,355,5
top,says people signed recall petitions wisconsin ...,true,inconnue,wisconsin tea party leader says democrathired ...,democrat eagle river jim holperin politifact r...,inconnue,inconnue,politifact
freq,11,358,369,11,11,278,38,345


## Downsampling

In [8]:
# Balanced copy of dfClean obtained by sub-sampling the majority class without replacement.
dfCleanDownsample = sampling(dfClean, sample='down')
display(dfCleanDownsample.head(), dfCleanDownsample.describe())

Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
0,says 74 percent smallbusiness people believe o...,False,larry elder,claim nt pass examination,cnn lemon gallup poll larry elder marco rubio ...,bad,health care,politifact
1,giant asteroid expected hit earth july two tho...,False,inconnue,scientists say giant asteroid could hit earth ...,asteroid columbia university facebook asteroid...,asteroid,asteroid clickbait,snopes
2,image shows radioactive seepage spreading acro...,False,inconnue,fukushima radioactive water leak chart,antarctica chernobyl nuclear power plant cs137...,pacific ocean,asp article fukushima radiation,snopes
3,photographs show toys christmas drives needy e...,False,inconnue,walmart returns full donated toys,asheville north carolina facebook toys tots wl...,walmart,asp article,snopes
4,providence port 200 miles closer europe easter...,False,james bennett,incoming economic development director says pr...,angel taveras bureau transportation statistics...,inconnue,economy infrastructure trade transportation,politifact


Unnamed: 0,Text,RatingName,Author,Headline,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source
count,260,260,260,260,260,260,260,260
unique,260,2,108,260,260,152,192,5
top,striped mittenfish change sex turning entire b...,false,inconnue,amc walking dead agrees bring beth greene back...,two thousand and nine college basketball invit...,inconnue,inconnue,politifact
freq,1,130,134,1,1,99,18,124


At this point we have df, dfClean, dfCleanUpsample and dfCleanDownsample.

<div class="alert alert-block alert-info" align="center">
    <h1>
        Encoding
    </h1>
</div>

In [9]:
# Shared encoder / vectorizer / scaler instances used by the encoding cells.
# They are single objects: every fit_transform call re-fits them, so after a
# cell has run they only hold the state of the last column they were fitted on.

# Maps each distinct value of a column to an integer label.
classLabelEncoder = LabelEncoder()

# TF-IDF vectorizers: default settings, and unigrams + bigrams.
tfidfVectorizer1 = TfidfVectorizer()
tfidfVectorizer2 = TfidfVectorizer(ngram_range=(1, 2))

# TF-IDF transformers: default settings, and one with IDF weighting disabled.
tfidfTransformer1 = TfidfTransformer()
tfidfTransformer2 = TfidfTransformer(use_idf = False)

# Plain token-count vectorizer.
countVectorizer = CountVectorizer()

# Feature scalers.
standardScaler = StandardScaler()
minMaxScaler = MinMaxScaler()

## Splitting the dataframes into columns

In [10]:
# For each of the four frames, pull every column out as its own Series.
# "HeadlineText" is the headline and the claim text joined with a single space.

# df: rows filtered and placeholders filled, but text not cleaned
dfHeadlineText = df["Headline"] + " " + df["Text"]
dfRatingName = df['RatingName']
dfAuthor = df['Author']
dfNamedEntitiesClaim = df['NamedEntitiesClaim']
dfNamedEntitiesArticle = df['NamedEntitiesArticle']
dfKeywords = df['Keywords']
dfSource = df['Source']

# dfClean: every column passed through clean_text
dfCleanHeadlineText = dfClean["Headline"] + " " + dfClean["Text"]
dfCleanRatingName = dfClean['RatingName']
dfCleanAuthor = dfClean['Author']
dfCleanNamedEntitiesClaim = dfClean['NamedEntitiesClaim']
dfCleanNamedEntitiesArticle = dfClean['NamedEntitiesArticle']
dfCleanKeywords = dfClean['Keywords']
dfCleanSource = dfClean['Source']

# dfCleanUpsample: dfClean after sampling(..., sample = 'up')
dfCleanUpsampleHeadlineText = dfCleanUpsample["Headline"] + " " + dfCleanUpsample["Text"]
dfCleanUpsampleRatingName = dfCleanUpsample['RatingName']
dfCleanUpsampleAuthor = dfCleanUpsample['Author']
dfCleanUpsampleNamedEntitiesClaim = dfCleanUpsample['NamedEntitiesClaim']
dfCleanUpsampleNamedEntitiesArticle = dfCleanUpsample['NamedEntitiesArticle']
dfCleanUpsampleKeywords = dfCleanUpsample['Keywords']
dfCleanUpsampleSource = dfCleanUpsample['Source']

# dfCleanDownsample: dfClean after sampling(..., sample = 'down')
dfCleanDownsampleHeadlineText = dfCleanDownsample["Headline"] + " " + dfCleanDownsample["Text"]
dfCleanDownsampleRatingName = dfCleanDownsample['RatingName']
dfCleanDownsampleAuthor = dfCleanDownsample['Author']
dfCleanDownsampleNamedEntitiesClaim = dfCleanDownsample['NamedEntitiesClaim']
dfCleanDownsampleNamedEntitiesArticle = dfCleanDownsample['NamedEntitiesArticle']
dfCleanDownsampleKeywords = dfCleanDownsample['Keywords']
dfCleanDownsampleSource = dfCleanDownsample['Source']

<div align="center">
    <h2>
        TF 1
    </h2>
</div>

## Make a copy of every column

In [11]:
# Work on copies so the Series extracted from `df` stay untouched and can be
# reused by the other encoding attempts.
dfHeadlineText1 = dfHeadlineText.copy()
dfAuthor1 = dfAuthor.copy()
dfNamedEntitiesClaim1 = dfNamedEntitiesClaim.copy()
dfNamedEntitiesArticle1 = dfNamedEntitiesArticle.copy()
dfKeywords1 = dfKeywords.copy()
dfSource1 = dfSource.copy()
dfRatingName1 = dfRatingName.copy()

## Transform data

In [12]:
# Attempt 1 on the uncleaned data: every feature column is label-encoded to
# integers and then standard-scaled, giving exactly one numeric column per
# original column. Each result is wrapped in a new one-column DataFrame, which
# gets a fresh default index, so the final concat lines the rows up by position.
# The shared classLabelEncoder / standardScaler are re-fitted on every call.
# NOTE(review): label-encoding free text (HeadlineText, entities, keywords)
# presumably yields an id per distinct string rather than a content-based
# feature - confirm this baseline is intended.
dfHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfHeadlineText1), 
                               columns = ['HeadlineText'])
dfHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText1), 
                               columns = ['HeadlineText'])

dfAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfAuthor1), 
                         columns = ['Author'])
dfAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfAuthor1), 
                         columns = ['Author'])

dfNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfNamedEntitiesClaim1), 
                                     columns = ['NamedEntitiesClaim'])
dfNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfNamedEntitiesClaim1), 
                                     columns = ['NamedEntitiesClaim'])

dfNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfNamedEntitiesArticle1), 
                                       columns = ['NamedEntitiesArticle'])
dfNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfNamedEntitiesArticle1), 
                                       columns = ['NamedEntitiesArticle'])

dfKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfKeywords1), 
                           columns = ['Keywords'])
dfKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfKeywords1), 
                           columns = ['Keywords'])

dfSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfSource1), 
                         columns = ['Source'])
dfSource1 = pd.DataFrame(standardScaler.fit_transform(dfSource1), 
                         columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName1), 
                             columns = ['RatingName'])

# Assemble the features followed by the target as the last column.
df1 = pd.concat([dfHeadlineText1, dfAuthor1, dfNamedEntitiesClaim1, dfNamedEntitiesArticle1, 
                 dfKeywords1, dfSource1, dfRatingName1], axis = 1)

display(df1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,1.238703,-0.049871,1.018647,1.343932,-1.392961,0.718769,0
1,0.280394,-2.217558,1.366477,-0.328660,-0.259501,-0.799669,0
2,0.734703,-0.049871,1.508449,0.851993,-1.152226,0.718769,0
3,0.102929,2.544784,0.791492,0.894159,-1.092043,-0.799669,1
4,-1.622026,-2.151870,-0.763098,-0.117829,-1.112104,-0.799669,0
...,...,...,...,...,...,...,...
483,-0.330084,-1.790589,-0.202310,-1.200094,-0.259501,-0.799669,0
484,-1.501350,-1.954808,0.060338,-0.708156,-0.460114,-0.799669,0
485,-0.649520,-1.593527,0.464957,-0.117829,-0.379869,-0.799669,1
486,1.508449,-0.049871,-1.316787,0.177334,1.495857,0.718769,0


## Saving the transformed data

In [13]:
# Write the encoded frame as a ';'-separated CSV without the index column.
# NOTE(review): presumably the 'attemps/' directory must already exist - confirm.
df1.to_csv('attemps/tf1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned 1
    </h2>
</div>

## Make a copy of every column

In [14]:
# Work on copies so the Series extracted from `dfClean` stay untouched and can
# be reused by the other encoding attempts.
dfCleanHeadlineText1 = dfCleanHeadlineText.copy()
dfCleanAuthor1 = dfCleanAuthor.copy()
dfCleanNamedEntitiesClaim1 = dfCleanNamedEntitiesClaim.copy()
dfCleanNamedEntitiesArticle1 = dfCleanNamedEntitiesArticle.copy()
dfCleanKeywords1 = dfCleanKeywords.copy()
dfCleanSource1 = dfCleanSource.copy()
dfCleanRatingName1 = dfCleanRatingName.copy()

## Transform data

In [15]:
# Attempt 1 on the cleaned data: every feature column is label-encoded to
# integers and then standard-scaled, giving one numeric column per original
# column. Each result is wrapped in a new one-column DataFrame with a fresh
# default index, so the final concat lines the rows up by position.
# The shared classLabelEncoder / standardScaler are re-fitted on every call.
dfCleanHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanHeadlineText1), 
                                    columns = ['HeadlineText'])
dfCleanHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfCleanHeadlineText1), 
                                    columns = ['HeadlineText'])

dfCleanAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanAuthor1), 
                              columns = ['Author'])
dfCleanAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfCleanAuthor1), 
                              columns = ['Author'])

dfCleanNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanNamedEntitiesClaim1), 
                                          columns = ['NamedEntitiesClaim'])
dfCleanNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfCleanNamedEntitiesClaim1), 
                                          columns = ['NamedEntitiesClaim'])

dfCleanNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanNamedEntitiesArticle1), 
                                            columns = ['NamedEntitiesArticle'])
dfCleanNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfCleanNamedEntitiesArticle1), 
                                            columns = ['NamedEntitiesArticle'])

dfCleanKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanKeywords1), 
                                columns = ['Keywords'])
dfCleanKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfCleanKeywords1), 
                                columns = ['Keywords'])

dfCleanSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanSource1), 
                              columns = ['Source'])
dfCleanSource1 = pd.DataFrame(standardScaler.fit_transform(dfCleanSource1), 
                              columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfCleanRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName1), 
                                  columns = ['RatingName'])

# Assemble the features followed by the target as the last column.
dfClean1 = pd.concat([dfCleanHeadlineText1, dfCleanAuthor1, dfCleanNamedEntitiesClaim1, dfCleanNamedEntitiesArticle1, 
                      dfCleanKeywords1, dfCleanSource1, dfCleanRatingName1], axis = 1)

display(dfClean1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,1.316787,-0.051841,0.578535,1.635848,-1.148052,0.718769,0
1,0.039042,-2.192984,0.961858,-0.256948,0.429747,-0.799669,0
2,0.521746,-0.051841,1.103830,1.070834,-1.036019,0.718769,0
3,0.649520,2.550472,0.301690,1.141461,-1.652201,-0.799669,1
4,-1.614928,-2.127103,-1.252900,-0.002692,-1.670874,-0.799669,0
...,...,...,...,...,...,...,...
483,-0.614027,-1.764756,-0.663718,-0.836087,0.429747,-0.799669,0
484,-1.529745,-1.929459,-0.401070,-0.680708,0.037631,-0.799669,0
485,0.564337,-1.567112,0.024845,-0.002692,0.289706,-0.799669,1
486,-0.145521,-0.051841,1.558139,0.279815,0.485764,0.718769,0


## Saving the transformed data

In [16]:
# Write the encoded frame as a ';'-separated CSV without the index column.
# NOTE(review): presumably the 'attemps/' directory must already exist - confirm.
dfClean1.to_csv('attemps/tfclean1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned upsampled 1
    </h2>
</div>

## Make a copy of every column

In [17]:
# Work on copies so the Series extracted from `dfCleanUpsample` stay untouched
# and can be reused by the other encoding attempts.
dfCleanUpsampleHeadlineText1 = dfCleanUpsampleHeadlineText.copy()
dfCleanUpsampleAuthor1 = dfCleanUpsampleAuthor.copy()
dfCleanUpsampleNamedEntitiesClaim1 = dfCleanUpsampleNamedEntitiesClaim.copy()
dfCleanUpsampleNamedEntitiesArticle1 = dfCleanUpsampleNamedEntitiesArticle.copy()
dfCleanUpsampleKeywords1 = dfCleanUpsampleKeywords.copy()
dfCleanUpsampleSource1 = dfCleanUpsampleSource.copy()
dfCleanUpsampleRatingName1 = dfCleanUpsampleRatingName.copy()

## Transform data

In [18]:
# Attempt 1 on the cleaned, up-sampled data: every feature column is
# label-encoded to integers and then standard-scaled, giving one numeric column
# per original column. Each result is wrapped in a new one-column DataFrame
# with a fresh default index, so the final concat lines the rows up by position.
# The shared classLabelEncoder / standardScaler are re-fitted on every call.
dfCleanUpsampleHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleHeadlineText1), 
                                            columns = ['HeadlineText'])
dfCleanUpsampleHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleHeadlineText1), 
                                            columns = ['HeadlineText'])

dfCleanUpsampleAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleAuthor1), 
                                      columns = ['Author'])
dfCleanUpsampleAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleAuthor1), 
                                      columns = ['Author'])

dfCleanUpsampleNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleNamedEntitiesClaim1), 
                                                  columns = ['NamedEntitiesClaim'])
dfCleanUpsampleNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleNamedEntitiesClaim1), 
                                                  columns = ['NamedEntitiesClaim'])

dfCleanUpsampleNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleNamedEntitiesArticle1), 
                                                    columns = ['NamedEntitiesArticle'])
dfCleanUpsampleNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleNamedEntitiesArticle1), 
                                                    columns = ['NamedEntitiesArticle'])

dfCleanUpsampleKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleKeywords1), 
                                        columns = ['Keywords'])
dfCleanUpsampleKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleKeywords1), 
                                        columns = ['Keywords'])

dfCleanUpsampleSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleSource1), 
                                      columns = ['Source'])
dfCleanUpsampleSource1 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleSource1), 
                                      columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfCleanUpsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName1), 
                                          columns = ['RatingName'])

# Assemble the features followed by the target as the last column.
dfCleanUpsample1 = pd.concat([dfCleanUpsampleHeadlineText1, dfCleanUpsampleAuthor1, dfCleanUpsampleNamedEntitiesClaim1, 
                              dfCleanUpsampleNamedEntitiesArticle1, dfCleanUpsampleKeywords1, dfCleanUpsampleSource1, 
                              dfCleanUpsampleRatingName1], axis = 1)

display(dfCleanUpsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,1.649966,-0.065213,0.469314,-0.020832,-1.051105,0.822997,1
1,-1.410355,-0.065213,1.081349,2.056934,-0.779915,0.822997,0
2,-0.548094,-0.065213,0.814934,-0.347338,0.837878,0.822997,0
3,-1.049901,1.290993,1.174955,-0.020832,-0.050505,-0.711549,1
4,-0.406739,-2.131812,-1.064374,-0.228608,0.931392,-0.711549,1
...,...,...,...,...,...,...,...
711,-1.678928,-0.065213,1.736587,2.131140,1.034257,2.357542,1
712,1.240039,2.614908,-0.157122,1.700745,0.847229,-0.711549,0
713,-1.572912,-0.065213,-0.553145,-0.020832,1.034257,0.822997,1
714,0.533267,-0.065213,-1.165180,-0.020832,1.034257,0.822997,0


## Saving the transformed data

In [19]:
# Write the encoded frame as a ';'-separated CSV without the index column.
# NOTE(review): presumably the 'attemps/' directory must already exist - confirm.
dfCleanUpsample1.to_csv('attemps/tfcleanupsample1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF cleaned downsampled 1
    </h2>
</div>

## Make a copy of every column

In [20]:
# Work on copies so the Series extracted from `dfCleanDownsample` stay
# untouched and can be reused by the other encoding attempts.
dfCleanDownsampleHeadlineText1 = dfCleanDownsampleHeadlineText.copy()
dfCleanDownsampleAuthor1 = dfCleanDownsampleAuthor.copy()
dfCleanDownsampleNamedEntitiesClaim1 = dfCleanDownsampleNamedEntitiesClaim.copy()
dfCleanDownsampleNamedEntitiesArticle1 = dfCleanDownsampleNamedEntitiesArticle.copy()
dfCleanDownsampleKeywords1 = dfCleanDownsampleKeywords.copy()
dfCleanDownsampleSource1 = dfCleanDownsampleSource.copy()
dfCleanDownsampleRatingName1 = dfCleanDownsampleRatingName.copy()

## Transform data

In [21]:
# Attempt 1 on the cleaned, down-sampled data: every feature column is
# label-encoded to integers and then standard-scaled, giving one numeric column
# per original column. Each result is wrapped in a new one-column DataFrame
# with a fresh default index, so the final concat lines the rows up by position.
# The shared classLabelEncoder / standardScaler are re-fitted on every call.
dfCleanDownsampleHeadlineText1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleHeadlineText1), 
                                              columns = ['HeadlineText'])
dfCleanDownsampleHeadlineText1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleHeadlineText1), 
                                              columns = ['HeadlineText'])

dfCleanDownsampleAuthor1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleAuthor1), 
                                        columns = ['Author'])
dfCleanDownsampleAuthor1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleAuthor1), 
                                        columns = ['Author'])

dfCleanDownsampleNamedEntitiesClaim1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleNamedEntitiesClaim1), 
                                                    columns = ['NamedEntitiesClaim'])
dfCleanDownsampleNamedEntitiesClaim1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleNamedEntitiesClaim1), 
                                                    columns = ['NamedEntitiesClaim'])

dfCleanDownsampleNamedEntitiesArticle1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleNamedEntitiesArticle1), 
                                                      columns = ['NamedEntitiesArticle'])
dfCleanDownsampleNamedEntitiesArticle1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleNamedEntitiesArticle1), 
                                                      columns = ['NamedEntitiesArticle'])

dfCleanDownsampleKeywords1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleKeywords1), 
                                          columns = ['Keywords'])
dfCleanDownsampleKeywords1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleKeywords1), 
                                          columns = ['Keywords'])

dfCleanDownsampleSource1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleSource1), 
                                        columns = ['Source'])
dfCleanDownsampleSource1 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleSource1), 
                                        columns = ['Source'])

# The target is label-encoded only; it is not scaled.
dfCleanDownsampleRatingName1 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName1), 
                                            columns = ['RatingName'])

# Assemble the features followed by the target as the last column.
dfCleanDownsample1 = pd.concat([dfCleanDownsampleHeadlineText1, dfCleanDownsampleAuthor1, 
                                dfCleanDownsampleNamedEntitiesClaim1, dfCleanDownsampleNamedEntitiesArticle1, 
                                dfCleanDownsampleKeywords1, dfCleanDownsampleSource1, dfCleanDownsampleRatingName1], 
                               axis = 1)

display(dfCleanDownsample1)

Unnamed: 0,HeadlineText,Author,NamedEntitiesClaim,NamedEntitiesArticle,Keywords,Source,RatingName
0,-1.125841,0.519433,0.526281,-1.703285,0.893189,-0.760398,0
1,1.072547,-0.150249,-0.539604,-1.731990,-0.919887,0.784160,0
2,-0.552928,-0.150249,-0.752781,1.052354,-1.265234,0.784160,0
3,1.605490,-0.150249,-0.632869,1.970900,-1.386106,0.784160,0
4,-0.166545,-0.060958,-0.792752,-0.038420,0.409702,-0.760398,0
...,...,...,...,...,...,...,...
255,1.019253,1.858797,0.699487,1.425513,-0.678143,-0.760398,1
256,-0.393045,-1.980714,-0.965959,-0.268057,0.893189,-0.760398,1
257,-0.632869,-0.864577,-0.246486,0.736603,0.116156,-0.760398,1
258,0.566252,-1.534259,0.099927,-0.038420,0.323365,-0.760398,1


## Saving the transformed data

In [22]:
# Write the encoded frame as a ';'-separated CSV without the index column.
# NOTE(review): presumably the 'attemps/' directory must already exist - confirm.
dfCleanDownsample1.to_csv('attemps/tfcleandownsample1.csv', sep = ';', index = False)

<div align="center">
    <h2>
        TF2
    </h2>
</div>

## Make a copy of every column

In [23]:
# Work on copies so the Series extracted from `df` stay untouched and can be
# reused by the other encoding attempts.
dfHeadlineText2 = dfHeadlineText.copy()
dfAuthor2 = dfAuthor.copy()
dfNamedEntitiesClaim2 = dfNamedEntitiesClaim.copy()
dfNamedEntitiesArticle2 = dfNamedEntitiesArticle.copy()
dfKeywords2 = dfKeywords.copy()
dfSource2 = dfSource.copy()
dfRatingName2 = dfRatingName.copy()

## Transform data

In [24]:
# Attempt 2 on the uncleaned data: TF-IDF features for "Headline + Text",
# one-hot encoded Source, label-encoded target.
tfidfMatrix = tfidfVectorizer1.fit_transform(dfHeadlineText2).toarray()

# Vocabulary terms become the column names. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); prefer the new method
# and fall back to the old one on releases that do not have it yet.
if hasattr(tfidfVectorizer1, 'get_feature_names_out'):
    tfidfFeatureNames = list(tfidfVectorizer1.get_feature_names_out())
else:
    tfidfFeatureNames = tfidfVectorizer1.get_feature_names()

dfHeadlineText2 = pd.DataFrame(tfidfMatrix, columns = tfidfFeatureNames)
dfHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfHeadlineText2),
                               columns = tfidfFeatureNames)

# One indicator column per source; the index is reset so the rows line up by
# position with the freshly built frames in the concat below.
dfSource2 = pd.get_dummies(dfSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# The target is label-encoded only; it is not scaled.
dfRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfRatingName2), columns = ['RatingName'])

# Assemble the features followed by the target as the last column.
df2 = pd.concat([dfHeadlineText2, dfSource2, dfRatingName2], axis = 1)

display(df2)

Unnamed: 0,000,10,100,100k,101st,102,106,11,120,13,...,zimmerman,zipper,zippered,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,0,1,0,0
1,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
2,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,0,1,0,0
3,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,1
4,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
484,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
485,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,1
486,-0.200002,-0.062592,-0.087289,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.075263,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,0,1,0,0


## Saving the transformed data

In [25]:
# Write the TF2 frame to disk as a ';'-separated CSV, leaving out the row index.
df2.to_csv(
    path_or_buf = 'attemps/tf2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned 2
    </h2>
</div>

## Make a copy of every column

In [26]:
# Take an independent copy of every cleaned column for the "TF cleaned 2" variant.
(
    dfCleanHeadlineText2,
    dfCleanAuthor2,
    dfCleanNamedEntitiesClaim2,
    dfCleanNamedEntitiesArticle2,
    dfCleanKeywords2,
    dfCleanSource2,
    dfCleanRatingName2,
) = (
    column.copy()
    for column in (
        dfCleanHeadlineText,
        dfCleanAuthor,
        dfCleanNamedEntitiesClaim,
        dfCleanNamedEntitiesArticle,
        dfCleanKeywords,
        dfCleanSource,
        dfCleanRatingName,
    )
)

## Transform data

In [27]:
# Turn the cleaned headlines into a dense TF-IDF matrix (one column per vocabulary term).
cleanHeadlineTfidf2 = tfidfVectorizer1.fit_transform(dfCleanHeadlineText2).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use whichever exists so the cell runs on old and new releases.
if hasattr(tfidfVectorizer1, 'get_feature_names_out'):
    cleanHeadlineTerms2 = tfidfVectorizer1.get_feature_names_out()
else:
    cleanHeadlineTerms2 = tfidfVectorizer1.get_feature_names()

dfCleanHeadlineText2 = pd.DataFrame(cleanHeadlineTfidf2, columns = cleanHeadlineTerms2)
# Rescale every term column with the shared scaler, keeping the terms as column labels.
dfCleanHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanHeadlineText2), 
                                    columns = cleanHeadlineTerms2)

# One-hot encode the source. The index is reset so the rows line up with the freshly built
# (0..n-1 indexed) frames when everything is concatenated column-wise below.
dfCleanSource2 = pd.get_dummies(dfCleanSource2, columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the rating labels with the shared label encoder.
dfCleanRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanRatingName2), columns = ['RatingName'])

# Headline terms, then source indicators, then RatingName as the last column.
dfClean2 = pd.concat([dfCleanHeadlineText2, dfCleanSource2, dfCleanRatingName2], axis = 1)

display(dfClean2)

Unnamed: 0,05,1000,10000,100000,100k,101st,102000,106000,1270,150000,...,zimmerman,zipper,zippered,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,0,1,0,0
1,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
2,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,0,1,0,0
3,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,1
4,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,22.068076,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
484,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,0
485,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,1,0,0,1
486,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,-0.045314,...,-0.045314,-0.045314,-0.045314,-0.045314,0,0,0,1,0,0


## Saving the transformed data

In [28]:
# Write the "TF cleaned 2" frame to disk as a ';'-separated CSV, leaving out the row index.
dfClean2.to_csv(
    path_or_buf = 'attemps/tfclean2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned upsampled 2
    </h2>
</div>

## Make a copy of every column

In [29]:
# Take an independent copy of every cleaned, upsampled column for the
# "TF cleaned upsampled 2" variant.
(
    dfCleanUpsampleHeadlineText2,
    dfCleanUpsampleAuthor2,
    dfCleanUpsampleNamedEntitiesClaim2,
    dfCleanUpsampleNamedEntitiesArticle2,
    dfCleanUpsampleKeywords2,
    dfCleanUpsampleSource2,
    dfCleanUpsampleRatingName2,
) = (
    column.copy()
    for column in (
        dfCleanUpsampleHeadlineText,
        dfCleanUpsampleAuthor,
        dfCleanUpsampleNamedEntitiesClaim,
        dfCleanUpsampleNamedEntitiesArticle,
        dfCleanUpsampleKeywords,
        dfCleanUpsampleSource,
        dfCleanUpsampleRatingName,
    )
)

## Transform data

In [30]:
# Turn the cleaned, upsampled headlines into a dense TF-IDF matrix
# (one column per vocabulary term).
cleanUpsampleHeadlineTfidf2 = tfidfVectorizer1.fit_transform(dfCleanUpsampleHeadlineText2).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use whichever exists so the cell runs on old and new releases.
if hasattr(tfidfVectorizer1, 'get_feature_names_out'):
    cleanUpsampleHeadlineTerms2 = tfidfVectorizer1.get_feature_names_out()
else:
    cleanUpsampleHeadlineTerms2 = tfidfVectorizer1.get_feature_names()

dfCleanUpsampleHeadlineText2 = pd.DataFrame(cleanUpsampleHeadlineTfidf2, 
                                            columns = cleanUpsampleHeadlineTerms2)
# Rescale every term column with the shared scaler, keeping the terms as column labels.
dfCleanUpsampleHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanUpsampleHeadlineText2), 
                                            columns = cleanUpsampleHeadlineTerms2)

# One-hot encode the source. The index is reset so the rows line up with the freshly built
# (0..n-1 indexed) frames when everything is concatenated column-wise below.
dfCleanUpsampleSource2 = pd.get_dummies(dfCleanUpsampleSource2, 
                                        columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the rating labels with the shared label encoder.
dfCleanUpsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanUpsampleRatingName2), 
                                          columns = ['RatingName'])

# Headline terms, then source indicators, then RatingName as the last column.
dfCleanUpsample2 = pd.concat([dfCleanUpsampleHeadlineText2, dfCleanUpsampleSource2, 
                              dfCleanUpsampleRatingName2], axis = 1)

display(dfCleanUpsample2)

Unnamed: 0,05,1000,10000,100000,100k,101st,102000,106000,1270,150000,...,zimmerman,zipper,zippered,zuma,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,0,1,0,1
1,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,0,1,0,0
2,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,0,1,0,0
3,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,1,0,0,1
4,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,0,0,1,1
712,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,1,0,0,0
713,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,0,1,0,1
714,-0.037398,-0.037398,-0.074953,-0.037398,-0.037398,-0.037398,-0.037398,-0.074953,-0.037398,-0.052926,...,-0.037398,-0.037398,-0.037398,-0.037398,0,0,0,1,0,0


## Saving the transformed data

In [31]:
# Write the "TF cleaned upsampled 2" frame to disk as a ';'-separated CSV,
# leaving out the row index.
dfCleanUpsample2.to_csv(
    path_or_buf = 'attemps/tfcleanupsample2.csv',
    sep = ';',
    index = False,
)

<div align="center">
    <h2>
        TF cleaned downsampled 2
    </h2>
</div>

## Make a copy of every column

In [32]:
# Take an independent copy of every cleaned, downsampled column for the
# "TF cleaned downsampled 2" variant.
(
    dfCleanDownsampleHeadlineText2,
    dfCleanDownsampleAuthor2,
    dfCleanDownsampleNamedEntitiesClaim2,
    dfCleanDownsampleNamedEntitiesArticle2,
    dfCleanDownsampleKeywords2,
    dfCleanDownsampleSource2,
    dfCleanDownsampleRatingName2,
) = (
    column.copy()
    for column in (
        dfCleanDownsampleHeadlineText,
        dfCleanDownsampleAuthor,
        dfCleanDownsampleNamedEntitiesClaim,
        dfCleanDownsampleNamedEntitiesArticle,
        dfCleanDownsampleKeywords,
        dfCleanDownsampleSource,
        dfCleanDownsampleRatingName,
    )
)

## Transform data

In [33]:
# Turn the cleaned, downsampled headlines into a dense TF-IDF matrix
# (one column per vocabulary term).
cleanDownsampleHeadlineTfidf2 = tfidfVectorizer1.fit_transform(dfCleanDownsampleHeadlineText2).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use whichever exists so the cell runs on old and new releases.
if hasattr(tfidfVectorizer1, 'get_feature_names_out'):
    cleanDownsampleHeadlineTerms2 = tfidfVectorizer1.get_feature_names_out()
else:
    cleanDownsampleHeadlineTerms2 = tfidfVectorizer1.get_feature_names()

dfCleanDownsampleHeadlineText2 = pd.DataFrame(cleanDownsampleHeadlineTfidf2, 
                                              columns = cleanDownsampleHeadlineTerms2)
# Rescale every term column with the shared scaler, keeping the terms as column labels.
dfCleanDownsampleHeadlineText2 = pd.DataFrame(standardScaler.fit_transform(dfCleanDownsampleHeadlineText2), 
                                              columns = cleanDownsampleHeadlineTerms2)

# One-hot encode the source. The index is reset so the rows line up with the freshly built
# (0..n-1 indexed) frames when everything is concatenated column-wise below.
dfCleanDownsampleSource2 = pd.get_dummies(dfCleanDownsampleSource2, 
                                          columns = ['Source'], prefix = 'Source').reset_index(drop = True)

# Encode the rating labels with the shared label encoder.
dfCleanDownsampleRatingName2 = pd.DataFrame(classLabelEncoder.fit_transform(dfCleanDownsampleRatingName2), 
                                            columns = ['RatingName'])

# Headline terms, then source indicators, then RatingName as the last column.
dfCleanDownsample2 = pd.concat([dfCleanDownsampleHeadlineText2, dfCleanDownsampleSource2, 
                                dfCleanDownsampleRatingName2], axis = 1)

display(dfCleanDownsample2)

Unnamed: 0,05,10000,106000,150000,1729000000,17yearold,18000,1940s,200,20000,...,york,young,zambian,zimmerman,Source_africacheck,Source_factscan,Source_politifact,Source_snopes,Source_truthorfiction,RatingName
0,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,1,0,0,0
1,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,0,1,0,0
2,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,0,1,0,0
3,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,0,1,0,0
4,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,16.093477,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,1,0,0,1
256,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,1,0,0,1
257,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,1,0,0,1
258,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,-0.062137,...,-0.136522,-0.062137,-0.062137,-0.062137,0,0,1,0,0,1


## Saving the transformed data

In [34]:
# Write the "TF cleaned downsampled 2" frame to disk as a ';'-separated CSV,
# leaving out the row index.
dfCleanDownsample2.to_csv(
    path_or_buf = 'attemps/tfcleandownsample2.csv',
    sep = ';',
    index = False,
)