In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import joblib

import spacy
import contractions
import string
import re
from bs4 import BeautifulSoup
from unidecode import unidecode
import nltk
import swifter
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from transformers import AutoTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Load the raw WELFake dataset (path is relative to the working directory).
news_df = pd.read_csv(filepath_or_buffer='WELFake_Dataset.csv')

In [3]:
# Preview the first 20 rows of the freshly loaded frame.
news_df.head(n=20)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
6,6,DR BEN CARSON TARGETED BY THE IRS: “I never ha...,DR. BEN CARSON TELLS THE STORY OF WHAT HAPPENE...,1
7,7,HOUSE INTEL CHAIR On Trump-Russia Fake Story: ...,,1
8,8,Sports Bar Owner Bans NFL Games…Will Show Only...,"The owner of the Ringling Bar, located south o...",1
9,9,Latest Pipeline Leak Underscores Dangers Of Da...,"FILE – In this Sept. 15, 2005 file photo, the ...",1


In [4]:
# Count missing values per column.
news_df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [5]:
# Discard every row whose 'text' value is missing.
news_df = news_df.dropna(subset=['text'])

In [6]:
# Re-count missing values per column after dropping rows without text.
news_df.isna().sum()

Unnamed: 0      0
title         558
text            0
label           0
dtype: int64

In [7]:
# Replace every remaining missing value with an empty string.
news_df = news_df.fillna('')

In [8]:
# Drop rows whose text is empty once surrounding whitespace is stripped,
# then renumber the index from 0.
is_blank = news_df['text'].str.strip().eq('')
blank_rows = news_df.index[is_blank]
news_df = news_df.drop(index=blank_rows).reset_index(drop=True)

In [9]:
# Collect the positional index of every row whose text does NOT start with a
# short header terminated by " -". These positions are used downstream to
# decide which rows must be left untouched when headers are stripped.
#
# The checks are explicit conditions rather than assert/except so that they
# still run under `python -O` and so that unrelated errors are not swallowed.
MAX_HEADER_LEN = 260  # a prefix this long (or longer) is not treated as a header

no_header = []
for i, val in enumerate(news_df['text'].values):
    # Non-string values cannot be split; treat them as having no header.
    if not isinstance(val, str):
        no_header.append(i)
        continue

    parts = val.split(" -", maxsplit=1)
    # Either there is no " -" at all, or the part before it is too long to be
    # a header at the start of the text.
    if len(parts) < 2 or len(parts[0]) >= MAX_HEADER_LEN:
        no_header.append(i)

In [10]:
# Strip the leading header (everything up to and including the first " -")
# from each row that has one; rows listed in `no_header` are kept unchanged.
# Splitting is restricted to rows known to contain a header, because taking
# the second piece of a string that did not split would raise an IndexError
# instead of appending a value.
no_header_positions = set(no_header)  # set: constant-time membership per row

txt_temp = [
    val if i in no_header_positions else val.split(" -", maxsplit=1)[1]
    for i, val in enumerate(news_df['text'].values)
]

In [11]:
# Replace the 'text' column with the header-stripped strings.
news_df = news_df.assign(text=txt_temp)

In [12]:
# Preview the first 20 rows after header stripping.
news_df.head(n=20)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
6,6,DR BEN CARSON TARGETED BY THE IRS: “I never ha...,DR. BEN CARSON TELLS THE STORY OF WHAT HAPPENE...,1
7,8,Sports Bar Owner Bans NFL Games…Will Show Only...,"The owner of the Ringling Bar, located south o...",1
8,9,Latest Pipeline Leak Underscores Dangers Of Da...,"FILE – In this Sept. 15, 2005 file photo, the ...",1
9,10,GOP Senator Just Smacked Down The Most Puncha...,The most punchable Alt-Right Nazi on the inter...,1


In [13]:
# Merge title and body into a single text field. The space separator keeps the
# last word of the title from fusing with the first word of the body; strip()
# removes the stray leading space left behind when the title is empty.
news_df['text'] = (news_df['title'] + ' ' + news_df['text']).str.strip()
news_df = news_df.drop(columns=['title', 'Unnamed: 0'])

In [14]:
#Unicode to ascii if exists
def replace_foreign_chars(text):
    """Return ``text`` after transliteration by ``unidecode``."""
    ascii_text = unidecode(text)
    return ascii_text

news_df['text'] = news_df['text'].map(replace_foreign_chars)

#Contractions handling
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Build the expanded column, drop the old one, and rename the new one back to
# 'text' (this leaves 'text' as the last column of the frame).
news_df = (
    news_df
    .assign(expanded_text=news_df['text'].map(expand_contractions))
    .drop(columns='text')
    .rename(columns={'expanded_text': 'text'})
)

#Convert text to lowercase text
def lowercase_text(text):
    """Lowercase every whitespace-separated word and re-join with single spaces.

    Because the words are re-joined with one space, leading/trailing whitespace
    is dropped and internal whitespace runs collapse to a single space.
    """
    words = text.split()
    return " ".join(map(str.lower, words))

# Lowercase (and whitespace-normalize) every document.
news_df['text'] = news_df['text'].map(lowercase_text)

#Remove URLs and special Characters
def remove_urls(text):
    """Delete each occurrence of ``http`` followed by one or more non-whitespace characters."""
    return re.sub(r'http\S+', '', text)

def remove_special_characters(text):
    """Keep only ASCII letters, digits and whitespace; delete everything else."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

#Remove html tags func
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's 'html.parser' and return its text content."""
    return BeautifulSoup(text, 'html.parser').get_text()

# Order matters: HTML has to be stripped while '<', '>' and '&' are still
# present. Once remove_special_characters has deleted them, a tag such as
# "<br>" has already degraded to the plain word "br" and can no longer be
# recognised by the parser.
news_df['text'] = news_df['text'].apply(remove_html_tags)
news_df['text'] = news_df['text'].apply(remove_urls)
news_df['text'] = news_df['text'].apply(remove_special_characters)

#Delete numbers from dataset
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
# a literal string by default, which would search for the two characters
# backslash + 'd' and leave every digit in place. The raw string avoids the
# invalid-escape warning for '\d'.
news_df["text"] = news_df["text"].str.replace(r'\d', '', regex=True)

#Punctuation handling
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translate table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Strip ASCII punctuation from every document.
news_df['text'] = news_df['text'].map(remove_punctuation)

#Remove Stopwords and Punctuations
# English stop words plus single punctuation characters, built once.
# Rebuilding this set inside the function would repeat the same work for
# every row the function is applied to.
_STOP_WORDS = frozenset(stopwords.words("english")) | frozenset(string.punctuation)


def remove_stopwords_punctuation(text):
    """Drop stop words and stand-alone punctuation tokens from ``text``.

    The text is split on whitespace; tokens found in the English NLTK stop-word
    list or equal to a single ``string.punctuation`` character are removed, and
    the remaining tokens are joined back with single spaces.
    """
    return " ".join(word for word in text.split() if word not in _STOP_WORDS)

# Filter stop words and stand-alone punctuation tokens out of every document.
news_df["text"] = news_df["text"].map(remove_stopwords_punctuation)



# To save preprocessed lemmatized dataset
# Single WordNet lemmatizer instance shared by the helpers below.
lemmatizer = WordNetLemmatizer()


def lemmatize_word(word):
    """Return the lemma of ``word`` as produced by the shared ``lemmatizer``."""
    lemma = lemmatizer.lemmatize(word)
    return lemma

def lemmatize_text(text):
    """Tokenize ``text`` with ``nltk.word_tokenize``, lemmatize each token and
    join the results with single spaces."""
    return ' '.join(lemmatize_word(token) for token in nltk.word_tokenize(text))

In [15]:
# Lemmatize every document through swifter's apply wrapper, then write the
# fully preprocessed frame to disk without the index column.
news_df['text'] = news_df['text'].swifter.apply(lemmatize_text)
news_df.to_csv(
    path_or_buf='new_WELFake_preprocessed_lemmatized_dataset.csv',
    index=False,
)


Pandas Apply:   0%|          | 0/71351 [00:00<?, ?it/s]