In [1]:
##Fake News Detection in English and Spanish
##Pre-processing data sets to be easily processed later
import numpy as np
import pandas as pd
import re
import nltk
import simplemma

In [2]:
#English Fake News Dataset
#Load the raw CSV, keep only the article body and its label, and normalise both
#into the shared ('Text', 'Category') schema with Category in {'Fake', 'Real'}.
#
#Cleaning is done with vectorised column operations rather than a row loop:
#  * whitespace-only (or empty) article bodies are turned into NaN via .mask(),
#  * labels other than 'FAKE'/'REAL' become NaN as a result of .map(),
#  * a single dropna() then removes every incomplete row.
#A per-row `en_data['Text'][i] = ...` assignment is chained indexing, which pandas
#does not guarantee will write through to the frame, and calling .isspace() on a
#NaN cell raises AttributeError, so neither is used here.
en_data = (
    pd.read_csv('Original Datasets/English_Dataset_Fake News.csv')
    .loc[:, ['text', 'label']]
    #Rename columns
    .rename(columns={'text': 'Text', 'label': 'Category'})
    .assign(
        #Map Category values to Fake and Real only (anything else -> NaN)
        Category=lambda frame: frame['Category'].map({'FAKE': 'Fake', 'REAL': 'Real'}),
        #Blank article bodies -> NaN so that dropna() below removes them
        Text=lambda frame: frame['Text'].mask(frame['Text'].str.strip().eq('')),
    )
    .dropna()
    .reset_index(drop=True)
)
en_data.head()

Unnamed: 0,Text,Category
0,"Daniel Greenfield, a Shillman Journalism Fello...",Fake
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,Fake
2,U.S. Secretary of State John F. Kerry said Mon...,Real
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",Fake
4,It's primary day in New York and front-runners...,Real


In [3]:
#SPANISH Fake News Dataset
#Read the workbook, keep the two columns shared with the English frame, and
#relabel the categories so both datasets use 'Fake' / 'Real'.
spn_category_labels = {'Fake': 'Fake', 'True': 'Real'}
spn_data = (
    pd.read_excel('Original Datasets/Spanish_Dataset_Fake News.xlsx')
    .loc[:, ['Text', 'Category']]
)
spn_data['Category'] = spn_data['Category'].map(spn_category_labels)
spn_data.head()

Unnamed: 0,Text,Category
0,"RAE INCLUIRÁ LA PALABRA ""LADY"" EN EL DICCIONAR...",Fake
1,"La palabra ""haiga"", aceptada por la RAE La Rea...",Fake
2,YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L...,Fake
3,UNAM capacitará a maestros para aprobar prueba...,Real
4,Alerta: pretenden aprobar libros escolares con...,Fake


In [4]:
#English Processing w/ NLTK 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
#NLTK resources needed by the English pipeline defined in this cell.
#Each nltk.download call is a no-op when the package is already installed.
nltk.download('words') #download list of English words
nltk.download('stopwords') #download list of English stopwords
nltk.download('wordnet') #WordNet data backing WordNetLemmatizer
nltk.download('averaged_perceptron_tagger') #tagger model used by nltk.pos_tag
#NOTE(review): newer NLTK releases appear to look the tagger up under the
#'_eng' suffixed name; fetching both keeps the cell working across versions.
nltk.download('averaged_perceptron_tagger_eng')

#Shared objects used by the helper functions below
lemmatizer = WordNetLemmatizer()
stopWords = stopwords.words('english')
enWords = set(nltk.corpus.words.words())

#lowercase and remove weird characters from text
def lower_char_process(text):
    """Lowercase ``text`` and strip URLs, punctuation and non-ASCII characters.

    Steps, in order:
      1. lowercase the whole string;
      2. delete every ``http://`` / ``https://`` URL (only the URL itself, up to
         the next whitespace character);
      3. delete every character that is not an ASCII letter, a digit, a space,
         a newline or a period (this also collapses contractions such as
         "don't" into "dont" and drops accented letters);
      4. delete the remaining periods.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. Whitespace is left as-is, so removing a URL can leave
        two consecutive spaces behind.
    """
    text = text.lower()
    #Remove URLS -- \S+ stops at the first whitespace so the words that follow
    #the URL on the same line are preserved
    text = re.sub(r'https?://\S+', '', text)
    #Remove any weird characters, contractions
    text = re.sub(r'[^a-zA-Z0-9 \n.]', '', text)
    text = text.replace('.', '')
    return text

#Translate the tags produced by nltk.pos_tag into the part-of-speech constants
#exposed by the wordnet corpus reader, which is what the lemmatizer expects
def nltk_pos_tagger(nltk_tag):
    """Return the wordnet POS constant matching ``nltk_tag``, or None.

    Only the first letter of the tag decides the result:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag yields None.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    return None
    
def lemmatized(tokens):
    """Lemmatize ``tokens`` using their part-of-speech tags.

    The tokens are tagged with nltk.pos_tag; each tag is converted with
    nltk_pos_tagger. Tokens whose tag has no wordnet equivalent are returned
    unchanged, the others are passed to the module-level ``lemmatizer``.

    Returns a list with one entry per input token, in the same order.
    """
    def _to_lemma(word, treebank_tag):
        wordnet_pos = nltk_pos_tagger(treebank_tag)
        if wordnet_pos is None:
            return word
        return lemmatizer.lemmatize(word, wordnet_pos)

    return [_to_lemma(word, tag) for word, tag in nltk.pos_tag(tokens)]
 
def stopWordRemoval(tokens):
    """Keep only tokens found in ``enWords`` that are not in ``stopWords``.

    Both collections are module-level objects built from NLTK corpora. The
    relative order of the surviving tokens is preserved.
    """
    return [word for word in tokens if word in enWords and word not in stopWords]

#Combine all steps to process text
def processText(df):
    """Run the full English cleaning pipeline over ``df['Text']``.

    For every document: lowercase / strip characters (lower_char_process),
    tokenize on ``\\w+``, lemmatize (lemmatized), drop stopwords and words not
    in the English word list (stopWordRemoval), and join the surviving tokens
    back into one space-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text' column of strings. Rows are visited in positional
        order, so the index does not have to be 0..n-1.

    Returns
    -------
    list of str
        One processed string per row of ``df`` (an empty list for an empty
        frame).
    """
    #The tokenizer does not depend on the row, so build it once
    tokenizer = RegexpTokenizer(r'\w+')
    textCol = []
    for text in df['Text']:
        #Lowercase and remove weird characters
        text = lower_char_process(text)
        #Tokenization of data
        tokens = tokenizer.tokenize(text)
        #Lemmatization of tokens
        tokens = lemmatized(tokens)
        #Remove stop words
        tokens = stopWordRemoval(tokens)
        #Joining the processed text as a WHOLE string again
        textCol.append(' '.join(tokens))
    return textCol

[nltk_data] Downloading package words to C:\Users\P
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\P
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#Spanish Processing w/ NLTK and Simplemma
#NLTK supplies the Spanish stopword list; lemmatization goes through Simplemma
#because, per the original author's note, NLTK offers no Spanish lemmatizer
stopWord_spn = stopwords.words('spanish')
simpLang = simplemma.load_data('es')

def stopWordSpn(tokens):
    """Drop Spanish stopwords and a few unwanted literal tokens.

    A token is discarded when it appears in ``stopWord_spn`` or is one of
    '\\n', 'adj' or 'f'. Surviving tokens keep their original order.
    """
    #Remove newline characters and 'adj'/'f'
    unwanted = ('\n', 'adj', 'f')
    return [
        word for word in tokens
        if not (word in stopWord_spn or word in unwanted)
    ]

def spanProcessText(df):
    """Run the Spanish cleaning pipeline over ``df['Text']``.

    For every document: remove the '*NUMBER*' placeholder, remove URLs, drop
    every character that is not an ASCII letter, a digit, a space, a newline,
    a period or one of the accented Spanish letters listed in the filter,
    lowercase, remove periods, tokenize on ``\\w+``, drop Spanish stopwords
    (stopWordSpn), lemmatize each token with Simplemma, and join the tokens
    back into one space-separated string.

    The character cleaning is done inline instead of through
    lower_char_process: that helper keeps ASCII letters only, so it would strip
    the accented letters this function deliberately preserves (e.g. 'ñ', 'á')
    and corrupt the words before stopword removal and lemmatization.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text' column of strings. Rows are visited in positional
        order, so the index does not have to be 0..n-1.

    Returns
    -------
    list of str
        One processed string per row of ``df`` (an empty list for an empty
        frame).
    """
    #The tokenizer does not depend on the row, so build it once
    tokenizer = RegexpTokenizer(r'\w+')
    textCol = []
    for text in df['Text']:
        #Some more text processing in spanish bc of how data was...
        text = text.replace('*NUMBER*', '')
        #URLs must go before the character filter: the filter deletes ':' and
        #'/', after which a URL can no longer be recognised
        text = re.sub(r'https?://\S+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'[^a-zA-Z0-9 \n.ÁáéÉíÍñÑÓóÚÜúü]', '', text)
        #Lowercase and drop periods while keeping the accented letters
        text = text.lower().replace('.', '')
        tokens = tokenizer.tokenize(text)
        #Stopword removal in Spanish
        tokens = stopWordSpn(tokens)
        #lemmatize....in spanish, verb+definite article can be used as one word so lemmatization breaks them up into root
        tokens = [simplemma.lemmatize(token, simpLang) for token in tokens]
        textCol.append(' '.join(tokens))
    return textCol

In [6]:
#Store processed text with other dataset info  for easier accessing
def store_ProcessedData(origDataset, processedText, filename):
    """Replace the 'Text' column with the processed text and write a CSV.

    Parameters
    ----------
    origDataset : pandas.DataFrame
        Dataset whose 'Text' column is overwritten IN PLACE; the caller's frame
        is modified.
    processedText : sequence of str
        Processed documents, one per row of ``origDataset`` and in the same row
        order. Values are assigned by position, so the frame's index labels do
        not matter; a length mismatch raises ValueError instead of silently
        producing NaN cells.
    filename : str or path-like (or any target accepted by DataFrame.to_csv)
        Destination of the CSV, written without the index column. When it is a
        path, missing parent directories are created first.
    """
    import os  #local import so the helper does not depend on the notebook's import cell

    #Positional assignment: a Series would be aligned on index labels instead
    origDataset['Text'] = list(processedText)

    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    origDataset.to_csv(filename,  index=False)

In [7]:
#Run the English pipeline and save the result where the model-fitting stage reads it
EN_PROCESSED_CSV = '../2. Model Fit, Train, Test/Processed Text-Dataset/Processed_EN_Data.csv'
processedText_EN = processText(en_data)
store_ProcessedData(en_data, processedText_EN, EN_PROCESSED_CSV)

In [8]:
#Run the Spanish pipeline and save the result where the model-fitting stage reads it
SPN_PROCESSED_CSV = '../2. Model Fit, Train, Test/Processed Text-Dataset/Processed_Spn_Data.csv'
processedText_SPN = spanProcessText(spn_data)
store_ProcessedData(spn_data, processedText_SPN, SPN_PROCESSED_CSV)