In [9]:
import numpy as np
import pandas as pd 
import string
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package punkt to /Users/neeraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# Load the train and test splits of the FNC-1 data into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/fnc-1/final_train.csv',
        '../data/fnc-1/final_test.csv',
    )
)

In [11]:
# Clean the datasets 
def clean_data(text):
    """Normalize one raw text cell to lowercase ASCII letters, digits and whitespace.

    The markup-aware steps run *before* the generic character filter, because
    that filter deletes the very characters (``<``, ``>``, ``/``, ``[``, ``]``,
    punctuation) those steps need to see.

    Steps, in order:
      1. ``None`` / NaN become ``''``; any other value is converted with ``str()``.
      2. Leading and trailing whitespace is stripped.
      3. A square-bracketed span that is followed by a punctuation mark and a
         space is dropped together with that mark and space.
      4. A doubled ``<br />`` break element is replaced by one space so the
         words on either side stay separate.
      5. Every character that is not an ASCII letter, a digit or whitespace is
         deleted (this covers all of ``string.punctuation``, including ``-``
         and ``/``). Digits are kept.
      6. The result is lowercased.

    Args:
        text: Raw cell value, normally a ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    # A missing cell shows up as None or float NaN (NaN is the only float
    # that is not equal to itself); there is nothing to clean in that case.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).strip()
    # Drop bracketed spans followed by punctuation and a space
    text = re.sub(r'\[[^]]*\][.;:!\'?,\"()\[\]] ', '', text)
    # Turn doubled break elements into a word separator
    text = re.sub(r'<br\s*/><br\s*/>', ' ', text)
    # Remove special characters and punctuation. The range is A-Z, not A-z:
    # A-z would also let [ \ ] ^ _ and ` through.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert the text to lowercase
    return text.lower()

# Clean the heading and body columns of the training set, then of the test set
for frame in (train_data, test_data):
    for column in ('articleHeading', 'articleBody'):
        frame[column] = frame[column].apply(clean_data)

In [12]:
# Split every heading and body into word tokens: training data first,
# then testing data
for dataset in (train_data, test_data):
    for field in ('articleHeading', 'articleBody'):
        dataset[field] = dataset[field].apply(word_tokenize)

In [13]:
# Method to remove stopwords from the dataframe
# Build the English stop-word set once. The stopwords corpus is a separate
# NLTK data package; fetch it on demand if this machine does not have it yet.
try:
    stop_words = set(stopwords.words("english"))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words("english"))


def remove_stopwords(text):
    """Drop stop words from a token sequence and join the rest into one string.

    Args:
        text: Iterable of word tokens (strings).

    Returns:
        The tokens that are not in ``stop_words``, joined by single spaces.
    """
    return " ".join(word for word in text if word not in stop_words)

# Strip stop words from the article headings and bodies of the train data,
# then of the test data
for table in (train_data, test_data):
    for col in ('articleHeading', 'articleBody'):
        table[col] = table[col].apply(remove_stopwords)

In [14]:
# Stemming reduces each word to its word stem. The Porter algorithm is the
# default; pass a different stemmer (for example LancasterStemmer()) to switch.
def perform_stemming(text, stemmer=None):
    """Stem every whitespace-separated word of ``text``.

    Args:
        text: String of words separated by whitespace.
        stemmer: Object exposing ``stem(word)``. When omitted, a new
            ``PorterStemmer`` is used.

    Returns:
        The stemmed words joined by single spaces.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in text.split())

# Run perform_stemming over the heading and body columns of the train data,
# then of the test data
for split in (train_data, test_data):
    for text_column in ('articleHeading', 'articleBody'):
        split[text_column] = split[text_column].apply(perform_stemming)

In [15]:
# Lemmatization to reduce inflectional forms to a common base form
def perform_lemmatization(text, lemmatizer=None):
    """Lemmatize every whitespace-separated word of ``text``.

    Args:
        text: String of words separated by whitespace.
        lemmatizer: Object exposing ``lemmatize(word)``. When omitted, a new
            ``WordNetLemmatizer`` is used.

    Returns:
        The lemmatized words joined by single spaces.

    Raises:
        LookupError: If the lemmatizer's data is still unavailable after a
            download attempt.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    words = text.split()
    try:
        lemmas = [lemmatizer.lemmatize(word) for word in words]
    except LookupError:
        # The WordNet corpus is a separate NLTK data package that is only
        # looked up on first use; fetch it once and retry.
        nltk.download('wordnet')
        lemmas = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmas)

# Lemmatize the heading and body columns of the train data, then of the test data
for corpus in (train_data, test_data):
    for name in ('articleHeading', 'articleBody'):
        corpus[name] = corpus[name].apply(perform_lemmatization)


In [16]:
# Write the preprocessed frames back to disk as CSV, without the index column
for frame, destination in (
    (train_data, '../data/fnc-1/preprocess_train.csv'),
    (test_data, '../data/fnc-1/preprocess_test.csv'),
):
    frame.to_csv(destination, index=False)