In [None]:
# Root of the data folder and the Kaggle sub-folder inside it.
# Both keep their trailing slash because later cells build paths by concatenation.
data_dir = "../data/"
kaggle_dir = f"{data_dir}Kaggle/"
print(kaggle_dir)

In [None]:
import pandas as pd
import nltk
import string
import re
import numpy as np

In [None]:
# Read the true-news articles straight from the zipped CSV, then preview the first rows.
kaggle_true_df = pd.read_csv(f"{kaggle_dir}True.csv.zip", compression="zip")
kaggle_true_df.head()

In [None]:
# (rows, columns) of the true-news frame, as a quick check on the load.
kaggle_true_df.shape

In [None]:
# Read the fake-news articles straight from the zipped CSV, then preview the first rows.
kaggle_fake_df = pd.read_csv(f"{kaggle_dir}Fake.csv.zip", compression="zip")
kaggle_fake_df.head()

In [None]:
# (rows, columns) of the fake-news frame, as a quick check on the load.
kaggle_fake_df.shape

In [None]:
def remove_unencoded_text(text):
    """
    Strips every non-ASCII character (code point 128 or above) from text.

    Characters are dropped, not replaced, so the result can be shorter than
    the input. ASCII characters, including control characters, are kept.
    """
    return "".join(char for char in text if ord(char) < 128)

def is_allowed_word(word, stopwords, min_word_len):
    """
    Tells whether a word should be kept: it must not be a stopword and must
    be at least min_word_len characters long.
    """
    if word in stopwords:
        return False
    return len(word) >= min_word_len

def preprocess(text, stopwords=set(nltk.corpus.stopwords.words("english")),
               stem=True, lemmatize=False, keep_alt_forms=False, min_word_len=1):
    """
    Standardized preprocessing of a line of text.

    Steps, in order: drop non-ASCII characters, drop ASCII punctuation,
    collapse whitespace runs to single spaces, lowercase, split on spaces,
    filter words with is_allowed_word, then optionally stem (Porter) and/or
    lemmatize (WordNet) the surviving words.

    Parameters:
        text: the string to clean.
        stopwords: collection of words to discard. The default is built once,
            when the function is defined, from NLTK's English stopword list.
        stem: if True, compute the Porter stem of each kept word.
        lemmatize: if True, compute the WordNet lemma of each kept word.
        keep_alt_forms: only used when stemming and/or lemmatizing. If True,
            each word contributes its original form followed by its distinct
            stem/lemma; if False, only the stem/lemma is returned (the lemma
            when both are requested).
        min_word_len: minimum length a word needs to be kept.

    Returns:
        The processed words joined by single spaces.

    Raises:
        IndexError: when stemming and/or lemmatizing with keep_alt_forms=False
            and no word survives filtering (e.g. empty or stopword-only text).
    """
    # strip non-ASCII characters, then ASCII punctuation
    text = remove_unencoded_text(text)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # normalize all whitespace to single spaces, then lowercase
    text = re.sub(r"\s+", " ", text)
    text = text.lower()

    # Filtering happens after punctuation is stripped, so a stopword entry
    # that itself contains punctuation can never match here.
    kept = [word for word in text.split(" ")
            if is_allowed_word(word, stopwords, min_word_len)]

    if not (stem or lemmatize):
        return " ".join(kept)

    # Build only the normalizers that were asked for; the order (stem, then
    # lemma) makes the lemma the last form when both are enabled.
    normalizers = []
    if stem:
        normalizers.append(nltk.stem.porter.PorterStemmer().stem)
    if lemmatize:
        normalizers.append(nltk.stem.WordNetLemmatizer().lemmatize)

    forms_per_word = [
        [word] + [normalize(word) for normalize in normalizers]
        for word in kept]

    if keep_alt_forms:
        # De-duplicate each word's forms while keeping a fixed order
        # (original, stem, lemma). A set would emit them in hash order,
        # which is not stable from one interpreter session to the next.
        return " ".join(
            form for forms in forms_per_word for form in dict.fromkeys(forms))

    if not forms_per_word:
        # NOTE(review): returning "" would match the other branches, but the
        # aggregation loop in this notebook relies on an exception here to
        # skip rows with no usable text, so the historical IndexError is kept.
        raise IndexError("preprocess: no words left after filtering")

    # return only the requested stem/lemma (the lemma when both were requested)
    return " ".join(forms[-1] for forms in forms_per_word)

In [None]:
# 1 - true news, 0 - fake news; the order matches the concatenation below.
labels = np.concatenate([np.ones(len(kaggle_true_df)), np.zeros(len(kaggle_fake_df))])

# DataFrame.append no longer exists in pandas >= 2.0; pd.concat is the replacement.
# The original row indices are kept (no ignore_index), as append did.
aggregate_df = pd.concat([kaggle_true_df, kaggle_fake_df])


def _preprocess_or_none(raw_text):
    """
    Returns the preprocessed text, or None when the cell holds no usable text.

    A cell is unusable when it is not a string (e.g. a missing value) or when
    nothing is left after preprocessing. preprocess signals the latter with an
    IndexError; an empty result is treated the same way.
    """
    if not isinstance(raw_text, str):
        return None
    try:
        cleaned = preprocess(raw_text)
    except IndexError:
        return None
    return cleaned if cleaned else None


# The first two columns are taken as title and body -- TODO confirm against the CSV header.
preprocessed_text = []
for raw_title, raw_body, label in zip(
        aggregate_df.iloc[:, 0], aggregate_df.iloc[:, 1], labels):
    title = _preprocess_or_none(raw_title)
    if title is None:
        continue  # No title
    body = _preprocess_or_none(raw_body)
    if body is None:
        continue  # No body
    preprocessed_text.append([title, body, label])

In [None]:
# Turn the collected [title, body, label] records into a labelled frame.
processed_dataframe = pd.DataFrame.from_records(
    data=preprocessed_text,
    columns=["Title", "Body", "Label"],
)

In [None]:
# Persist the preprocessed, labelled text next to the raw data.
processed_dataframe.to_csv(f"{data_dir}preprocessed_text_w_labels.csv")