In [1]:
import re
import string
import time
from typing import Set, List

import warnings
from abc import ABC, abstractmethod
from enum import Enum

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

warnings.filterwarnings("ignore")

#### PART 1:  Cleaner functionality
The following methods are designed to cleanup different parts of a given input text. The `clean_text`-method combines all the necessasy mehtods to cleanup the text as much as possible. 

In [2]:

def get_all_punctuations() -> Set[str]:
    stop = set(stopwords.words("english"))
    all_punctuations = list(string.punctuation)
    stop.update(all_punctuations)
    return stop


def strip_html(text: str) -> str:
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def remove_urls(text: str) -> str:
    return re.sub(r'http\S+', '', text)


def remove_between_square_brackets(text: str) -> str:
    return re.sub('\[[^]]*\]', '', text)


def remove_numbers(text: str) -> str:
    return re.sub('\w*\d\w*', '', text)


def remove_stopwords(text: str) -> str:
    _punctuations = ['\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '/', '“', '”', '’', '‘', '>', '@', '#', '+',
                     '-', '--', '?', '%', '#', '£', '.', ':', ';', ',', '!', '$', '\'']
    stops = get_all_punctuations()
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stops:
            for ch in _punctuations:
                if ch in i:
                    i = i.replace(ch, " ")
            final_text.append(i.strip())
    return " ".join(final_text)



def remove_accent_signs(text: str) -> str:
    return re.sub('[‘’“”…]', '', text)


def remove_new_lines(text: str) -> str:
    return re.sub('\n', '', text)


def tokenize(text: str) -> List[str]:
    split = re.split("\W+", text)
    return split


def lemmatizer(text: List[str]) -> List[str]:
    wl = WordNetLemmatizer()
    return [wl.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]


def clean_text(text: str) -> str:
    text = strip_html(text)
    text = text.lower()
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    text = remove_numbers(text)
    text = remove_accent_signs(text)
    text = remove_new_lines(text)
    text = tokenize(text)
    text = lemmatizer(text)
    return ' '.join(i for i in text)


#### PART 2: Train test split
The class below encapsulates and keeps track of train and test parts of the text and categories. We split the data 80/20 between train and test model, where 80% goes to train the model and use the remaining 20% to test the trained models.

In [3]:
class TrainTestSplit:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        text = self.df.text
        category = self.df.category
        self.text_train, self.text_test, self.category_train, self.category_test = train_test_split(text,
                                                                                                    category,
                                                                                                    train_size=.8,
                                                                                                    stratify=category,
                                                                                                    random_state=19)

#### PART 3: Vectorization
 In this project, two different vectorization techniques are utilized. Therefore, an enum/enumeration-class is constructed to keep track of the type of vectorization. In addition, a `Vectorize`-class is created to build an object of the given vectorizer type, which is used to convert train data into numerical numbers representing the frequency of each token.

In [4]:
class VectorizerType(Enum):
    Tfidf = 0
    Count = 1


class Vectorize:
    def __init__(self, split: TrainTestSplit, vectorizer_type: VectorizerType):
        self.split = split

        if vectorizer_type == VectorizerType.Tfidf:
            self._vectorizer = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1, 3))
        elif vectorizer_type is VectorizerType.Count:
            self._vectorizer = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1, 3))
        else:
            raise RuntimeError("Unable to determine the vectorizer type.")

        self.train = self._vectorizer.fit_transform(self.split.text_train)
        self.test = self._vectorizer.transform(self.split.text_test)

        msg = "TF-IDF" if vectorizer_type == VectorizerType.Tfidf else "Count"
        print(f"{msg} Train: {self.train.shape}")
        print(f"{msg} Test: {self.test.shape}")

#### PART 4: Text processor
Given the input datasets, the class below starts the orchestriation of all the necassery objects, such as the dataframe, vectorizer, and train-test-split. In addition, it also handles the cleanup of the dataframe. 

In [5]:
class TextProcessor:
    def __init__(self, true_news: str, fake_news: str, vectorizer_type: VectorizerType, sample_size: int = 1000):
        self._df = None
        self.true_news = pd.read_csv(true_news)
        self.fake_news = pd.read_csv(fake_news)
        self.vectorizer_type = vectorizer_type
        self.sample_size = sample_size

        self._split = None
        self._vectorize = None

    def create_data_frame(self) -> pd.DataFrame:
        self.true_news["category"] = 1
        self.fake_news["category"] = 0

        df = pd.concat([self.true_news, self.fake_news], axis=0)
        df = df.drop(["title", "subject", "date"], axis=1)

        df.reset_index(inplace=True)
        df.drop(["index"], axis=1, inplace=True)

        return df.sample(frac=1).head(self.sample_size)

    @property
    def df(self) -> pd.DataFrame:
        if self._df is None:
            self._df = self.create_data_frame()
        return self._df

    def cleanup_text(self) -> None:
        self.df['text'] = self.df['text'].apply(clean_text)

    def pre_processor(self) -> None:
        self.create_data_frame()
        self.cleanup_text()

    @property
    def split(self) -> TrainTestSplit:
        if self._split is None:
            self._split = TrainTestSplit(self.df)
        return self._split

    @property
    def vectorize(self):
        if self._vectorize is None:
            self._vectorize = Vectorize(split=self.split, vectorizer_type=self.vectorizer_type)
        return self._vectorize

#### PART 5: Model
`ModelType`: an enumuration class containing all the supported models in this project.

The `create_model`-method is a helper method which is used to instantiate a model object based on the input model type. So e.g., if the model type is `ModelType.nb`, then it creates and returns an object of `MultinomialNB` class.

The `Model`-class inherets all the functionality of `TextProcessor`-class. Meaning it is able to cleanup the input text, split the input into traning and testing parts, as well as, vectorizing the text. The `Model`-class uses the `create_model` to get the appropriate object representing the requested input model type. Once the model object is constructed, it's capable of performing a traning on the dataset and predict based on the test data input. The `Model`-class has also the funtionality to measure the accuracy score, as well as generating a classification report.

In [6]:
class ModelType(Enum):
    nb = "MultinomialNBModel"
    lr = "LogisticRegressionModel"
    pa = "PassiveAggressiveClassifier"
    dt = "DecisionTreeClassifier"


def create_model(model_type: ModelType):
    if model_type == ModelType.nb:
        return MultinomialNB()
    elif model_type == ModelType.lr:
        return LogisticRegression()
    elif model_type == ModelType.pa:
        return PassiveAggressiveClassifier(C=0.5, random_state=5)
    elif model_type == ModelType.dt:
        return DecisionTreeClassifier()


class Model(TextProcessor):
    def __init__(self, true_news: str, fake_news: str, vectorizer_type: VectorizerType, model_type: ModelType,
                 sample_size: int = 1000):
        super().__init__(true_news, fake_news, vectorizer_type, sample_size)
        self.model_type = model_type
        self.model = create_model(self.model_type)

        self._score = None

    def fit(self):
        self.model.fit(self.vectorize.train, self.split.category_train)

    def predict(self):
        return self.model.predict(self.vectorize.test)

    @property
    def accuracy_score(self):
        if self._score is None:
            self._score = accuracy_score(self.split.category_test, self.predict())
        return self._score
    
    def print_accuracy_score(self) -> None:
        msg = "TF-IDF" if self.vectorizer_type == VectorizerType.Tfidf else "Count"
        print(f"{self.model_type.value} {msg} accuracy score:  {round(self.accuracy_score * 100, 2)}%")

    def print_classification_report(self) -> None:
        report = classification_report(self.split.category_test, self.predict(), target_names=['0', '1'])
        print(report)

    def run(self):
        self.pre_processor()  # cleans the text
        self.fit() # training the model

        self.print_accuracy_score()
        self.print_classification_report()


#### PART 6: Orchestration and running the model
Below, we have a helper method, which takes input fake and true news files, as well as a sample size and an input model type. The method is used to run the specified model.

In [7]:
def run_model(true_news: str, fake_news: str, sample_size: int, model_type: ModelType):
    vectorizer_types = [VectorizerType.Tfidf, VectorizerType.Count]
    for vectorizer_type in vectorizer_types:
        tic = time.perf_counter()
        m = Model(true_news=true_news, fake_news=fake_news, vectorizer_type=vectorizer_type,
                  model_type=model_type, sample_size=sample_size)
        m.run()
        print(f"Execution time for {float(time.perf_counter()-tic):0.2f}")
        print()
        print("------------------------------------------------------")

In [8]:
# Path to input files:
true_news_file = 'True.csv'
fake_news_file = 'Fake.csv'

#### MultinominalNBModel
Runs the Naïve Bayse model

In [9]:
sample_size = 100
model_type = ModelType.nb

run_model(true_news=true_news_file, fake_news=fake_news_file, sample_size=sample_size, model_type=model_type)

TF-IDF Train: (80, 36907)
TF-IDF Test: (20, 36907)
MultinomialNBModel TF-IDF accuracy score:  55.0%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.55      1.00      0.71        11

    accuracy                           0.55        20
   macro avg       0.28      0.50      0.35        20
weighted avg       0.30      0.55      0.39        20

Execution time for 10.41

------------------------------------------------------
Count Train: (80, 37420)
Count Test: (20, 37420)
MultinomialNBModel Count accuracy score:  70.0%
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         9
           1       0.73      0.73      0.73        11

    accuracy                           0.70        20
   macro avg       0.70      0.70      0.70        20
weighted avg       0.70      0.70      0.70        20

Execution time for 7.76

-------------------------------------------

#### Logistic regression Model
Runs the Logistic regression model

In [10]:
sample_size = 100
model_type = ModelType.lr

run_model(true_news=true_news_file, fake_news=fake_news_file, sample_size=sample_size, model_type=model_type)

TF-IDF Train: (80, 34391)
TF-IDF Test: (20, 34391)
LogisticRegressionModel TF-IDF accuracy score:  55.0%
              precision    recall  f1-score   support

           0       0.55      1.00      0.71        11
           1       0.00      0.00      0.00         9

    accuracy                           0.55        20
   macro avg       0.28      0.50      0.35        20
weighted avg       0.30      0.55      0.39        20

Execution time for 7.45

------------------------------------------------------
Count Train: (80, 36093)
Count Test: (20, 36093)
LogisticRegressionModel Count accuracy score:  55.0%
              precision    recall  f1-score   support

           0       0.53      1.00      0.69        10
           1       1.00      0.10      0.18        10

    accuracy                           0.55        20
   macro avg       0.76      0.55      0.44        20
weighted avg       0.76      0.55      0.44        20

Execution time for 8.29

----------------------------------

#### Passive Aggressive Classifier Model
Runs the PassiveAggressiveClassifier model

In [11]:
sample_size = 100
model_type = ModelType.pa

run_model(true_news=true_news_file, fake_news=fake_news_file, sample_size=sample_size, model_type=model_type)

TF-IDF Train: (80, 35623)
TF-IDF Test: (20, 35623)
PassiveAggressiveClassifier TF-IDF accuracy score:  65.0%
              precision    recall  f1-score   support

           0       0.65      0.92      0.76        12
           1       0.67      0.25      0.36         8

    accuracy                           0.65        20
   macro avg       0.66      0.58      0.56        20
weighted avg       0.65      0.65      0.60        20

Execution time for 8.40

------------------------------------------------------
Count Train: (80, 41981)
Count Test: (20, 41981)
PassiveAggressiveClassifier Count accuracy score:  75.0%
              precision    recall  f1-score   support

           0       0.75      0.82      0.78        11
           1       0.75      0.67      0.71         9

    accuracy                           0.75        20
   macro avg       0.75      0.74      0.74        20
weighted avg       0.75      0.75      0.75        20

Execution time for 9.40

--------------------------

#### Decision Tree Classifier Model
Runs the DecisionTreeClassifier model

In [12]:
sample_size = 100
model_type = ModelType.dt

run_model(true_news=true_news_file, fake_news=fake_news_file, sample_size=sample_size, model_type=model_type)

TF-IDF Train: (80, 34937)
TF-IDF Test: (20, 34937)
DecisionTreeClassifier TF-IDF accuracy score:  60.0%
              precision    recall  f1-score   support

           0       0.58      1.00      0.73        11
           1       1.00      0.11      0.20         9

    accuracy                           0.60        20
   macro avg       0.79      0.56      0.47        20
weighted avg       0.77      0.60      0.49        20

Execution time for 7.84

------------------------------------------------------
Count Train: (80, 42207)
Count Test: (20, 42207)
DecisionTreeClassifier Count accuracy score:  55.0%
              precision    recall  f1-score   support

           0       0.53      1.00      0.69        10
           1       1.00      0.10      0.18        10

    accuracy                           0.55        20
   macro avg       0.76      0.55      0.44        20
weighted avg       0.76      0.55      0.44        20

Execution time for 8.58

------------------------------------