In [None]:
import pandas as pd
import numpy as np

import json

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_lg

# Load the `en_core_web_lg` spaCy pipeline once; `nlp` is used as a module-level global by the lemmatizer below.
nlp = en_core_web_lg.load()

In [None]:
import re
import string

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV
from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import seaborn as sns
# Apply seaborn's default plotting theme.
sns.set()
from IPython.core.pylabtools import figsize
# Default figure size (width, height) for the plots that follow.
figsize(20, 20)

In [None]:
# Silence everything emitted through `warnings.warn` by swapping in a no-op.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (drop-in replacement for `warnings.warn`)."""
    return None


warnings.warn = warn

# Read and parse dataset

In [None]:
# Path to the raw dataset CSV, kept in one place so it is easy to change.
DATA_PATH = '../input/islamic-news/Data artikel islam true xls.csv'
df = pd.read_csv(DATA_PATH)

# Dataset investigation

## Samples Count

In [None]:
# Number of non-null values per column (a quick check for missing data).
df.count()

In [None]:
# Horizontal bar chart of how many rows fall into each `kategori` value.
# Title and axis labels are set so the figure is readable on its own.
ax = df['kategori'].value_counts().plot.barh()
ax.set_title('Samples per category')
ax.set_xlabel('Number of samples')
ax.set_ylabel('kategori');

# Dataset preprocessing and feature extraction

## Text preprocessing 

In order to extract features from raw text, we have to somehow preprocess it. Typical simplified text preprocessing workflow:

<img src="https://i.ibb.co/Prrpgxg/nlp-preproc.png" />

1. **Tokenization** - intelligent splitting of text into some kind of tokens (sentences, words, etc.)
2. **Cleaning** - cleaning text from all undesirable symbols or tokens or lines or whatever, it could be for example stop words or punctuation. 
3. **Stemming** - the process of reducing inflected words to their root forms, i.e. mapping a group of words to the same stem even if the stem itself is not a valid word in the language. [reference](https://www.datacamp.com/community/tutorials/stemming-lemmatization-python)
4. **Lemmatization** - unlike stemming, reduces the inflected words properly, ensuring that the root word belongs to the language. In lemmatization the root word is called a lemma. A lemma (plural lemmas or lemmata) is the canonical form, dictionary form, or citation form of a set of words. [reference](https://www.datacamp.com/community/tutorials/stemming-lemmatization-python)
5. **Feature extraction** - vectorization of the output stems/lemmas, for example by counting term occurrences in each document or by using TF-IDF.

In our case I have chosen such workflow:
1. **Tokenization** - using [spaCy](https://spacy.io/) python library
2. **Cleaning** - after investigating the content of the raw emails, I decided on the following steps:
    1. Delete symbols ```!"#%&\'*+,-<=>?[\\]^_`{|}~```
    2. Delete lines, which begins with `From:` or end with `writes:`, autogenerated content by email host.
    3. Delete `email strings`, `From:`, `Re:`, `Subject:`
    4. Delete numbers
3. **Lemmatization** - I have chosen lemmatization (`spaCy`) in favor of stemming (`nltk`), because it has shown the ability to generate more robust results. I've used the [spaCy](https://spacy.io/) python library. 
4. **Feature extraction** - TF-IDF

In [None]:
import re
import string

from sklearn.base import TransformerMixin

class TextPreprocessor(TransformerMixin):
    """Scikit-learn style transformer that cleans and lemmatizes one text column.

    Every value of the configured column goes through
    ``_clean`` -> ``_leave_letters_only`` -> ``_lemmatize``.

    Parameters
    ----------
    text_attribute : str
        Name of the DataFrame column that holds the raw text.

    Notes
    -----
    Lemmatization uses the module-level spaCy pipeline ``nlp``, which must be
    loaded before ``transform`` is called.
    """

    # Symbols stripped before the line- and regex-based filtering.
    _BAD_SYMBOLS = '!"#%&\'*+,-<=>?[\\]^_`{|}~'
    _EMAIL_REGEX = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
    _REGEXES_TO_REMOVE = (_EMAIL_REGEX, r'Subject:', r'Re:')
    # Part-of-speech tags whose tokens are dropped during lemmatization.
    _SKIPPED_POS = ('PUNCT', 'PART', 'X')

    def __init__(self, text_attribute):
        self.text_attribute = text_attribute
        
    def transform(self, X, *_):
        """Return a copy of ``X`` whose text column has been preprocessed.

        ``X`` itself is not modified.
        """
        X_copy = X.copy()
        X_copy[self.text_attribute] = X_copy[self.text_attribute].apply(self._preprocess_text)
        return X_copy
    
    def _preprocess_text(self, text):
        """Run the full clean -> letters-only -> lemmatize chain on one value.

        Missing values (``None`` / float NaN, as produced by pandas for empty
        cells) yield an empty string instead of raising; any other non-string
        value is converted with ``str`` first.
        """
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        if not isinstance(text, str):
            text = str(text)
        return self._lemmatize(self._leave_letters_only(self._clean(text)))
    
    def _clean(self, text):
        """Strip unwanted symbols, quoted-header lines, e-mail addresses and mail prefixes."""
        text_without_symbols = text.translate(str.maketrans('', '', self._BAD_SYMBOLS))

        # Drop lines that start with "from:" or end with "writes:" (case-insensitive).
        # Each kept line is re-terminated with a newline, so the result always
        # ends with '\n'.
        kept_lines = []
        for line in text_without_symbols.split('\n'):
            lowered = line.lower()
            if lowered.startswith('from:') or lowered.endswith('writes:'):
                continue
            kept_lines.append(line + '\n')
        clean_text = ''.join(kept_lines)

        for pattern in self._REGEXES_TO_REMOVE:
            clean_text = re.sub(pattern, '', clean_text)

        return clean_text
    
    def _leave_letters_only(self, text):
        """Remove punctuation, then keep only runs of ASCII letters joined by single spaces."""
        text_without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
        return ' '.join(re.findall("[a-zA-Z]+", text_without_punctuation))
    
    def _lemmatize(self, text):
        """Return the space-joined lemmas of all tokens that are neither stop words nor skipped POS tags."""
        doc = nlp(text)
        words = [token.lemma_ for token in doc
                 if not token.is_stop and token.pos_ not in self._SKIPPED_POS]
        return ' '.join(words)
    
    def fit(self, *_):
        """No-op: the transformer is stateless. Returns ``self`` for chaining."""
        return self

In [None]:
# Clean and lemmatize the `artikel` column. `transform` works on a copy, so `df` keeps the raw text.
text_preprocessor = TextPreprocessor(text_attribute='artikel')
df_preprocessed = text_preprocessor.transform(df)

### Feature extraction

To train the TF-IDF vectorizer we have to split our dataset into `train`/`test` parts; I have chosen a typical `70`/`30` ratio.

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 split (and every metric computed from it) is reproducible across runs.
train, test = train_test_split(df_preprocessed, test_size=0.3, random_state=42)

To make the **TF-IDF** vectorizer effective, we have to specify a rather large vocabulary (`max_features`). This leads to a very high dataset dimensionality, which usually causes RAM or computation-time issues at the model training step. Therefore, `sklearn.feature_extraction.text.TfidfVectorizer` returns a sparse matrix as its output, which is much more memory-efficient and faster for classifier models that are able to deal with sparse matrices. However, some of the further preprocessing steps cannot work with sparse input, so those steps will be very memory-consuming. Because of that I have chosen a vocabulary of only `10 000` words.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 10,000 terms, as stated in the text above, to bound the feature dimensionality.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Fit on the training split only, then apply the learned vocabulary/IDF weights to the test split.
X_tfidf_train = tfidf_vectorizer.fit_transform(train['artikel'])
X_tfidf_test = tfidf_vectorizer.transform(test['artikel'])

print(X_tfidf_train.shape)

In [None]:
# Targets: the `kategori` label of each split.
y = train['kategori']
y_test = test['kategori']

# The original printed a literal empty dict (`{}`) in front of each shape.
print('Train labels shape: {}'.format(y.shape))
print('Test labels shape: {}'.format(y_test.shape))

# Features: the TF-IDF matrices computed for each split.
X = X_tfidf_train
x_test = X_tfidf_test

# Model training and evaluation

I've tried more than 10 different model instances, with a large batch of different hyperparameters. The experiments were run using `sklearn.model_selection.GridSearchCV` with 5-fold cross-validation. These experiments ran for more than 2 days non-stop on my PC, and here I want to show the condensed results of 5 models, their evaluation, and the final selection.

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [None]:
# this snippet was taken from https://gist.github.com/shaypal5/94c53d765083101efc0240d776a23823

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def print_confusion_matrix(confusion_matrix, 
                           class_names, 
                           figsize = (15,15), 
                           fontsize=12,
                           ylabel='True label',
                           xlabel='Predicted label'):
    """Draw a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.
    
    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix. 
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (15,15).
    fontsize: int
        Font size for the tick labels. Defaults to 12.
    ylabel: str
        Label of the vertical axis. Defaults to 'True label'.
    xlabel: str
        Label of the horizontal axis. Defaults to 'Predicted label'.
        
    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure. When calling this as the last
        statement of a notebook cell, end the line with ``;`` to avoid the
        figure being rendered twice.

    Raises
    ------
    ValueError
        If the matrix values cannot be rendered with the integer format.
    """
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names, 
    )
    fig, ax = plt.subplots(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d", ax=ax)
    except ValueError as err:
        # Do not leave an empty figure behind when drawing fails.
        plt.close(fig)
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    return fig

In [None]:
def evaluate_model(model, X, y, X_test, y_test, target_names=None):
    """Print cross-validated accuracy, a classification report and a confusion matrix.

    Arguments
    ---------
    model:
        An already fitted classifier; it is used directly for ``predict`` and
        passed to ``cross_val_score`` for the accuracy estimates.
    X, y:
        Training features and labels, used for 5-fold cross-validation.
    X_test, y_test:
        Held-out features and labels, used for the report and the confusion matrix.
    target_names: sequence of str, optional
        Display names for the classes, in the order of ``model.classes_``.
        Defaults to ``model.classes_``.
    """
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    # NOTE(review): this cross-validates on the *test* split, so "Accuracy test"
    # is not the accuracy of the fitted `model` on held-out data. Kept as in the
    # original; consider replacing it with a plain accuracy of `model.predict(X_test)`.
    scores_test = cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy')
    
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    print("Accuracy test: %0.2f (+/- %0.2f)" % (scores_test.mean(), scores_test.std()))
    
    print("Test classification report: ")
    # Pin the label order to the model's classes so that row/column names always
    # line up with the matrix, even when the test split lacks one of the classes.
    labels = getattr(model, 'classes_', None)
    if target_names is None:
        target_names = model.classes_
    # Predict once and reuse the result for both the report and the matrix.
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))
    print("Test confusion matrix: ")
    print_confusion_matrix(confusion_matrix(y_test, y_pred, labels=labels), class_names=target_names)

### Multinomial Naive Bayes

For `MultinomialNB` we see that the most misclassified groups are `alt.atheism`, `talk.politics.misc` and `talk.religion.misc`. A lot of `talk.religion.misc` samples were classified into the very close group `soc.religion.christian`. This dataset probably contains mostly Republican-leaning mails, given that so many `talk.politics.misc` samples were classified as `talk.politics.guns`.

In [None]:
mb = MultinomialNB()
# Train and evaluate on the TF-IDF matrices and labels prepared earlier in the notebook.
# The previously used `X_selected`, `y_resampled` and `X_test_selected` are not defined
# anywhere in the visible notebook, so the cell failed on a fresh kernel.
mb.fit(X, y)
evaluate_model(mb, X, y, x_test, y_test)