# NLP for text classification

**Author**: Jonathan TRICARD

**Summary**: Using a dataset provided by scikit-learn, we build a DNN model to predict which topic category a text belongs to. Then, we try to explain the model's predictions.

**ExplainDL**: creates a file for each selected observation in the given path.

## Import libraries

In [None]:
import os
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from readml.logger import ROOT_DIR
from readml.explainers.dl.explain_dl import ExplainDL

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import pandas as pd

## Initialize the directories

We need to build the path where the results will be saved.

In [None]:
def initialize_directories_dl(out_path, dir_to_create):
    """Create the output directory tree used to store the explanations.

    Two locations are prepared:

    * every component of ``out_path`` that follows its first ``/`` is created
      under the parent directory of ``ROOT_DIR`` (for ``"../outputs/x"`` this
      is ``<parent of ROOT_DIR>/outputs/x``);
    * each name in ``dir_to_create`` is created under ``out_path``, with
      ``out_path`` resolved relative to ``ROOT_DIR``.

    Directories that already exist are left untouched.

    Parameters
    ----------
    out_path : str
        ``/``-separated output path containing at least one ``/``. Its first
        component (``..`` in this notebook) is dropped for the first step.
    dir_to_create : iterable of str
        Names of the sub-directories to create inside ``out_path``.

    Raises
    ------
    ValueError
        If ``out_path`` contains no ``/``.

    Notes
    -----
    The process working directory is ``ROOT_DIR`` when the function returns.
    """
    # Validate before touching the filesystem or the working directory.
    _, separator, remainder = out_path.partition("/")
    if not separator:
        raise ValueError(
            "out_path must contain at least one '/': {!r}".format(out_path)
        )

    # Single chdir, kept so callers still end up in ROOT_DIR; every path
    # below is computed explicitly instead of walking the tree with chdir.
    os.chdir(ROOT_DIR)
    root_dir = os.getcwd()
    parent_dir = os.path.dirname(root_dir)

    # exist_ok=True avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(os.path.join(parent_dir, *remainder.split("/")), exist_ok=True)

    for name in dir_to_create:
        os.makedirs(os.path.join(root_dir, out_path, name), exist_ok=True)
            
def create_dir_test():
    """Create the ``text`` folder under ``../outputs/notebooks/dl``."""
    initialize_directories_dl("../outputs/notebooks/dl", ["text"])

# Create the output folders, then keep the path of the "text" folder
# (built from ROOT_DIR) for the explainer configured further down.
create_dir_test()
output_path_text_dir = os.path.join(ROOT_DIR, "../outputs/notebooks/dl", "text")

## Import data

In [None]:
def create_text_data():
    """Load two 20-newsgroups categories and vectorize them as word counts.

    The texts of ``talk.religion.misc`` and ``sci.space`` are fetched with
    headers, footers and quotes removed. A ``CountVectorizer`` limited to
    20000 features (English stop words dropped) is fitted on the training
    texts only and then applied to both splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, df_train, df_test, vectorizer)``
        where ``X_*`` are dense DataFrames with one column per vocabulary
        term, ``y_*`` are the ``target`` arrays returned by the fetcher,
        ``df_*`` are ``X_*`` with the labels appended as a ``target_col``
        column, and ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    categories = [
        'talk.religion.misc',
        'sci.space',
    ]
    remove = ('headers', 'footers', 'quotes')
    data_train = fetch_20newsgroups(
        subset='train', categories=categories, shuffle=True,
        random_state=42, remove=remove,
    )
    data_test = fetch_20newsgroups(
        subset='test', categories=categories, shuffle=True,
        random_state=42, remove=remove,
    )

    vectorizer = CountVectorizer(max_features=20000, stop_words='english')
    train_counts = vectorizer.fit_transform(data_train['data'])
    test_counts = vectorizer.transform(data_test['data'])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on old releases that lack get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # toarray() yields a plain ndarray, whereas todense() yields np.matrix.
    X_train = pd.DataFrame(train_counts.toarray(), columns=feature_names)
    X_test = pd.DataFrame(test_counts.toarray(), columns=feature_names)
    y_train, y_test = data_train['target'], data_test['target']
    df_train = pd.concat([X_train, pd.Series(y_train, name="target_col")], axis=1)
    df_test = pd.concat([X_test, pd.Series(y_test, name="target_col")], axis=1)
    return X_train, X_test, y_train, y_test, df_train, df_test, vectorizer

In [None]:
# Build the train/test matrices, the labelled DataFrames and the fitted vectorizer.
X_train, X_test, y_train, y_test, df_train, df_test, vectorizer = create_text_data()

## Train model

In [None]:
def build_model_dnn_text(shape, n_classes, dropout=0.5, node=512, n_layers=4):
    """Build and compile a fully-connected classifier.

    The network is one input ``Dense`` layer followed by ``n_layers``
    further ``Dense`` layers (so ``n_layers + 1`` hidden layers in total),
    each with ``node`` ReLU units and each followed by ``Dropout``. A softmax
    layer with ``n_classes`` units produces the output.

    Parameters
    ----------
    shape : int
        Number of input features.
    n_classes : int
        Number of output classes.
    dropout : float, default 0.5
        Rate passed to every ``Dropout`` layer.
    node : int, default 512
        Number of units in every hidden ``Dense`` layer.
    n_layers : int, default 4
        Number of hidden ``Dense`` layers added after the input layer.

    Returns
    -------
    Sequential
        Model compiled with ``sparse_categorical_crossentropy`` loss, the
        ``adam`` optimizer and the ``accuracy`` metric.
    """
    model = Sequential()
    model.add(Dense(node, input_dim=shape, activation='relu'))
    model.add(Dropout(dropout))
    for _ in range(n_layers):
        # Only the first layer needs an input size; later layers take it
        # from the preceding layer's output.
        model.add(Dense(node, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Input size = number of feature columns; output size = number of distinct labels.
model = build_model_dnn_text(X_train.shape[1], n_classes = len(set(y_train)))
# Fit with Keras' default settings (no epochs or batch size are passed).
model.fit(X_train, y_train) 
# NOTE(review): `lr.score(X_test, y_test)` was left commented out here; `lr` is
# not defined in the visible cells - presumably a leftover from a
# LogisticRegression variant. Confirm and remove.

## Model intelligibility with readml

In [None]:
# Gather the inputs used by the explainer cells below.
model = model  # no-op self-assignment; the trained model is reused as is
out_path = output_path_text_dir  # "text" output folder built earlier
test_data = df_test  # features plus the "target_col" label column
target_col = "target_col"
word2idx = vectorizer.vocabulary_  # fitted vocabulary: term -> column index

In [None]:
# Instantiate the readml explainer with the trained model and the output folder.
# NOTE(review): presumably results are written under out_path - confirm in readml.
exp = ExplainDL(
        model = model,
        out_path = out_path,
    )

In [None]:
# Show the labels of the first five test rows (the same rows explained below).
df_test.target_col.head(5)

In [None]:
# Explain the first five test observations; word2idx is the vectorizer's
# term -> column-index mapping.
exp.explain_text(
    test_data = df_test.head(5),
    target_col = target_col,
    word2idx = word2idx,
)