<a href="https://colab.research.google.com/github/mkaramib/MachineLearning/blob/main/QuestionClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Question Classification
In this notebook, I will implement a question classifier using the Trax deep learning framework. 

In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from unicodedata import normalize
import re
from nltk.corpus import stopwords
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier

# Download required packages.

In [2]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer data
# and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')
# English stop words kept as a set; tokenize() filters its tokens against it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Import
In this section, all the training and testing questions are read.


*   Train Data: contains 1000/2000/3000 questions in each file.
*   Test Data: contains close to 500 questions to evaluate the trained model.

# Question format
In each file (train and test), each line contains one question in the following format:


> QuestionCategory: Question content.






# Tokenize function

In [3]:
def tokenize(question):
    """
    Split one labelled line into its question category and its content tokens.

    The line is expected to look like ``QuestionCategory: Question content``.
    Everything before the first colon is the category. The remainder is
    NFKC-normalized, stripped of every character other than ASCII letters,
    '.' and ' ', tokenized with ``word_tokenize`` and filtered against the
    module-level ``stop_words`` set.

    :param question: one raw line of a question file
    :return: (category, tokens); the category is '' when the line contains no
             colon, in which case the whole line is treated as content
    """
    # str.partition is used instead of str.find + slicing: find() returns -1 for
    # a colon-less line, and slicing with -1 would put everything but the last
    # character into the category and only the last character into the content.
    q_cat, separator, content = question.partition(':')
    if not separator:
        q_cat, content = '', question

    content_normalized = normalize('NFKC', content)
    content_normalized = re.sub("[^a-zA-Z. ]", "", content_normalized)
    terms_all = word_tokenize(content_normalized)

    # remove the stop words; the comparison is case-sensitive, so a token only
    # matches when its casing equals the entry in stop_words
    terms = [w for w in terms_all if w not in stop_words]
    return q_cat, terms

# Load and preprocess questions

In [4]:
def read_questions(file):
    """
    Read a question file and tokenize every non-blank line.

    :param file: path of a text file (decoded as ISO-8859-1) holding one
                 labelled question per line
    :return: list of (question category, question tokens), in file order
    """
    questions = []

    # The context manager closes the handle even if tokenizing a line raises.
    with open(file, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            # A blank line carries no label; tokenizing it would add an
            # empty-category record to the data set.
            if not line.strip():
                continue
            cat, terms = tokenize(line)
            questions.append((cat, terms))

    return questions

# Build TF-IDF training matrix

In [5]:
def build_tfidf_train_matrix(questions, stopword=False):
    """
    Build the TF-IDF training matrix and the label vector.

    Every distinct token becomes one column and every distinct category one
    integer label. Both vocabularies are sorted, so column and label numbering
    is the same on every run.

    :param questions: list of (category, tokens) pairs
    :param stopword: accepted for interface compatibility; not used here
    :return: X: 2D array (len(questions) x number of features) of
             term count * IDF, where IDF = log((m + 1) / (DF + 1)) + 1;
             Y: list of label indexes into unique_cats;
             f_set: sorted list of features (column order of X);
             unique_cats: sorted list of categories
    """
    m = len(questions)   # number of questions

    # Sorted rather than list(set(...)): set iteration order of strings is not
    # guaranteed to be the same from one interpreter run to the next.
    unique_cats = sorted({cat for (cat, _) in questions})
    f_set = sorted({t for (_, terms) in questions for t in terms})

    # Position lookups in O(1) instead of list.index inside the loops.
    cat_position = {cat: k for k, cat in enumerate(unique_cats)}
    term_position = {t: j for j, t in enumerate(f_set)}

    # build the Y
    Y = [cat_position[cat] for (cat, _) in questions]

    # build the X (raw term counts first)
    X = np.zeros((m, len(f_set)))
    for i, (_, terms) in enumerate(questions):
        for t in terms:
            X[i, term_position[t]] += 1

    # calc the DF (document frequency) of terms and the smoothed IDF
    DF = np.count_nonzero(X, axis=0)
    IDF = np.log((m + 1) / (DF + 1)) + 1

    # scale every column by its IDF weight (broadcast over the rows)
    X *= IDF

    return X, Y, f_set, unique_cats

# Build TF-IDF test matrix

In [6]:
def build_tfidf_test_matrix(questions, features, cats, stopword=False, idf=None):
    """
    Build the TF-IDF matrix and label vector for a set of test questions.

    Columns follow ``features`` and labels are indexes into ``cats``. Tokens
    that are not in ``features`` are ignored.

    :param questions: list of (category, tokens) pairs
    :param features: list of features defining the column order
    :param cats: list of categories defining the label indexes
    :param stopword: accepted for interface compatibility; not used here
    :param idf: optional sequence of one IDF weight per feature. When omitted,
                the weights are computed from the document frequencies of
                ``questions`` themselves as log((m + 1) / (DF + 1)) + 1.
                Pass weights computed elsewhere (for example from the training
                questions) to scale both matrices identically.
    :return: X: 2D array (len(questions) x len(features)), Y: list of label indexes
    :raises ValueError: if a question's category is not present in ``cats``
    """
    # len of test samples
    m = len(questions)

    # Position lookups in O(1) instead of scanning the lists for every token.
    # setdefault keeps the first position of a duplicate entry, like list.index.
    feature_position = {}
    for j, t in enumerate(features):
        feature_position.setdefault(t, j)
    cat_position = {}
    for k, c in enumerate(cats):
        cat_position.setdefault(c, k)

    # build the Y
    Y = []

    # build the X (raw term counts first)
    X = np.zeros((m, len(features)))
    for i, (cat, terms) in enumerate(questions):
        if cat not in cat_position:
            raise ValueError(f"unknown question category {cat!r}: not present in cats")
        Y.append(cat_position[cat])
        for t in terms:
            j = feature_position.get(t)
            if j is not None:
                X[i, j] += 1

    if idf is None:
        # calc the DF (document frequency) of terms within these questions
        DF = np.count_nonzero(X, axis=0)
        IDF = np.log((m + 1) / (DF + 1)) + 1
    else:
        IDF = np.asarray(idf, dtype=float)

    # scale every column by its IDF weight (broadcast over the rows)
    X = X * IDF

    return X, Y

# Scikit-learn Classifier functions

In [7]:
def mlp(X_train, Y_train, X_test, Y_test, d_input, d_output):
    """
    Fit a scikit-learn MLPClassifier and print its score on both data sets.

    :param X_train: training feature matrix
    :param Y_train: training labels
    :param X_test: test feature matrix
    :param Y_test: test labels
    :param d_input: passed as the first entry of hidden_layer_sizes
    :param d_output: passed as the second entry of hidden_layer_sizes
    :return: None (results are printed)
    """
    # NOTE(review): hidden_layer_sizes describes the hidden layers only, so this
    # creates two hidden layers of d_input and d_output units - confirm that
    # this is the intended architecture.
    network = MLPClassifier(
        solver='lbfgs',
        alpha=0.01,
        hidden_layer_sizes=(d_input, d_output),
        random_state=1,
    ).fit(X_train, Y_train)

    # train is scored and reported before test
    for split, features, labels in (("train", X_train, Y_train), ("test", X_test, Y_test)):
        score = round(network.score(features, labels), 4)
        print(f"Sklearn Neural Network - {split} performance = {score}")


def logistic_regression(X_train, Y_train, X_test, Y_test):
    """
    Fit a one-vs-rest logistic regression and print its score on both data sets.

    :param X_train: training feature matrix
    :param Y_train: training labels
    :param X_test: test feature matrix
    :param Y_test: test labels
    :return: None (results are printed)
    """
    classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
    classifier.fit(X_train, Y_train)

    train_score = round(classifier.score(X_train, Y_train), 4)
    print(f"Logistic Regression - train performance = {train_score}")

    test_score = round(classifier.score(X_test, Y_test), 4)
    print(f"Logistic Regression - test performance = {test_score}")


def svm(X_train, Y_train, X_test, Y_test):
    """
    Fit a scikit-learn SVC (one-vs-one decision function) and print its score
    on both data sets.

    :param X_train: training feature matrix
    :param Y_train: training labels
    :param X_test: test feature matrix
    :param Y_test: test labels
    :return: None (results are printed)
    """
    # LinearSVC() was the alternative tried here; SVC is the one in use.
    machine = SVC(decision_function_shape='ovo').fit(X_train, Y_train)

    # train is scored and reported before test
    for split, features, labels in (("train", X_train, Y_train), ("test", X_test, Y_test)):
        score = round(machine.score(features, labels), 4)
        print(f"SVM - {split} performance = {score}")


def random_forest(X_train, Y_train, X_test, Y_test):
    """
    Fit a random forest (100 trees, depth limited to 2) and print its score on
    both data sets.

    :param X_train: training feature matrix
    :param Y_train: training labels
    :param X_test: test feature matrix
    :param Y_test: test labels
    :return: None (results are printed)
    """
    forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0).fit(X_train, Y_train)

    train_score = round(forest.score(X_train, Y_train), 4)
    print(f"Random Forest - train performance = {train_score}")

    test_score = round(forest.score(X_test, Y_test), 4)
    print(f"Random Forest - test performance = {test_score}")

# Run the Scikit-learn classifiers

In [8]:
def run_sklearns(train, test):
    """
    Vectorize the train and test questions and evaluate a classifier on them.

    The training questions define the feature and category vocabularies, which
    are then reused to vectorize the test questions.

    :param train: list of (category, tokens) pairs used for fitting
    :param test: list of (category, tokens) pairs used for evaluation
    :return: None (the classifier prints its own results)
    """
    train_X, train_Y, vocabulary, categories = build_tfidf_train_matrix(train)
    test_X, test_Y = build_tfidf_test_matrix(test, vocabulary, categories, False)

    # Only the MLP is active. The other classifiers take the four matrices and
    # can be switched on here:
    #   logistic_regression(train_X, train_Y, test_X, test_Y)
    #   svm(train_X, train_Y, test_X, test_Y)
    #   random_forest(train_X, train_Y, test_X, test_Y)
    mlp(train_X, train_Y, test_X, test_Y, len(vocabulary), len(categories))

## Main part to load, train, and run classifiers

In [None]:
# read the questions: each call returns a list of (category, tokens) pairs;
# the paths are relative to the notebook's working directory
train_qs = read_questions("./questions/train_5500.label")
test_qs = read_questions("./questions/TREC_10.label")

# vectorize both sets and run the MLP classifier (the only one enabled in run_sklearns)
run_sklearns(train_qs, test_qs)