In [1]:
# Sklearn module
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from bs4 import BeautifulSoup, Tag    ## Cleaning HTML tags from text
import pandas as pd
import xgboost 
import numpy as np
#import textblob
#import string
import matplotlib.pyplot as plt
import seaborn as sns
import json     ## To covnert json raw data to df
import pickle   ## saving the model to disk

#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

## Show full cell contents: problem texts are long and would otherwise be
## truncated when a dataframe is displayed in Jupyter.
## `None` is the supported "no limit" value; the older `-1` spelling is
## deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

## DATA PREPARATION

In [2]:
# Load the dataset and show a sample question.
# The encoding is given explicitly because the question texts contain
# non-ASCII math symbols; relying on the platform default can fail or
# mis-decode them on machines whose locale is not UTF-8.
with open('../data/qs_topicwise.json', encoding='utf-8') as json_data:
    all_questions = json.load(json_data)

all_questions[3]

{'subject': 'MTH',
 'grade': '12',
 'curriculum': 'JEE',
 'chapter': 'Inverse Trigonometry ',
 'chapter_no': '18',
 'topic': 'Introduction to Inverse Trigonometry',
 'topic_no': '01',
 'difficulty': '1',
 'problem_code': 'P005930',
 'problem_status': 'final',
 'problem_mongo_id': '56f235d43562d97499000848',
 'problem_type': 'Spot Test',
 'options': ' \\(\\left[0, 2\\pi\\right]\\) \\(\\left[-\\frac{\\pi}2, \\frac{\\pi}2\\right]\\) \\(\\left[0, \\pi\\right]\\) \\(\\left[0, \\frac{\\pi}2\\right]\\)',
 'solution': '',
 'question_text': '\xa0The principal domain of \\(\\tan\u2061𝑥\\) is ___________\xa0'}

In [3]:
## Start from an empty dataframe that already carries the columns
## the question records will be stored under.
question_columns = ['problem_code', 'curriculum', 'subject', 'question_text', 'chapter']
data_df = pd.DataFrame(columns=question_columns)

data_df.head()

Unnamed: 0,problem_code,curriculum,subject,question_text,chapter


In [6]:
## Convert JSON to dataframe

#subject_model = ['PHY']
subject_model = ['PHY', 'CHM', 'MTH']

## Candidate stop words. They are currently NOT applied (the filter inside the
## loop is commented out).
## NOTE(review): the original note read "removing these words improves Phy clf
## accuracy by 2% but improves Math classifier accuracy" - one of the two
## "improves" was probably meant to be "reduces"; confirm before relying on it.
words_to_remove = ["rightarrow", "hence", "frac", "text", "sqrt", "times",
                   "value", "amp", "statement", "will", "equal", "number",
                   "tan", "now", "can", "two", "get", "true", "lambda"]

## Remove dummy and empty chapters
chapters_to_remove = ['Selection Test', 'Repository', 'Bridge Intervention Curriculum',
                      'M1.1 Scaffold test', 'Tally Marks', 'Dummy']
## Kept for reference only: the corresponding filter below is disabled.
chapters_with_no_data = ['Static Electricity', 'Experimental Skills', 'Nuclear Chemistry',
                         'Principle of Mathematical Induction', 'Environmental Chemistry']

## Collect the accepted rows in a plain list and build the dataframe once;
## appending to a dataframe row by row copies it on every insertion.
question_columns = ['problem_code', 'curriculum', 'subject', 'question_text', 'chapter']
records = []
skipped = 0   # questions that could not be processed (missing / malformed fields)

for question in all_questions:
    try:
        question_text = question['question_text'].lower()
        ## Remove HTML tags from text.
        ## Beautiful Soup improves accuracy from 40% to 60% in MTH,
        ## but reduces PHY accuracy from 60% to 20%. For CHM,
        ## it reduces accuracy from 49% to 47%.
        question_text = BeautifulSoup(question_text, "html.parser").get_text()
        ## Remove stop words ?
        #question_text = " ".join([word for word in question_text.split() if word not in words_to_remove])
        ## Remove extra whitespaces
        question_text = " ".join(question_text.split())
        ## Extracting elements for cleaner code
        grade = question['grade']
        curriculum = question['curriculum']
        chapter = question['chapter']
        subject = question['subject']
        ## Keep only grade 11/12 JEE questions of the modelled subjects that
        ## are not placeholders.
        keep = (
            grade in ["11", "12"]
            and "JEE" in curriculum
            and chapter not in chapters_to_remove
            #and chapter not in chapters_with_no_data
            and "dummy" not in question_text
            and subject in subject_model
        )
        if keep:
            records.append([question['problem_code'], curriculum, subject, question_text, chapter])
    except Exception:
        ## Best effort: a malformed record must not abort the import, but it is
        ## counted instead of being silently dropped. `Exception` (rather than a
        ## bare `except`) lets KeyboardInterrupt / SystemExit propagate.
        skipped += 1

if skipped:
    print("Skipped", skipped, "malformed question(s)")

data_df = pd.DataFrame(records, columns=question_columns)

## Two-column view used by the rest of the notebook: question text -> chapter.
trainDF = data_df.rename(columns={'question_text': 'text', 'chapter': 'label'})[['text', 'label']]
trainDF.head(3)

Unnamed: 0,text,label
0,"among the statements given below, which one is correct?",Inverse Trigonometry
1,\(\sin^{−1}\left(⁡\frac{1}{√2}\right)=\)________,Inverse Trigonometry
2,the principal domain of \(\cos⁡𝑥\) is ___________,Inverse Trigonometry


### Distribution of labels

In [None]:
# Bar chart of how many questions each chapter (label) has.
trainDF['label'].value_counts().plot.bar()

## Split into training & validation sets

In [None]:
# Hold out 20% of the questions for validation.
# The seed is fixed so that the split - and therefore every score reported
# below - is reproducible across kernel restarts.
RANDOM_STATE = 42
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'], trainDF['label'], test_size=0.2, random_state=RANDOM_STATE)

print(len(train_x), len(valid_x))

## Label encode the target variable (for multi-class classification)

In [None]:
# Fit the encoder on every chapter present in the dataset, not only on the
# training split: a chapter that happens to land exclusively in the validation
# split would otherwise make `encoder.transform(valid_y)` fail with an
# "unseen labels" error.
# Run this cell once per split - it replaces `train_y` with its encoded form.
encoder = preprocessing.LabelEncoder()
encoder.fit(trainDF['label'])
train_y = encoder.transform(train_y)
encoder.classes_

In [None]:
# Peek at the first encoded training label.
train_y[0]

In [None]:
# Encode the validation labels with the already-fitted encoder and show the
# first encoded value. `valid_y` is overwritten in place, so this cell is
# meant to run once per split.
# NOTE(review): presumably this fails for a chapter the encoder has not seen -
# confirm every validation chapter is known to the encoder.
valid_y = encoder.transform(valid_y)
valid_y[0]

In [None]:
# Round-trip check: map the first encoded validation label back to its chapter name.
encoder.inverse_transform([valid_y[0]])

## FEATURE ENGINEERING

In [None]:
# All vectorizers are fitted on the TRAINING split only. Fitting on the whole
# corpus would let validation vocabulary and IDF statistics leak into the
# features and make the validation scores optimistic.

# Binary bag-of-words: `binary=True` makes the vectorizer emit 1 for every
# token that occurs, which replaces the manual `x[x != 0] = 1` on the sparse
# matrix.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', binary=True)
count_vect.fit(train_x)

# transform the training and validation data using count vectorizer object
xtrain_count = count_vect.transform(train_x)
print(xtrain_count)

xvalid_count = count_vect.transform(valid_x)

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_x)
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

# ngram level tf-idf (word bigrams and trigrams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train_x)
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf (character 2- and 3-grams)
# `token_pattern` is only used by the word analyzer, so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train_x)
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)

## MODEL BUILDING AND PREDICTION

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, valid_label=None):
    """Fit `classifier` and return its weighted F1 score on the validation set.

    Parameters
    ----------
    classifier : estimator exposing `fit(X, y)` and `predict(X)`.
    feature_vector_train : training feature matrix.
    label : encoded training labels.
    feature_vector_valid : validation feature matrix.
    is_neural_net : if True, `predict` is taken to return per-class scores and
        the arg-max over the last axis is used as the predicted class.
    valid_label : true validation labels. Defaults to the notebook-level
        `valid_y`, which keeps existing calls working unchanged.

    Returns
    -------
    The value of `metrics.f1_score(..., average='weighted')`.
    """
    if valid_label is None:
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # Accuracy is not a good metric for imbalanced multi-class classification,
    # so the weighted F1 score is reported instead.
    # The true labels must come first: the 'weighted' average weights each
    # class by its support in the first argument, so swapping the two
    # arguments changes the score.
    ## Source : https://stackoverflow.com/questions/31421413/how-to-compute-precision-recall-accuracy-and-f1-score-for-the-multiclass-case
    return metrics.f1_score(valid_label, predictions, average='weighted')

## Prediction using different features

In [None]:
def _fit_and_report(title, classifier, train_features, valid_features):
    """Train `classifier` through `train_model`, print its score under `title`, and return the score."""
    result = train_model(classifier, train_features, train_y, valid_features)
    print(title, result)
    return result

# Gaussian NB needs dense input, hence the .toarray() on the count vectors.
score = _fit_and_report("NB, Count Vectors: ", naive_bayes.GaussianNB(),
                        xtrain_count.toarray(), xvalid_count.toarray())

# Multinomial NB on word-level TF-IDF vectors.
score = _fit_and_report("NB, WordLevel TF-IDF: ", naive_bayes.MultinomialNB(),
                        xtrain_tfidf, xvalid_tfidf)

# Multinomial NB on word n-gram TF-IDF vectors.
score = _fit_and_report("NB, N-Gram Vectors: ", naive_bayes.MultinomialNB(),
                        xtrain_tfidf_ngram, xvalid_tfidf_ngram)

# Multinomial NB on character n-gram TF-IDF vectors.
score = _fit_and_report("NB, CharLevel Vectors: ", naive_bayes.MultinomialNB(),
                        xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars)
## END MODEL BUILDING AND PREDICTION

## Saving trained model to disk, along with vectorizer & label encoder

In [None]:
# Classifier that gets saved to disk: Multinomial NB trained on the
# word n-gram TF-IDF features of the training split.
clf = naive_bayes.MultinomialNB()
clf.fit(X=xtrain_tfidf_ngram, y=train_y)

In [None]:
# Persist everything needed to score new questions: the fitted n-gram
# vectorizer, the classifier trained on its output, and the label encoder
# that maps predictions back to chapter names.
# All three are written with the same pickle protocol so that whatever can
# load one of them can load the others.
artifacts = [
    ('vectorizer.pkl', tfidf_vect_ngram),
    ('clf.pkl', clf),
    ('encoder.pkl', encoder),
]
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f, pickle.HIGHEST_PROTOCOL)

## Improving accuracy of the classifier
Source : https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle
* ~~Update the training set by pulling the latest dump from CMS~~
* ~~Change evaluation metric to multinomial log-loss / F1 score~~
* ~~Remove numbers~~ 
* ~~Add interpretability to the model~~
* ~~Logistic regression improved acc & F1 score from 69 to 73% for PHY~~
* ~~Try XGBoost classifier~~
* ~~Use Grid Search to optimize the parameters per subject including preprocessing params & model - On Kaggle - took a long time to finish. Kernel died midway~~
* ~~Try Auto ML on Kaggle~~