In [1]:
# Sklearn module
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#from sklearn import decomposition, ensemble

from bs4 import BeautifulSoup, Tag    ## Cleaning HTML tags from text


import pandas as pd
import xgboost 
import numpy as np
#import textblob
#import string

# Keras stuff
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

import json     ## To covnert json raw data to df

import pickle   ## saving the model to disk
#from pylatexenc.latex2text import LatexNodes2Text
pd.set_option('display.max_colwidth', -1)    ## Problem texts can be long and may not load on Jupyter

## DATA PREPARATION

In [2]:
subject_to_check = 'CHM'
# Load the dataset
labels, texts = [], []
with open('../data/qs_topicwise.json') as json_data:
    all_questions = json.load(json_data)

all_questions[3]
## Need to update dataset

{'subject': 'MTH',
 'grade': '12',
 'curriculum': 'JEE',
 'chapter': 'Inverse Trigonometry ',
 'chapter_no': '18',
 'topic': 'Introduction to Inverse Trigonometry',
 'topic_no': '01',
 'difficulty': '1',
 'problem_code': 'P005930',
 'problem_status': 'final',
 'problem_mongo_id': '56f235d43562d97499000848',
 'problem_type': 'Spot Test',
 'options': ' \\(\\left[0, 2\\pi\\right]\\) \\(\\left[-\\frac{\\pi}2, \\frac{\\pi}2\\right]\\) \\(\\left[0, \\pi\\right]\\) \\(\\left[0, \\frac{\\pi}2\\right]\\)',
 'solution': '',
 'question_text': '\xa0The principal domain of \\(\\tan\u2061𝑥\\) is ___________\xa0'}

In [3]:
## Removing these words improves Phy clf accuracy by 2% but improves Math classifier accuracy
words_to_remove = ["rightarrow", "hence", "frac", "text", "sqrt", "times", "value", "amp", "statement", "will", "equal", "number", "tan", "now", "can", "two", "get", "true", "lambda"]

data_df = pd.DataFrame(columns=['curriculum', 'subject', 'question_text', 'chapter'])
data_df.head()

Unnamed: 0,curriculum,subject,question_text,chapter


In [4]:
questions = []
i = 0
for question in all_questions:
    #topic_code = question['topic_code']  ## Not in dataset anymore, already split
    try: 
        question_text = question['question_text'].lower()
        #question_text = BeautifulSoup(question_text, "html.parser").get_text() 
        ## Beautiful Soup improves accuracy from 40% to 60% in MTH, 
        ## but reduces PHY accuracy from 60% to 20%. For CHM, 
        ## it reduces accuracy from 49% to 47% 
        question_text = " ".join(question_text.split())
        for word in words_to_remove:
            question_text.replace(word, "")
        #splits = topic_code.split("-")
        subject = question['subject']
        curriculum = question['curriculum']
        grade = question['grade']
        curr_question = {}
        if("JEE" in curriculum and grade in ["11", "12"] and subject in subject_to_check and "dummy" not in question_text):
            data_df.loc[i] = [curriculum, subject, question_text, question['chapter']]
            i += 1
    except:
            pass

trainDF = pd.DataFrame(columns=['text', 'label'])
# trainDF.replace(words_to_replace, "")
trainDF['text'] = data_df['question_text']
trainDF['label'] = data_df['chapter']
trainDF.head(3)

Unnamed: 0,text,label
0,which of the following statements is incorrect?,Solutions
1,dry air was successively passed through a solution of \(5\) \(g\) solute in \(90\) \(g\) water and through pure water. the loss in weight of solution was \(2.5\) \(g\) and that of pure water was \(0.05\) \(g.\) molecular weight of solute in \(g/mol\) is \(m.\) find the value of \(\frac{m}{10}.\)(assume that solute is non-volatile and does not dissociate or associate),Solutions
2,an aqueous solution boils at \(374\: k\). what is the freezing point of the same solution?(given \(k_f=1.86^\circ c/m\) and \(k_b=0.51^\circ c/m\) ),Solutions


## Split into training & validation sets

In [5]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2)

print(len(train_x), len(valid_x) )

8815 2204


## Label encode the target variable (for multi label classification)

In [6]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
encoder.classes_

array(['Alcohols, Ethers and Phenols', 'Aldehydes and Ketones', 'Amines',
       'Atomic Structure ', 'Basic Concepts of Chemistry',
       'Biomolecules, Polymers and Chemistry in Everyday life ',
       'Carboxylic Acids and Derivatives', 'Chemical Bonding ',
       'Chemical Equilibrium', 'Chemical Kinetics',
       'Coordination Compounds', 'D and F Block Elements', 'Dummy',
       'Electrochemistry', 'Environmental Chemistry',
       'Halogen Derivatives ', 'Hydrocarbons', 'Hydrogen',
       'Introduction to Organic Chemistry', 'Ionic Equilibrium',
       'Metallurgy', 'Nuclear Chemistry', 'P - Block Elements - II',
       'P block - I', 'Periodic Properties of Elements',
       'Redox Reactions', 'Repository', 'S block elements',
       'Selection Test', 'Solid State', 'Solutions', 'States of Matter',
       'Surface Chemistry', 'Thermodynamics'], dtype=object)

In [7]:
train_y[0]

16

In [8]:
valid_y = encoder.fit_transform(valid_y)
valid_y[0]

23

In [9]:
encoder.inverse_transform([valid_y[0]])

array(['P block - I'], dtype=object)

## FEATURE ENGINEERING

In [10]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xtrain_count[xtrain_count != 0] = 1
print(xtrain_count)

xvalid_count =  count_vect.transform(valid_x)
xvalid_count[xvalid_count != 0] = 1

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

  (0, 3480)	1
  (0, 4785)	1
  (0, 5584)	1
  (0, 5811)	1
  (0, 6397)	1
  (0, 8038)	1
  (0, 9539)	1
  (1, 1833)	1
  (1, 1884)	1
  (1, 2366)	1
  (1, 4282)	1
  (1, 4813)	1
  (1, 5584)	1
  (1, 6709)	1
  (1, 7263)	1
  (1, 7330)	1
  (1, 8443)	1
  (1, 8519)	1
  (1, 9537)	1
  (1, 9539)	1
  (2, 1359)	1
  (2, 1833)	1
  (2, 2516)	1
  (2, 4139)	1
  (2, 4183)	1
  :	:
  (8811, 7263)	1
  (8811, 7452)	1
  (8812, 2361)	1
  (8812, 4357)	1
  (8812, 4793)	1
  (8812, 5275)	1
  (8812, 5811)	1
  (8813, 4785)	1
  (8813, 5381)	1
  (8813, 5584)	1
  (8813, 5811)	1
  (8813, 6540)	1
  (8813, 6944)	1
  (8813, 7263)	1
  (8813, 9192)	1
  (8813, 9539)	1
  (8813, 10135)	1
  (8814, 4785)	1
  (8814, 5811)	1
  (8814, 6397)	1
  (8814, 7263)	1
  (8814, 8038)	1
  (8814, 8290)	1
  (8814, 8804)	1
  (8814, 9539)	1


## MODEL BUILDING AND PREDICTION

In [11]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, valid_y)
    #return metrics.f1_score(predictions, valid_y) ##?

## Prediction using different features

In [12]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.GaussianNB(), xtrain_count.toarray(), train_y, xvalid_count.toarray())
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)
## END MODEL BUILDING AND PREDICTION

NB, Count Vectors:  0.4941016333938294
NB, WordLevel TF-IDF:  0.5186025408348457
NB, N-Gram Vectors:  0.4591651542649728
NB, CharLevel Vectors:  0.47595281306715065


## Saving trained model to disk, along with vectorizer & label encoder

In [13]:
clf = naive_bayes.MultinomialNB()
clf.fit(xtrain_tfidf_ngram, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vect_ngram, f, pickle.HIGHEST_PROTOCOL)
with open('clf.pkl', 'wb') as f:
    pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

## Reading pickled models

In [15]:
with open('vectorizer.pkl', 'rb') as f:
    tfidf_vect_ngram = pickle.load(f)
with open('clf.pkl', 'rb') as f:
    clf = pickle.load(f)
with open('encoder.pkl', 'rb') as f:
    encoder = pickle.load(f)

## Testing pickled models

In [16]:
def chapter_clf_model(text):                   ## This combined function could not be pickled !!
    text = [text]                              ## Convert input string to a list which is an iterable needed for tf-idf
    feat = tfidf_vect_ngram.transform(text)    ## Convert text to tfidf matrix
    pred = clf.predict(feat)                   ## Predict label of chapter
    cname = encoder.inverse_transform(pred)    ## Convert label to chapter name
    return ''.join(cname)                      ## Converting array prediction to a single string

In [17]:
text = 'what is the heat capacity of plastic'

In [18]:
## Testing the complete model
chapter_clf_model(text)

'Thermodynamics'

## Improving accuracy of the classifier
Source : https://www.kaggle.com/abhishek/approaching-almost-any-nlp-problem-on-kaggle
* Change evaluation metric to multinomial log-loss / F1 score ?
* Update the training set by pulling the latest dump from CMS
* Try XGBoost classifier
* Use Grid Search to optimize the parameters
