In [1]:
# Sklearn module
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#from sklearn import decomposition, ensemble

from bs4 import BeautifulSoup, Tag    ## Cleaning HTML tags from text


import pandas as pd
#import xgboost 
#import numpy as np
#import textblob
#import string

# Keras stuff
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers

import json     ## To covnert json raw data to df

import pickle   ## saving the model to disk
#from pylatexenc.latex2text import LatexNodes2Text
pd.set_option('display.max_colwidth', -1)    ## Problem texts can be long and may not load on Jupyter

## DATA PREPARATION

In [2]:
subject_to_check = 'MTH'
# Load the dataset
labels, texts = [], []
with open('../data/qs_topicwise.json') as json_data:
    all_questions = json.load(json_data)

all_questions[3]
## Need to update dataset

{'subject': 'MTH',
 'grade': '12',
 'curriculum': 'JEE',
 'chapter': 'Inverse Trigonometry ',
 'chapter_no': '18',
 'topic': 'Introduction to Inverse Trigonometry',
 'topic_no': '01',
 'difficulty': '1',
 'problem_code': 'P005930',
 'problem_status': 'final',
 'problem_mongo_id': '56f235d43562d97499000848',
 'problem_type': 'Spot Test',
 'options': ' \\(\\left[0, 2\\pi\\right]\\) \\(\\left[-\\frac{\\pi}2, \\frac{\\pi}2\\right]\\) \\(\\left[0, \\pi\\right]\\) \\(\\left[0, \\frac{\\pi}2\\right]\\)',
 'solution': '',
 'question_text': '\xa0The principal domain of \\(\\tan\u2061𝑥\\) is ___________\xa0'}

In [3]:
## Removing these words improves Phy clf accuracy by 2% but improves Math classifier accuracy
words_to_remove = ["rightarrow", "hence", "frac", "text", "sqrt", "times", "value", "amp", "statement", "will", "equal", "number", "tan", "now", "can", "two", "get", "true", "lambda"]

data_df = pd.DataFrame(columns=['curriculum', 'subject', 'question_text', 'chapter'])
data_df.head()

Unnamed: 0,curriculum,subject,question_text,chapter


In [4]:
questions = []
i = 0
for question in all_questions:
    #topic_code = question['topic_code']  ## Not in dataset anymore, already split
    try: 
        question_text = question['question_text'].lower()
      #  question_text = BeautifulSoup(question_text, "html.parser").get_text()   ## Reduces accuracy a bit as compared to words_to_remove
        question_text = " ".join(question_text.split())
        for word in words_to_remove:
            question_text.replace(word, "")
        #splits = topic_code.split("-")
        subject = question['subject']
        curriculum = question['curriculum']
        grade = question['grade']
        curr_question = {}
        if("JEE" in curriculum and grade in ["11", "12"] and subject in subject_to_check and "dummy" not in question_text):
            data_df.loc[i] = [curriculum, subject, question_text, question['chapter']]
            i += 1
    except:
            pass

trainDF = pd.DataFrame(columns=['text', 'label'])
# trainDF.replace(words_to_replace, "")
trainDF['text'] = data_df['question_text']
trainDF['label'] = data_df['chapter']
trainDF.head(3)

Unnamed: 0,text,label
0,"among the statements given below, which one is...",Inverse Trigonometry
1,\(\sin^{−1}\left(⁡\frac{1}{√2}\right)=\)________,Inverse Trigonometry
2,the principal domain of \(\cos⁡𝑥\) is ___________,Inverse Trigonometry


## Split into training & validation sets

In [5]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2)

print(len(train_x), len(valid_x) )

5532 1384


## Label encode the target variable (for multi label classification)

In [6]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
encoder.classes_

array(['3 Dimensional Geometry', 'Applications of Derivatives',
       'Binomial Theorem', 'Complex Numbers', 'Conic Sections - I',
       'Conic Sections - II', 'Continuity and Differentiability',
       'Definite Integration', 'Differential Equations', 'Functions 2',
       'Fundamentals of Mathematics', 'Indefinite Integration',
       'Inequalities', 'Inverse Trigonometry ', 'Limits',
       'M1.1 Scaffold test', 'Mathematical Reasoning',
       'Matrices and Determinants', 'Permutations and Combinations',
       'Principle of Mathematical Induction', 'Probability',
       'Quadratic Equations', 'Repository', 'Selection Test',
       'Sequence and Series', 'Sets, Relations and Functions',
       'Statistics', 'Straight Lines', 'Tally Marks', 'Trigonometry',
       'Vector Algebra'], dtype=object)

In [7]:
train_y[0]

21

In [8]:
valid_y = encoder.fit_transform(valid_y)
valid_y[0]

22

In [9]:
encoder.inverse_transform([valid_y[0]])

array(['Repository'], dtype=object)

## FEATURE ENGINEERING

In [111]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xtrain_count[xtrain_count != 0] = 1
print(xtrain_count)

xvalid_count =  count_vect.transform(valid_x)
xvalid_count[xvalid_count != 0] = 1

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

  (0, 994)	1
  (0, 1045)	1
  (0, 1183)	1
  (0, 2379)	1
  (0, 2522)	1
  (0, 2714)	1
  (0, 2847)	1
  (0, 2849)	1
  (0, 3287)	1
  (0, 3634)	1
  (0, 3934)	1
  (0, 4189)	1
  (0, 4191)	1
  (0, 4196)	1
  (0, 4333)	1
  (1, 0)	1
  (1, 12)	1
  (1, 350)	1
  (1, 463)	1
  (1, 550)	1
  (1, 839)	1
  (1, 994)	1
  (1, 1045)	1
  (1, 1094)	1
  (1, 1105)	1
  :	:
  (5531, 994)	1
  (5531, 1105)	1
  (5531, 1183)	1
  (5531, 1258)	1
  (5531, 1265)	1
  (5531, 1798)	1
  (5531, 1917)	1
  (5531, 2060)	1
  (5531, 2112)	1
  (5531, 2230)	1
  (5531, 2292)	1
  (5531, 2522)	1
  (5531, 2572)	1
  (5531, 2605)	1
  (5531, 2710)	1
  (5531, 2714)	1
  (5531, 3287)	1
  (5531, 3320)	1
  (5531, 3420)	1
  (5531, 4189)	1
  (5531, 4232)	1
  (5531, 4238)	1
  (5531, 4250)	1
  (5531, 4520)	1
  (5531, 4561)	1


## MODEL BUILDING AND PREDICTION

In [112]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, valid_y)
    #return metrics.f1_score(predictions, valid_y) ##?

## Prediction

In [113]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.GaussianNB(), xtrain_count.toarray(), train_y, xvalid_count.toarray())
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)
## END MODEL BUILDING AND PREDICTION

NB, Count Vectors:  0.5
NB, WordLevel TF-IDF:  0.6004335260115607
NB, N-Gram Vectors:  0.619942196531792
NB, CharLevel Vectors:  0.6054913294797688


## Saving trained model to disk, along with vectorizer & label encoder

In [None]:
clf = naive_bayes.MultinomialNB()
clf.fit(xtrain_tfidf_ngram, train_y)

In [None]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vect_ngram, f, pickle.HIGHEST_PROTOCOL)
with open('clf.pkl', 'wb') as f:
    pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

## Reading pickled models

In [None]:
with open('vectorizer.pkl', 'rb') as f:
    tfidf_vect_ngram = pickle.load(f)
with open('clf.pkl', 'rb') as f:
    clf = pickle.load(f)
with open('encoder.pkl', 'rb') as f:
    encoder = pickle.load(f)

## Testing pickled models

In [None]:
def chapter_clf_model(text):                   ## This combined function could not be pickled !!
    text = [text]                              ## Convert input string to a list which is an iterable needed for tf-idf
    feat = tfidf_vect_ngram.transform(text)    ## Convert text to tfidf matrix
    pred = clf.predict(feat)                   ## Predict label of chapter
    cname = encoder.inverse_transform(pred)    ## Convert label to chapter name
    return ''.join(cname)                      ## Converting array prediction to a single string

In [None]:
text = 'what is the heat capacity of plastic'

In [None]:
## Testing the complete model
chapter_clf_model(text)