## Imports

In [32]:
# Sklearn module
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy, textblob, string

# Keras stuff -- Commented out as not used yet
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# Regex
import re

# Stopwords
import nltk
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('english')
import json

print(stop)
from pylatexenc.latex2text import LatexNodes2Text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pritamsukumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 

## Data Preparation

In [31]:

subject_to_check = 'MTH'
# Load the dataset
labels, texts = [], []
with open('../data/qs_topicwise.json') as json_data:
    all_questions = json.load(json_data)

words_to_remove = ["rightarrow", "hence", "frac", "text", "sqrt", "times", "value", "amp", "statement", "will", "equal", "number", "tan", "now", "can", "two", "get", "true", "lambda"]
# words_to_remove += stop

data_df = pd.DataFrame(columns=['curriculum', 'subject', 'question_text', 'chapter'])
questions = []
i = 0

# Regex pattern for keeping only alphabets and numbers
pattern = re.compile('[\W_]+')
nonutf8pattern = re.compile('[\u0080-\uffff]')

for question in all_questions:
    try: # So that python doesn't crash on individual question exceptions
        question_text = question['question_text'].lower()
        
        question_text = pattern.sub(" ", question_text)
        question_text = nonutf8pattern.sub(" ", question_text)


        # Remove extra whitespaces
        question_text = " ".join([word for word in question_text.split() if word not in words_to_remove])
        question_text = " ".join(question_text.split())

        
        # Keep only alphanumeric characters
        subject = question['subject']
        curriculum = question['curriculum']
        grade = question['grade']
        if("JEE" in curriculum and subject in subject_to_check):
            data_df.loc[i] = [curriculum, subject, question_text, question['chapter']]
            i += 1
    except Exception as e:
        print(e)

trainDF = pd.DataFrame(columns=['text', 'label'])

# trainDF.replace(words_to_replace, "")
trainDF['text'] = data_df['question_text']
trainDF['label'] = data_df['chapter']

display(trainDF.head)


<bound method NDFrame.head of                                                    text  \
0     among the statements given below which one is ...   
1                                  sin 1 left 1 2 right   
2                      the principal domain of cos 𝑥 is   
3                          the principal domain of 𝑥 is   
4                                 1 left sin pi 2 right   
5                         sin 1 left sin 𝑥 right 𝑥 if 𝑥   
6     revision exercise 5 mins the students should a...   
7     what domain restrictions be imposed on the fun...   
8           the principal solutions of sin x frac12 are   
9                the general solution of sin theta 0 is   
10                                  if cos x cos y then   
11           if sin x 1 2 x in left 0 pi 2 right then x   
12      the of principal solutions of cos x 1 sqrt2 are   
13    cot 1 cos 1 cos x where alpha is some arbitrar...   
14      if cos 1 x cos 1 y2 then 4x 2 4xy cos y 2 is to   
15         the principal o

### Split into training and testing folds

In [34]:
# Split data into training and testing folds

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2)

# Label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

5583 1396


3989    paragraph for question numbers 15 to 17 x 2 y ...
6374    the term independent of 𝑥 in 1 𝑥 𝑚 left 1 1 x ...
4064    identify the quadrant and mention the correct ...
5524    if z cos left pi k right i sin left pi k right...
2094    if in an isosceles triangle with base a vertic...
515     the function f left x right left sin2x right 2...
5353    let pqrs be a rectangle of size 9 3 if it is f...
2856                            z log yx is equivalent to
4819    for k gt 0 the quadratic equation 2x 2 6x k 0 ...
175     the function f x sin 4x cos 2x is a periodic f...
4749    find the product of the roots of the quadratic...
6385    the middle term in the expansion of x frac1x 1...
3009    how many of the following functions have a max...
330     geometrically the derivative at any point is t...
930     if 𝐼 𝑎 𝑏𝑓 𝑥 𝑑𝑥 and we substitute 𝑥 2𝑡 which of...
5180    the coordinate axes is transformed in differen...
4115    if the arcs of the same lengths in circles sub...
1318    if vec

# Feature creation

In [23]:
# ----- FEATURE ENGINEERING -----
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', binary=True, max_features=3513)
X = count_vect.fit_transform(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)

xvalid_count =  count_vect.transform(valid_x)

In [24]:
x = count_vect.vocabulary_
y = list(x.keys() )
print(sorted(y))

['00', '000', '000001', '0001', '0002', '002', '01', '02', '025', '04x', '07', '0a', '0c', '0e', '0f', '0x', '0xe', '0z', '10', '100', '1000', '10000', '100000', '1000c', '1001', '100d', '100x', '101', '1011', '1011121314frequency', '1012', '102', '1024', '103', '104', '105', '106', '107', '108', '1080', '109', '10ax', '10b', '10cx', '10i', '10th', '10x', '10y', '11', '110', '11040', '111', '112', '115', '117', '119', '11b', '11d', '11i', '11if', '11t', '11x', '11y', '12', '120', '120g', '121', '1234', '125', '126', '127', '128', '12a', '12i', '12m', '12t', '12x', '12y', '12z', '13', '130', '130p', '135', '136', '1361', '137', '139', '13x', '14', '140', '1400', '1410', '1413', '144', '145', '1450', '1457', '149', '14a', '14x', '14y', '15', '150', '150x', '15101051', '152', '153', '1530', '155', '156', '157', '15b', '15i', '15th', '15x', '16', '160', '162', '164', '169', '16ax', '16q', '16x', '16y', '16z', '17', '1720', '175', '178', '17f', '17th', '17x', '18', '180', '1800', '183', '18

In [25]:
count_vect.stop_words_

{'3i2y',
 '3ix',
 '3q',
 'cota',
 'cotb',
 'cotc',
 'coterminous',
 'cotrapositive',
 'count',
 'counters',
 'countries',
 'country',
 'counts',
 'covering',
 'covert',
 'cpb',
 'cream',
 'creams',
 'crew',
 'cs',
 'cu',
 'cubical',
 'currency',
 'cutting',
 'cv',
 'dac',
 'daily',
 'dairy',
 'dalmatian',
 'dance',
 'dash',
 'dat',
 'dataclass',
 'dataheight',
 'daughters',
 'daynumber',
 'dayoption',
 'ddot',
 'deal',
 'dealt',
 'debate',
 'december',
 'decide',
 'deck',
 'decompose',
 'decrease',
 'defining',
 'deleted',
 'deliveries',
 'demand',
 'denomination',
 'dentoes',
 'depicts',
 'deposit',
 'deposited',
 'deposits',
 'depth',
 'derivation',
 'derive',
 'descending',
 'description',
 'design',
 'detector',
 'dg',
 'dicitionary',
 'differences',
 'differentiablenot',
 'differentials',
 'differentiating',
 'differentiations',
 'diffrential',
 'directionwe',
 'directly',
 'directories',
 'discarded',
 'discovered',
 'discrete',
 'disease',
 'dishonest',
 'disjointregion',
 'dist

In [26]:
count_vect

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
count_vect.get_params


<bound method BaseEstimator.get_params of CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)>

In [28]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)


In [29]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.GaussianNB(), xtrain_count.todense(), train_y, xvalid_count.todense())
print("NB, Count Vectors: ", accuracy)

NB, Count Vectors:  0.4305157593123209
