## Imports

In [1]:
# Sklearn module
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy, textblob, string

# Keras stuff -- Commented out as not used yet
# from keras.preprocessing import text, sequence
# from keras import layers, models, optimizers

# Regex
import re

# Stopwords
import nltk
nltk.download('stopwords')
stop = nltk.corpus.stopwords.words('english')
import json

from pylatexenc.latex2text import LatexNodes2Text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pritamsukumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preparation

In [6]:

subject_to_check = 'MTH'
# Load the dataset
labels, texts = [], []
with open('../data/qs_topicwise.json') as json_data:
    all_questions = json.load(json_data)

words_to_remove = ["rightarrow", "hence", "frac", "text", "sqrt", "times", "value", "amp", "statement", "will", "equal", "number", "tan", "now", "can", "two", "get", "true", "lambda"]
# words_to_remove += stop

data_df = pd.DataFrame(columns=['curriculum', 'subject', 'question_text', 'chapter'])
questions = []
i = 0

# Regex pattern for keeping only alphabets and numbers
pattern = re.compile('[^[:alnum:]]+')
nonutf8pattern = re.compile('[\u0080-\uffff]')
questions = all_questions[1:2]

for question in all_questions:
    try: # So that python doesn't crash on individual question exceptions
        question_text = question['question_text'].lower()
        
        question_text = pattern.sub(" ", question_text)
        question_text = nonutf8pattern.sub(" ", question_text)


        # Remove extra whitespaces
        question_text = " ".join([word for word in question_text.split() if word not in words_to_remove])
        question_text = " ".join(question_text.split())

        
        # Keep only alphanumeric characters
        
        subject = question['subject']
        curriculum = question['curriculum']
        grade = question['grade']
        curr_question = {}
        if("JEE" in curriculum and subject in subject_to_check):
            data_df.loc[i] = [curriculum, subject, question_text, question['chapter']]
            i += 1
    except Exception as e:
        print(e)

trainDF = pd.DataFrame(columns=['text', 'label'])

# trainDF.replace(words_to_replace, "")
trainDF['text'] = data_df['question_text']
trainDF['label'] = data_df['chapter']


### Split into training and testing folds

In [16]:
# Split data into training and testing folds

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.2)

print(len(train_x), len(valid_x) )

# Label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)


5583 1396


# Feature creation

In [17]:
# ----- FEATURE ENGINEERING -----
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', binary=True, max_features=3500)
X = count_vect.fit_transform(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)

xvalid_count =  count_vect.transform(valid_x)

In [18]:
x = count_vect.vocabulary_
y = list(x.keys() )
print(sorted(y))

['0', '00', '000', '000001', '0001', '0002', '002', '01', '02', '025', '07', '0f', '1', '10', '100', '1000', '10000', '100000', '1000c_', '1001', '100d', '100x', '101', '1011', '1011121314frequency', '1012', '102', '1024', '103', '104', '105', '106', '107', '108', '1080', '109', '10ax', '10b', '10cx', '10i', '10th', '10x', '10y', '11', '110', '11040', '111', '112', '115', '117', '119', '11b', '11d', '11i', '11if', '11t', '11x', '11y', '12', '120', '120g', '121', '1234', '125', '126', '127', '128', '12a', '12i', '12m', '12t', '12x', '12y', '12z', '13', '130', '130p_2', '135', '136', '1361', '137', '139', '13x', '14', '140', '1400', '1410', '1413', '144', '145', '1450', '1457', '149', '14a', '14x', '14y', '15', '150', '150x', '15101051', '152', '153', '1530', '155', '156', '157', '15b', '15i', '15th', '15x', '16', '160', '162', '164', '169', '16ax', '16q', '16x', '16y', '16z_2z_3z_4', '17', '1720', '175', '178', '17f', '17th', '17x', '18', '180', '1800', '183', '185', '18e', '18i', '18x'

In [19]:
count_vect.stop_words_

{'cochin',
 'erasing',
 'languages',
 'leads',
 'frac1k',
 '6k',
 'paths',
 'hens',
 '680',
 'infer',
 'nj',
 'leq8',
 'df_2',
 'crew',
 'oi',
 'split',
 'activitythere',
 'bm',
 'convex',
 'legal',
 'accessible',
 'log_ka',
 'propertyif',
 'narrow',
 '2𝜃',
 'buddh',
 'ibt',
 'occupied',
 'a_1y',
 'madrid',
 '2𝑏',
 '850',
 'indigo',
 'a_1a_',
 'cms',
 'iiconsider',
 'absence',
 '1figure',
 'delta_3',
 'log_28',
 'adjacently',
 '729',
 'gave',
 'bell',
 'distributionclasses',
 'cars',
 '9e',
 'spanish',
 'election',
 'multiply',
 'hands',
 'k2',
 'pessimistic',
 'gd',
 'licence',
 '93',
 'mc_0',
 'extended',
 'marathi',
 'staircase',
 'g_1g_2',
 'junction',
 'hypotenuse',
 'lemons',
 'spot',
 'log_n128',
 '_0e',
 'arctan',
 'psi',
 'abcx',
 'agrees',
 '96',
 'orders',
 '_____________________',
 'rapidly',
 'b_s',
 'hh',
 'dealt',
 'kq',
 'oabcd',
 'algebra',
 'encounter',
 'log_3a',
 'dance',
 'applicants',
 'closest',
 'male',
 'weekii',
 'log_20',
 'cos2048',
 'differentiablenot',
 't

In [20]:
count_vect

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [21]:
count_vect.get_params


<bound method BaseEstimator.get_params of CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)>

In [22]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    return metrics.accuracy_score(predictions, valid_y)


In [23]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.GaussianNB(), xtrain_count.todense(), train_y, xvalid_count.todense())
print("NB, Count Vectors: ", accuracy)

NB, Count Vectors:  0.4806590257879656
