We explored other similarity metrics in the notebook base_similarity_models.ipynb.
Cosine similarity was the most relevant metric found.
Here we use sklearn instead of gensim and add tf-idf weighting and n-grams to the model, which should improve text matching.

In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.corpus import stopwords 
from bs4 import BeautifulSoup, Tag    ## Cleaning HTML tags from text


import pandas as pd
import numpy as np
import json     ## To convert raw JSON data to a DataFrame

import pickle   ## saving the model to disk
## Problem texts can be long and would otherwise be truncated in Jupyter.
## None means "no limit"; the old -1 sentinel is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

#### Reading problems repo

In [2]:
## The question texts contain non-ASCII math symbols, so the file is decoded
## explicitly as UTF-8 instead of relying on the platform default encoding.
with open('data/qs_topicwise.json', encoding='utf-8') as json_data:
    Qs = json.load(json_data)

## Peek at one raw record to see the available fields
Qs[1]

{'subject': 'MTH',
 'grade': '12',
 'curriculum': 'JEE',
 'chapter': 'Inverse Trigonometry ',
 'chapter_no': '18',
 'topic': 'Introduction to Inverse Trigonometry',
 'topic_no': '01',
 'difficulty': '1',
 'problem_code': 'P005928',
 'problem_status': 'final',
 'problem_mongo_id': '56f2348c3562d9749900083a',
 'problem_type': 'Spot Test',
 'options': ' \\(\\frac{\\pi}2\\) \\(\\frac{\\pi}4\\) \\(\\frac{\\pi}3\\) \\(\\frac{\\pi}6\\)',
 'solution': '',
 'question_text': '\\(\\sin^{−1}\\left(\u2061\\frac{1}{√2}\\right)=\\)________'}

#### Converting to tabular & filtering irrelevant questions

In [3]:
## Empty frame that fixes the column order of the question repository
repo_columns = [
    'curriculum',
    'subject',
    'grade',
    'chapter',
    'problem_code',
    'problem_type',
    'question_text',
]
repo_df = pd.DataFrame(columns=repo_columns)
repo_df.head()

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text


In [4]:
## Build the filtered question repo.
## Accepted rows are collected in a plain list and the DataFrame is created once
## at the end; enlarging a frame one row at a time with `repo_df.loc[i] = ...`
## gets slower and slower as the frame grows.
repo_columns = ['curriculum', 'subject', 'grade', 'chapter', 'problem_code',
                'problem_type', 'question_text']
questions = []      ## one row per accepted question, in `repo_columns` order
skipped = 0         ## records dropped because a field was missing or was not text
for question in Qs:
    #topic_code = question['topic_code']  ## Not in dataset anymore, already split
    try:
        question_text = question['question_text'].lower()
        question_text = BeautifulSoup(question_text, "html.parser").get_text()   ## Clean HTML tags
        question_text = " ".join(question_text.split())     ## Collapse runs of whitespace
        subject = question['subject']
        curriculum = question['curriculum']
        grade = question['grade']
        ## Keep only CBSE/JEE questions for grades 9-12 and drop placeholder ("dummy") problems
        if (curriculum in ["CBSE", "JEE"] and grade in ["9", "10", "11", "12"]
                and "dummy" not in question_text):
            questions.append([curriculum, subject, grade, question['chapter'],
                              question['problem_code'], question['problem_type'],
                              question_text])
    except (KeyError, AttributeError, TypeError):
        ## Best effort: a record with a missing key or a non-string field is skipped,
        ## but counted so that data problems are not silently hidden.
        skipped += 1

if skipped:
    print("Skipped %d malformed question records" % skipped)

repo_df = pd.DataFrame(questions, columns=repo_columns)
repo_df.head(3)

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text
0,JEE,MTH,12,Inverse Trigonometry,P000321,ConcepTest,"among the statements given below, which one is correct?"
1,JEE,MTH,12,Inverse Trigonometry,P005928,Spot Test,\(\sin^{−1}\left(⁡\frac{1}{√2}\right)=\)________
2,JEE,MTH,12,Inverse Trigonometry,P005929,Spot Test,the principal domain of \(\cos⁡𝑥\) is ___________


In [5]:
## (rows, columns) of the filtered question repo
repo_df.shape

(21067, 7)

### Converting sentences to vectors

Limiting the dictionary (vocabulary) to a single subject (PHY)

In [6]:
## Keep only the Physics questions; the original row labels are retained
is_physics = repo_df['subject'].eq("PHY")
repo_df = repo_df.loc[is_physics]
repo_df.shape

(7208, 7)

#### Custom tokenizer (optional) for stemming, tokenizing & punctuation removal
If we add this tokenizer, we need to save it as another pickled model

In [7]:
import string

import nltk
#words_to_remove = ["rightarrow", "hence", "frac", "text", "sqrt", "times", "value", "amp", "equal", "left", "right"]

## Porter stemmer shared by stem_tokens().
## nltk.word_tokenize (used in normalize) may need "nltk.download('punkt')" first.
stemmer = nltk.stem.porter.PorterStemmer()
## Translation table mapping every ASCII punctuation character to None, so str.translate() deletes it
remove_punctuation_map = {ord(char): None for char in string.punctuation}

def stem_tokens(tokens):
    """Return a list with the stem of every token, in the original order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, tokens))

def normalize(text):
    """Turn raw text into a list of stemmed tokens.

    Punctuation is deleted via ``remove_punctuation_map``, the remainder is
    split with ``nltk.word_tokenize`` and each token is passed through
    ``stem_tokens``.
    """
    without_punctuation = text.translate(remove_punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

In [8]:
## Word-level bigrams and trigrams weighted by tf-idf
vectorizer_params = dict(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 3),
    lowercase=True,
    tokenizer=None,
    max_features=10000,
    use_idf=True,
    min_df=5,       ## n-gram must appear in at least 5 questions
    max_df=0.8,     ## ... and in at most 80% of the questions (document frequency)
)
tfidf_vect_ngram = TfidfVectorizer(**vectorizer_params)
tfidf_vect_ngram.fit(repo_df['question_text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=10000, min_df=5,
        ngram_range=(2, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

## Serializing the model to disk

In [9]:
## Persist the fitted vectorizer and the question repo it was fitted on
artifacts = (
    ('sim_vectorizer.pkl', tfidf_vect_ngram),
    ('sim_question_repo.pkl', repo_df),
)
for path, artifact in artifacts:
    with open(path, 'wb') as out_file:
        pickle.dump(artifact, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Testing with a pickled model

In [10]:
def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as in_file:
        return pickle.load(in_file)

## Reload both artifacts from disk to check that the saved model is usable on its own
tfidf_vect_ngram = _unpickle('sim_vectorizer.pkl')
repo_df = _unpickle('sim_question_repo.pkl')

In [11]:
## Project every repo question onto the fitted n-gram vocabulary (sparse tf-idf matrix, one row per question)
repo_tfidf_ngram = tfidf_vect_ngram.transform(repo_df['question_text'])
repo_tfidf_ngram

<7208x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 227061 stored elements in Compressed Sparse Row format>

### Calculating similarity with a given text input

In [12]:
## Free-text query to match against the repo
test_q = 'Calculate the time of flight of a ball launched with a velocty of 5 m / s at an angle of 30 degrees'
## transform() takes an iterable of documents, so the single query is wrapped in a list
test_matrix = tfidf_vect_ngram.transform([test_q])
test_matrix

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [13]:
## Source : https://intellipaat.com/community/1103/python-tf-idf-cosine-to-find-document-similarity
## One query row against every repo row gives a 1 x n_questions matrix, flattened to a 1-D score vector
similarity_matrix = cosine_similarity(test_matrix, repo_tfidf_ngram)
cos_sm = similarity_matrix.flatten()
cos_sm[0:6]

array([0.        , 0.01285961, 0.        , 0.        , 0.        ,
       0.00702585])

### Finding top 5 closest matches

In [14]:
## Positions sorted from most to least similar; the first five are the best matches
ranked_positions = np.argsort(cos_sm)[::-1]
top5 = ranked_positions[:5]
print(top5)             ## Row positions (for .iloc), not index labels
print(cos_sm[top5])     ## Matching cosine similarities

[4584 2850 4602 2638 4593]
[0.35621971 0.33996381 0.30860128 0.27278144 0.27192084]


In [15]:
## Pick the five best matches first, then filter by chapter using a mask built from
## those same rows. Building the mask from the full `repo_df` forces pandas to
## reindex it against the 5-row slice and emits a "Boolean Series key will be
## reindexed" UserWarning.
top_matches = repo_df.iloc[top5]
top_matches[top_matches['chapter'] == 'Motion in Two Dimensions']

  """Entry point for launching an IPython kernel.


Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text
13991,JEE,PHY,11,Motion in Two Dimensions,P030411,In Class Test,a ball is projected from a top of a building of height \(20 m\) with velocity \(20 m/s\) at an angle of \(-30^0\) with the horizontal. what is the range of a ball\(?\)
14009,JEE,PHY,11,Motion in Two Dimensions,P030487,In Class Test,a ball is moving with velocity \(5 m/s\) in horizontal direction. and another ball is approaching towards ball with velocity \(5 m/s\) at an angle \(45^0\). then what is the net velocity of approach between two balls?
14000,JEE,PHY,11,Motion in Two Dimensions,P000892,Recall Test,"for a ground to ground projectile motion, the expression for the time of flight is given by"
