### Source : https://www.kaggle.com/tj2552/similarity-techniques-nlp

In [1]:
import pandas as pd
import numpy as np
import json                           ## To convert json raw data to df
from bs4 import BeautifulSoup, Tag    ## Cleaning HTML tags from text

import re
import gensim
from gensim import corpora
from nltk.corpus import stopwords    ## Need to download stopwords using '''nltk.download('stopwords')'''
from nltk.stem.porter import *

In [2]:
with open('data/qs_topicwise.json') as json_data:
    Qs = json.load(json_data)

Qs[1]

{'subject': 'MTH',
 'grade': '12',
 'curriculum': 'JEE',
 'chapter': 'Inverse Trigonometry ',
 'chapter_no': '18',
 'topic': 'Introduction to Inverse Trigonometry',
 'topic_no': '01',
 'difficulty': '1',
 'problem_code': 'P005928',
 'problem_status': 'final',
 'problem_mongo_id': '56f2348c3562d9749900083a',
 'problem_type': 'Spot Test',
 'options': ' \\(\\frac{\\pi}2\\) \\(\\frac{\\pi}4\\) \\(\\frac{\\pi}3\\) \\(\\frac{\\pi}6\\)',
 'solution': '',
 'question_text': '\\(\\sin^{−1}\\left(\u2061\\frac{1}{√2}\\right)=\\)________'}

In [3]:
data_df = pd.DataFrame(columns=['curriculum', 'subject', 'grade', 'chapter', 'problem_code',
                                'problem_type','question_text'])
data_df.head()

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text


In [4]:
questions = []
i = 0
for question in Qs:
    #topic_code = question['topic_code']  ## Not in dataset anymore, already split
    try: 
        question_text = question['question_text'].lower()
        question_text = BeautifulSoup(question_text, "html.parser").get_text()   ## Clean HTMl tags
        question_text = " ".join(question_text.split())
        subject = question['subject']
        curriculum = question['curriculum']
        grade = question['grade']
        curr_question = {}
        if(curriculum in ["CBSE", "JEE"] and grade in ["9", "10", "11", "12"] and "dummy" not in question_text):
            data_df.loc[i] = [curriculum, subject, grade, question['chapter'], question['problem_code'],
                              question['problem_type'], question_text]
            i += 1
    except:
            pass

data_df.head(3)

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text
0,JEE,MTH,12,Inverse Trigonometry,P000321,ConcepTest,"among the statements given below, which one is..."
1,JEE,MTH,12,Inverse Trigonometry,P005928,Spot Test,\(\sin^{−1}\left(⁡\frac{1}{√2}\right)=\)________
2,JEE,MTH,12,Inverse Trigonometry,P005929,Spot Test,the principal domain of \(\cos⁡𝑥\) is ___________


In [5]:
data_df.shape

(21067, 7)

In [6]:
words = re.compile(r"\w+",re.I)
stopword = stopwords.words('english')
stemmer = PorterStemmer()

def tokenize_questions(df):
    question_tokenized = []
   
    for q in df.question_text.tolist():
        question_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(q) if i not in stopword])

    df["question_tokenized"] = question_tokenized
    
    return df

In [7]:
def token_dictionary(df):
    
    questions_tokenized = df.question_tokenized.tolist()
    
    dictionary = corpora.Dictionary(questions_tokenized)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    dictionary.compactify()
    
    return dictionary

In [8]:
data_df = tokenize_questions(data_df)
data_df.head()

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text,question_tokenized
0,JEE,MTH,12,Inverse Trigonometry,P000321,ConcepTest,"among the statements given below, which one is...","[among, statement, given, one, correct]"
1,JEE,MTH,12,Inverse Trigonometry,P005928,Spot Test,\(\sin^{−1}\left(⁡\frac{1}{√2}\right)=\)________,"[sin, 1, left, frac, 1, 2, right, ________]"
2,JEE,MTH,12,Inverse Trigonometry,P005929,Spot Test,the principal domain of \(\cos⁡𝑥\) is ___________,"[princip, domain, co, 𝑥, ___________]"
3,JEE,MTH,12,Inverse Trigonometry,P005930,Spot Test,the principal domain of \(\tan⁡𝑥\) is ___________,"[princip, domain, tan, 𝑥, ___________]"
4,JEE,MTH,12,Inverse Trigonometry,P005931,Spot Test,\(\tan^{-1}\left(\sin\frac{\pi}2\right)=\),"[tan, 1, left, sin, frac, pi, 2, right]"


In [9]:
dictionary = token_dictionary(data_df)
print ("No of words in the dictionary = %s" %len(dictionary.token2id))

No of words in the dictionary = 3583


This would be the size of each of the vector in the question set.

### Create Vector
Here we are using the simple method of Bag Of Words Technique to convert sentences into vectors. There are two vector matrices thus created where each of the matrix is a sparse matrix to save memory in the system.

In [10]:
question_vec = [dictionary.doc2bow(text) for text in data_df.question_tokenized.tolist()]
    
question_csc = gensim.matutils.corpus2csc(question_vec, num_terms=len(dictionary.token2id))
    
q_csc = question_csc.transpose()

print (q_csc.shape)

(21067, 3583)


## Create test question vector to check similarity against the existing repo

In [29]:
test_q = 'Calculate the time of flight of a ball launched with a velocty of 5 m / s at an angle of 30 degrees'
data_df['test_q'] = test_q

#### Tokenizing test question

In [30]:
test_q_tokenized = []
test_q_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(test_q) if i not in stopword])
test_q_tokenized

[['calcul',
  'time',
  'flight',
  'ball',
  'launch',
  'velocti',
  '5',
  'angl',
  '30',
  'degre']]

In [31]:
test_q_tokenized *= len(data_df)   ## Duplcating item in the list to match length of df

In [32]:
data_df['test_q_tok'] = test_q_tokenized
data_df.head()

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text,question_tokenized,test_q,test_q_tok,cosine_sim,jaccard_sim,manhattan_dis,eucledian_dis,minkowsk_dis
0,JEE,MTH,12,Inverse Trigonometry,P000321,ConcepTest,"among the statements given below, which one is...","[among, statement, given, one, correct]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.03012,0.021957,0.021957
1,JEE,MTH,12,Inverse Trigonometry,P005928,Spot Test,\(\sin^{−1}\left(⁡\frac{1}{√2}\right)=\)________,"[sin, 1, left, frac, 1, 2, right, ________]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.048193,0.038806,0.038806
2,JEE,MTH,12,Inverse Trigonometry,P005929,Spot Test,the principal domain of \(\cos⁡𝑥\) is ___________,"[princip, domain, co, 𝑥, ___________]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.03012,0.021957,0.021957
3,JEE,MTH,12,Inverse Trigonometry,P005930,Spot Test,the principal domain of \(\tan⁡𝑥\) is ___________,"[princip, domain, tan, 𝑥, ___________]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.03012,0.021957,0.021957
4,JEE,MTH,12,Inverse Trigonometry,P005931,Spot Test,\(\tan^{-1}\left(\sin\frac{\pi}2\right)=\),"[tan, 1, left, sin, frac, pi, 2, right]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.048193,0.032466,0.032466


#### Creating vector for test question

In [33]:
test_question_vec = [dictionary.doc2bow(text) for text in data_df.test_q_tok.tolist()]
    
test_question_csc = gensim.matutils.corpus2csc(test_question_vec, num_terms=len(dictionary.token2id))
    
test_q_csc = test_question_csc.transpose()

### Define Similarity Calculation Fucntions
- Cosine Distance
- Euclidean Distance
- Manhattan Distance
- Jaccard Distance
- Minkowski Distance

In [34]:
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.metrics.pairwise import manhattan_distances as md
from sklearn.metrics.pairwise import euclidean_distances as ed
from sklearn.metrics import jaccard_similarity_score as js
from sklearn.neighbors import DistanceMetric
from sklearn.preprocessing import MinMaxScaler

minkowski_dis = DistanceMetric.get_metric('minkowski')
mms_scale_man = MinMaxScaler()
mms_scale_euc = MinMaxScaler()
mms_scale_mink = MinMaxScaler()

In [35]:
def get_similarity_values(q1_csc, q2_csc):
    cosine_sim = []
    jaccard_sim = []
    manhattan_dis = []
    eucledian_dis = []
    minkowsk_dis = []
    
    for i,j in zip(q1_csc, q2_csc):
        sim = cs(i,j)
        cosine_sim.append(sim[0][0])
        sim = md(i,j)
        manhattan_dis.append(sim[0][0])
        sim = ed(i,j)
        eucledian_dis.append(sim[0][0])
        i_ = i.toarray()
        j_ = j.toarray()
        try:
            sim = js(i_,j_)
            jaccard_sim.append(sim)
        except:
            jaccard_sim.append(0)
            
        sim = minkowski_dis.pairwise(i_,j_)
        minkowsk_dis.append(sim[0][0])
    
    return cosine_sim, jaccard_sim, manhattan_dis, eucledian_dis, minkowsk_dis

### Calculating similarity between test question & the entire repo

In [36]:
cosine_sim, jaccard_sim, manhattan_dis, eucledian_dis, minkowsk_dis = get_similarity_values(q_csc, test_q_csc)

In [37]:
print ("cosine_sim sample= \n", cosine_sim[0:5])
print ("jaccard_sim sample = \n", jaccard_sim[0:5])

cosine_sim sample= 
 [0.0, 0.0, 0.0, 0.0, 0.0]
jaccard_sim sample = 
 [0.0, 0, 0.0, 0.0, 0.0]


As Eucledian, Manhattan and Minkowski Distance may go beyond 1 we must scale them down between0 - 1 , for that we are using MinMaxScaler and training them on training data.

In [38]:
eucledian_dis_array = np.array(eucledian_dis).reshape(-1,1)
manhattan_dis_array = np.array(manhattan_dis).reshape(-1,1)
minkowsk_dis_array = np.array(minkowsk_dis).reshape(-1,1)
    
manhattan_dis_array = mms_scale_man.fit_transform(manhattan_dis_array)
eucledian_dis_array = mms_scale_euc.fit_transform(eucledian_dis_array)
minkowsk_dis_array = mms_scale_mink.fit_transform(minkowsk_dis_array)

eucledian_dis = eucledian_dis_array.flatten()
manhattan_dis = manhattan_dis_array.flatten()
minkowsk_dis = minkowsk_dis_array.flatten()

print ("manhattan_dis sample = \n", manhattan_dis[0:5])
print ("eucledian_dis sample = \n", eucledian_dis[0:5])
print ("minkowsk_dis sample = \n", minkowsk_dis[0:5])

manhattan_dis sample = 
 [0.0304878  0.04878049 0.0304878  0.0304878  0.04878049]
eucledian_dis sample = 
 [0.01788701 0.0327734  0.01788701 0.01788701 0.02708663]
minkowsk_dis sample = 
 [0.01788701 0.0327734  0.01788701 0.01788701 0.02708663]


In [39]:
data_df['cosine_sim'] = cosine_sim
data_df['jaccard_sim'] = jaccard_sim
data_df['manhattan_dis'] = manhattan_dis
data_df['eucledian_dis'] = eucledian_dis
data_df['minkowsk_dis'] = minkowsk_dis

data_df.head()

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text,question_tokenized,test_q,test_q_tok,cosine_sim,jaccard_sim,manhattan_dis,eucledian_dis,minkowsk_dis
0,JEE,MTH,12,Inverse Trigonometry,P000321,ConcepTest,"among the statements given below, which one is...","[among, statement, given, one, correct]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.030488,0.017887,0.017887
1,JEE,MTH,12,Inverse Trigonometry,P005928,Spot Test,\(\sin^{−1}\left(⁡\frac{1}{√2}\right)=\)________,"[sin, 1, left, frac, 1, 2, right, ________]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.04878,0.032773,0.032773
2,JEE,MTH,12,Inverse Trigonometry,P005929,Spot Test,the principal domain of \(\cos⁡𝑥\) is ___________,"[princip, domain, co, 𝑥, ___________]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.030488,0.017887,0.017887
3,JEE,MTH,12,Inverse Trigonometry,P005930,Spot Test,the principal domain of \(\tan⁡𝑥\) is ___________,"[princip, domain, tan, 𝑥, ___________]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.030488,0.017887,0.017887
4,JEE,MTH,12,Inverse Trigonometry,P005931,Spot Test,\(\tan^{-1}\left(\sin\frac{\pi}2\right)=\),"[tan, 1, left, sin, frac, pi, 2, right]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.0,0.0,0.04878,0.027087,0.027087


### Finding top 5 closest matches

In [42]:
data_df.sort_values(by = ['cosine_sim', 'jaccard_sim'], ascending = False).head(10)

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text,question_tokenized,test_q,test_q_tok,cosine_sim,jaccard_sim,manhattan_dis,eucledian_dis,minkowsk_dis
9406,JEE,MTH,11,Trigonometry,P029081,Homework,the difference between two acute angles of a r...,"[differ, two, acut, angl, right, angl, triangl...",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.38236,0.0,0.042683,0.02997,0.02997
8162,JEE,MTH,11,Fundamentals of Mathematics,P025186,Homework,\(\log_ba^5\times\log_cb^3\times\log_ac^7=\),"[log_ba, 5, time, log_cb, 3, time, log_ac, 7]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.377964,0.0,0.006098,0.003914,0.003914
13488,JEE,PHY,11,Dynamics of Motion,P001326,Test,a body of mass \(5\times 10^{-3} kg\) is launc...,"[bodi, mass, 5, time, 10, 3, kg, launch, upon,...",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.364405,0.0,0.115854,0.072353,0.072353
14009,JEE,PHY,11,Motion in Two Dimensions,P030487,In Class Test,a ball is moving with velocity \(5 m/s\) in ho...,"[ball, move, veloc, 5, horizont, direct, anoth...",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.35583,0.0,0.091463,0.076318,0.076318
11503,JEE,MTH,11,Binomial Theorem,P035772,Homework,the remainder when\(x=5^{5^{5^{.^{.^{.^5}}}}}\...,"[remaind, x, 5, 5, 5, 5, 24, time, 5, divid, 24]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.348155,0.0,0.042683,0.059745,0.059745
13987,JEE,PHY,11,Motion in Two Dimensions,P030391,In Class Exercise,a ball is projected from ground at an angle of...,"[ball, project, ground, angl, 45, circ, cross,...",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.340207,0.0,0.073171,0.043311,0.043311
13990,JEE,PHY,11,Motion in Two Dimensions,P030406,In Class Exercise,"if for a given angle of projection, the horizo...","[given, angl, project, horizont, rang, doubl, ...",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.333333,0.2,0.018293,0.011193,0.011193
11279,JEE,MTH,11,Permutations and Combinations,P031593,Homework,the sum of all the divisors of \(2^5\times 3^4...,"[sum, divisor, 2, 5, time, 3, 4, time, 5, 2]",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.333333,0.0,0.036585,0.027087,0.027087
21031,JEE,PHY,9,Motion in a Plane,P040825,Test,the time taken in reaching the maximum height ...,"[time, taken, reach, maximum, height, ________...",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.333333,0.0,0.018293,0.011193,0.011193
14135,JEE,PHY,11,Motion in Two Dimensions,P034337,Test,a body is projected at time \(t=0\) with veloc...,"[bodi, project, time, 0, veloc, u, angl, theta...",Calculate the time of flight of a ball launche...,"[calcul, time, flight, ball, launch, velocti, ...",0.324443,0.0,0.085366,0.070329,0.070329


In [28]:
data_df.sort_values(by = ['minkowsk_dis'], ascending = False).head()

Unnamed: 0,curriculum,subject,grade,chapter,problem_code,problem_type,question_text,question_tokenized,test_q,test_q_tok,cosine_sim,jaccard_sim,manhattan_dis,eucledian_dis,minkowsk_dis
786,JEE,MTH,12,Applications of Derivatives,P050284,Test,let \(f(x)\) and \(g(x)\) be real valued diffe...,"[let, f, x, g, x, real, valu, differenti, func...",Find the gravitational pull between Earth & Moon.,"[find, gravit, pull, earth, moon]",0.0,0.0,0.831325,1.0,1.0
537,JEE,MTH,12,Continuity and Differentiability,P053782,Test,question by appropriately matching the informa...,"[question, appropri, match, inform, given, thr...",Find the gravitational pull between Earth & Moon.,"[find, gravit, pull, earth, moon]",0.0,0.0,0.909639,0.943795,0.943795
538,JEE,MTH,12,Continuity and Differentiability,P053783,Test,question by appropriately matching the informa...,"[question, appropri, match, inform, given, thr...",Find the gravitational pull between Earth & Moon.,"[find, gravit, pull, earth, moon]",0.0,0.0,0.909639,0.943795,0.943795
536,JEE,MTH,12,Continuity and Differentiability,P053781,Test,question by appropriately matching the informa...,"[question, appropri, match, inform, given, thr...",Find the gravitational pull between Earth & Moon.,"[find, gravit, pull, earth, moon]",0.0,0.0,0.909639,0.943795,0.943795
11788,JEE,MTH,11,Limits,P031445,Test,statement 1 : \(\lim\limits_{x\to\infty}\left(...,"[statement, 1, lim, limits_, x, infti, left, f...",Find the gravitational pull between Earth & Moon.,"[find, gravit, pull, earth, moon]",0.0,0.0,0.740964,0.897621,0.897621


#### Only cosine similarity seems meaningful from all the similarity metrics
#### We also need to add tf-idf (only tf?) and n-grams to this model