In [40]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [28]:
# Example of one question record and the fields it carries.
# NOTE: there must be no trailing comma after the closing brace; with one,
# `q` silently becomes a 1-tuple wrapping the dict instead of the dict itself.
q = {
    "question": "What is the capital of France?",
    "subject": "Geography",
    "question_type": "Multiple Choice",
    "sentence_length": 6,
    "word_frequency": 2.5,
    "vocabulary_complexity": 0.7,
    "syntactic_complexity": 0.6,
    "ambiguity_level": 1,
    "domain_knowledge_required": 0.8,
    "context_relevance": 0.9,
    "formatting_complexity": 0.3,
    "verifiability": 0.7,
    "difficulty": 2
}

# Integer codes for the subjects (the loaded dataset stores `subject` as
# numbers rather than as the names used in the example record above).
Geography = 1
Art = 2
Mathematics = 3
Chemistry = 4

In [29]:
# Step 1: read the question records from the JSON file into a DataFrame.
# (No preprocessing happens in this cell; it only loads the data.)
df = pd.read_json(path_or_buf='questions.json')

In [30]:
# Preview the first rows only, so the cell stays readable as the dataset grows.
df.head()

Unnamed: 0,question,subject,question_type,sentence_length,word_frequency,vocabulary_complexity,syntactic_complexity,ambiguity_level,domain_knowledge_required,context_relevance,formatting_complexity,verifiability,difficulty
0,What is the capital of France?,1,Multiple Choice,6,2.5,0.7,0.6,1.0,0.8,0.9,0.3,0.7,2
1,Who painted the Mona Lisa?,2,Open-ended,5,3.2,0.8,0.7,0.5,0.9,0.8,0.4,0.9,3
2,Solve for x: 2x + 5 = 15,3,Equation,9,1.9,0.6,0.8,0.2,0.6,0.7,0.2,0.6,4
3,What is the chemical symbol for gold?,4,Fill in the blank,7,2.7,0.9,0.5,0.3,0.7,0.6,0.1,0.8,2
4,What year was the United Nations founded?,4,Open-ended,8,2.2,0.7,0.6,0.4,0.5,0.8,0.3,0.9,1


In [31]:
# List the column labels available in the loaded frame.
df.columns

Index(['question', 'subject', 'question_type', 'sentence_length',
       'word_frequency', 'vocabulary_complexity', 'syntactic_complexity',
       'ambiguity_level', 'domain_knowledge_required', 'context_relevance',
       'formatting_complexity', 'verifiability', 'difficulty'],
      dtype='object')

In [32]:
# Discard rows that have no question text.
combine_question_difficulty = df.dropna(subset=['question'], axis=0)

# For every distinct question, count how many non-null difficulty values it
# has, and expose that count as the column `totalDifficultyCount`.
question_difficultyCount = (
    combine_question_difficulty
    .groupby('question')['difficulty']
    .count()
    .reset_index(name='totalDifficultyCount')
    .loc[:, ['question', 'totalDifficultyCount']]
)
question_difficultyCount.head()

Unnamed: 0,question,totalDifficultyCount
0,Solve for x: 2x + 5 = 15,1
1,What is the capital of France?,1
2,What is the chemical symbol for gold?,1
3,What year was the United Nations founded?,1
4,Who painted the Mona Lisa?,1


In [33]:
# Left-join the per-question counts back onto the cleaned rows, matching on
# the question text, so each row also carries `totalDifficultyCount`.
difficulty_with_totalDifficultyCount = pd.merge(
    combine_question_difficulty,
    question_difficultyCount,
    how='left',
    on='question',
)
difficulty_with_totalDifficultyCount.head()

Unnamed: 0,question,subject,question_type,sentence_length,word_frequency,vocabulary_complexity,syntactic_complexity,ambiguity_level,domain_knowledge_required,context_relevance,formatting_complexity,verifiability,difficulty,totalDifficultyCount
0,What is the capital of France?,1,Multiple Choice,6,2.5,0.7,0.6,1.0,0.8,0.9,0.3,0.7,2,1
1,Who painted the Mona Lisa?,2,Open-ended,5,3.2,0.8,0.7,0.5,0.9,0.8,0.4,0.9,3,1
2,Solve for x: 2x + 5 = 15,3,Equation,9,1.9,0.6,0.8,0.2,0.6,0.7,0.2,0.6,4,1
3,What is the chemical symbol for gold?,4,Fill in the blank,7,2.7,0.9,0.5,0.3,0.7,0.6,0.1,0.8,2,1
4,What year was the United Nations founded?,4,Open-ended,8,2.2,0.7,0.6,0.4,0.5,0.8,0.3,0.9,1,1


In [34]:
# Show floats with three decimals in pandas output from here on.
pd.options.display.float_format = '{:.3f}'.format

# Summary statistics of the per-question difficulty counts.
print(question_difficultyCount.loc[:, 'totalDifficultyCount'].describe())

count   5.000
mean    1.000
std     0.000
min     1.000
25%     1.000
50%     1.000
75%     1.000
max     1.000
Name: totalDifficultyCount, dtype: float64


In [35]:
# Minimum number of difficulty entries a question needs in order to be kept.
# In the data shown every question has a count of 1, so nothing is dropped.
popularity_threshold = 1
qDifficulty = difficulty_with_totalDifficultyCount.loc[
    difficulty_with_totalDifficultyCount['totalDifficultyCount'] >= popularity_threshold
]
qDifficulty.head()

Unnamed: 0,question,subject,question_type,sentence_length,word_frequency,vocabulary_complexity,syntactic_complexity,ambiguity_level,domain_knowledge_required,context_relevance,formatting_complexity,verifiability,difficulty,totalDifficultyCount
0,What is the capital of France?,1,Multiple Choice,6,2.5,0.7,0.6,1.0,0.8,0.9,0.3,0.7,2,1
1,Who painted the Mona Lisa?,2,Open-ended,5,3.2,0.8,0.7,0.5,0.9,0.8,0.4,0.9,3,1
2,Solve for x: 2x + 5 = 15,3,Equation,9,1.9,0.6,0.8,0.2,0.6,0.7,0.2,0.6,4,1
3,What is the chemical symbol for gold?,4,Fill in the blank,7,2.7,0.9,0.5,0.3,0.7,0.6,0.1,0.8,2,1
4,What year was the United Nations founded?,4,Open-ended,8,2.2,0.7,0.6,0.4,0.5,0.8,0.3,0.9,1,1


In [36]:
# (rows, columns) of the filtered frame, as a quick check of what survived the threshold.
qDifficulty.shape

(5, 14)

In [38]:
# Build the question x subject matrix: one row per question, one column per
# subject code, each cell holding the difficulty (pivot_table averages
# duplicates by default), and 0 wherever a question/subject pair is absent.
#
# NOTE(review): in the data shown each question belongs to one subject, so
# every row has a single non-zero entry. Cosine distance between such rows can
# only be 0 (same subject) or 1 (different subject), which makes the
# neighbour ranking further down uninformative. Consider building this matrix
# from the numeric feature columns instead -- TODO confirm intent.
question_features_df = pd.pivot_table(
    qDifficulty,
    values='difficulty',
    index='question',
    columns='subject',
).fillna(0)
question_features_df.head()

subject,1,2,3,4
question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Solve for x: 2x + 5 = 15,0.0,0.0,4.0,0.0
What is the capital of France?,2.0,0.0,0.0,0.0
What is the chemical symbol for gold?,0.0,0.0,0.0,2.0
What year was the United Nations founded?,0.0,0.0,0.0,1.0
Who painted the Mona Lisa?,0.0,3.0,0.0,0.0


In [39]:
# csr_matrix and NearestNeighbors are imported in the notebook's first
# (imports) cell, so this cell no longer carries its own import statements.

# Sparse copy of the question feature matrix. Despite the `movie_` prefix in
# its name, this holds the rows of `question_features_df`.
movie_features_df_matrix = csr_matrix(question_features_df.values)

# Brute-force nearest-neighbour index using cosine distance between rows.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(movie_features_df_matrix)

In [46]:
# Pick one question at random to act as the query. The generator is not
# seeded here, so the chosen row can differ between runs.
n_questions = question_features_df.shape[0]
query_index = np.random.choice(n_questions)
print(query_index)

# kneighbors raises ValueError when asked for more neighbours than there are
# fitted rows, so cap the request at the size of the matrix instead of
# hard-coding 5.
query_vector = question_features_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=min(5, n_questions))

1


In [47]:
# Header: the question the recommendations are for.
print('Recommendations for {0}:\n'.format(question_features_df.index[query_index]))

# Exclude the query by its row position instead of assuming it is the first
# neighbour returned. When another question ties with the query (e.g. cosine
# distance 0), the query is not guaranteed to be at position 0; skipping
# position 0 blindly would then hide a real neighbour and list the query as
# its own recommendation.
neighbours = [
    (row_pos, dist)
    for row_pos, dist in zip(indices.flatten(), distances.flatten())
    if row_pos != query_index
]
for rank, (row_pos, dist) in enumerate(neighbours, start=1):
    print('{0}: {1}, with distance of {2}:'.format(rank, question_features_df.index[row_pos], dist))


Recommendations for What is the capital of France?:

1: Solve for x: 2x + 5 = 15, with distance of 1.0:
2: What is the chemical symbol for gold?, with distance of 1.0:
3: What year was the United Nations founded?, with distance of 1.0:
4: Who painted the Mona Lisa?, with distance of 1.0:
