## Importing modules

In [1]:
import json
# for loading the directory of the file
from os import listdir
import gensim
import logging
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords

## Data 

In [2]:
df_squad_train = pd.read_json('data/squad_train_doc.json')
df_squad_dev = pd.read_json('data/squad_dev_doc.json')
df_dev = pd.read_json('data/dev-v1.1.json')
df_train = pd.read_json('data/train-v1.1.json')

In [3]:
df_squad_train.rename(columns={'passages':'documents'}, inplace=True)
df_squad_train

Unnamed: 0,documents,title
0,"[{'context': 'Architecturally, the school has ...",University_of_Notre_Dame
1,[{'context': 'Beyoncé Giselle Knowles-Carter (...,Beyoncé
2,[{'context': 'Montana i/mɒnˈtænə/ is a state i...,Montana
3,"[{'context': 'The phrase ""in whole or in part""...",Genocide
4,[{'context': 'The emergence of resistance of b...,Antibiotics
5,[{'context': 'Frédéric François Chopin (/ˈʃoʊp...,Frédéric_Chopin
6,[{'context': 'The exact nature of relations be...,Sino-Tibetan_relations_during_the_Ming_dynasty
7,[{'context': 'The iPod is a line of portable m...,IPod
8,[{'context': 'The Legend of Zelda: Twilight Pr...,The_Legend_of_Zelda:_Twilight_Princess
9,[{'context': 'Spectre (2015) is the twenty-fou...,Spectre_(2015_film)


In [4]:
df_squad_train_new = df_squad_train.groupby('title').documents.apply(lambda x: pd.DataFrame(x.values[0])).reset_index().drop('level_1', axis = 1)
df_squad_train_new.rename(columns={'questions':'question set'}, inplace=True)
df_squad_train_new

Unnamed: 0,title,context,question set
0,2008_Sichuan_earthquake,The 2008 Sichuan earthquake or the Great Sichu...,[In what year did the earthquake in Sichuan oc...
1,2008_Sichuan_earthquake,It is also known as the Wenchuan earthquake (C...,"[What was the focal depth of the quake?, Where..."
2,2008_Sichuan_earthquake,"Official figures (as of July 21, 2008 12:00 CS...","[How many people died in Sichuan Province?, Ho..."
3,2008_Sichuan_earthquake,The earthquake had a magnitude of 8.0 Ms and 7...,"[What percentage of buildings were destroyed?,..."
4,2008_Sichuan_earthquake,According to a study by the China Earthquake A...,"[How deep was the focus of the earthquake?, Wh..."
5,2008_Sichuan_earthquake,Malaysia-based Yazhou Zhoukan conducted an int...,"[What was the predicted time of the quake?, Wh..."
6,2008_Sichuan_earthquake,In a United States Geological Survey (USGS) st...,[What did the United States Geological Survey ...
7,2008_Sichuan_earthquake,Japanese seismologist Yuji Yagi at the Univers...,"[Why did the seismic waves travel so far?, How..."
8,2008_Sichuan_earthquake,"Between 64 and 104 major aftershocks, ranging ...",[When did the latest magnitude 6 aftershock oc...
9,2008_Sichuan_earthquake,"(The Ms 6.1 earthquake on August 30, 2008 in s...","[Where did this earthquake occur?, Where shoul..."


In [5]:
dffinal = pd.DataFrame(df_squad_train_new.groupby('title').context.agg(sum))
dffinal = dffinal.reset_index()
dffinal

Unnamed: 0,title,context
0,2008_Sichuan_earthquake,The 2008 Sichuan earthquake or the Great Sichu...
1,2008_Summer_Olympics_torch_relay,The 2008 Summer Olympics torch relay was run f...
2,51st_state,"The phrase ""51st state"" can be used in a posit..."
3,ASCII,"Originally based on the English alphabet, ASCI..."
4,A_cappella,"A cappella [a kapˈpɛlla] (Italian for ""in the ..."
5,Adolescence,A thorough understanding of adolescence in soc...
6,Adult_contemporary_music,Adult contemporary music (AC) is a style of mu...
7,Affirmative_action_in_the_United_States,Affirmative action in the United States tends ...
8,Age_of_Enlightenment,French historians traditionally place the Enli...
9,Aircraft_carrier,A fleet carrier is intended to operate with th...


In [6]:
### Train Data
train_1 = [nltk.word_tokenize(dffinal['context'][i]) for i in range(len(dffinal['context']))]

In [7]:
stop_words = stopwords.words('english')

In [8]:
### Removing Stop Words
train_wo_sw= []
for i in range(len(train_1)):
    for w in train_1[i]:
        word_removed = []
        if w not in stop_words:
            word_removed.append(w)
        train_wo_sw.append(word_removed)
        

In [9]:
### Model
BM_25 = gensim.summarization.bm25.BM25(train_1)
BM_25

<gensim.summarization.bm25.BM25 at 0x7f9eb1621a20>

In [10]:
query = "When is Childern's day"


In [11]:
def top_n_score(query, dffinal = dffinal,model = BM_25):
    query = query.split()
    
    scores = BM_25.get_scores(query,1)
    score_len = len(scores)
    df = pd.DataFrame(data = dffinal['title'])
    df['score'] = scores
    df = df.sort_values(by=['score'],ascending=False).reset_index()
    df.drop(['index'],inplace=True,axis=1)
    df= df.reset_index()
    df.columns = ['position','title','score']
    df['position']=df['position']+1
    z = dffinal.merge(df,how='outer',on=['title'])
    return z 

In [12]:
temp = top_n_score(query)

In [13]:
temp

Unnamed: 0,title,context,position,score
0,2008_Sichuan_earthquake,The 2008 Sichuan earthquake or the Great Sichu...,162,1.170319
1,2008_Summer_Olympics_torch_relay,The 2008 Summer Olympics torch relay was run f...,52,1.638672
2,51st_state,"The phrase ""51st state"" can be used in a posit...",292,1.041718
3,ASCII,"Originally based on the English alphabet, ASCI...",130,1.461931
4,A_cappella,"A cappella [a kapˈpɛlla] (Italian for ""in the ...",205,1.124150
5,Adolescence,A thorough understanding of adolescence in soc...,10,1.736421
6,Adult_contemporary_music,Adult contemporary music (AC) is a style of mu...,65,1.613765
7,Affirmative_action_in_the_United_States,Affirmative action in the United States tends ...,120,1.473989
8,Age_of_Enlightenment,French historians traditionally place the Enli...,276,1.044530
9,Aircraft_carrier,A fleet carrier is intended to operate with th...,245,1.047128


In [14]:
z = pd.DataFrame(df_squad_train_new.groupby('title')['question set'].agg('sum')).reset_index()
z.columns = ['title','questions']
z

Unnamed: 0,title,questions
0,2008_Sichuan_earthquake,[In what year did the earthquake in Sichuan oc...
1,2008_Summer_Olympics_torch_relay,[How many days did people carry the Olympic to...
2,51st_state,"[What is a positive connotation of the label ""..."
3,ASCII,[How many characters are printable characters?...
4,A_cappella,[A cappella was originally used to tell the di...
5,Adolescence,"[What is the cultural purpose of adolescence?,..."
6,Adult_contemporary_music,"[What does the acronym AC stand for?, What was..."
7,Affirmative_action_in_the_United_States,[Affirmative action does not only attempt to r...
8,Age_of_Enlightenment,[Recent historians begin the period of Enlight...
9,Aircraft_carrier,"[What were most escort carriers built from?, W..."


In [15]:
def top_n_title(data_frame, counts):
    titles = []
    for count in range(1,counts+1):
        k = data_frame[data_frame.position==count]['title']
        titles.append(k.values[0])
    return titles
        

## Overall accuracy window_size = 1

In [16]:
def overall_accuracy(window_size):
    count = 0
    total_count = 0
    for index, row in z.iterrows():
        temp_title = row['title']
        #print(temp_title)
        top_count_title_list = []
        for question in row['questions']:
            temporary = top_n_score(question,dffinal = dffinal,model = BM_25)
            top_count_title_list = top_n_title(temporary,window_size)
            total_count += 1
            #print(top_count_title_list)
            if temp_title in top_count_title_list:
                count +=1
    return count,total_count

In [24]:
num_1,den_1 = overall_accuracy(1)

In [25]:
acc_1 = num_1/den_1

In [26]:
acc_1

0.5154461130236867

## Overall Accuracy Window_Size = 10

In [19]:
num_2,den_2 = overall_accuracy(10)

In [20]:
acc_2 = num_2/den_2

In [21]:
acc_2

0.7822218155616301

## Overall Accuracy Window_Size = 20

In [22]:
num_3,den_3 = overall_accuracy(20)
acc_3 = num_3/den_3
acc_3

0.83543970811936

## Overall Accuracy Window_Size = 25

In [23]:
num_4,den_4 = overall_accuracy(25)
acc_4 = num_4/den_4
acc_4

0.8512918463280454

In [None]:
num_5,den_5 = overall_accuracy(100)
acc_5 = num_5/den_5
acc_5