In [10]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Load Squad data

In [4]:
# based on: https://www.kaggle.com/code/sanjay11100/squad-stanford-q-a-json-to-pandas-dataframe
def squad_json_to_dataframe(file_path, record_path=['data','paragraphs','qas','answers']):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    """
    file = json.loads(open(file_path).read())
    # parsing different level's in the json file
    js = pd.json_normalize(file, record_path)
    m = pd.json_normalize(file, record_path[:-1])
    r = pd.json_normalize(file,record_path[:-2])
    # combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = idx
    data = m[['id','question','context','answers']].set_index('id').reset_index()
    data['c_id'] = data['context'].factorize()[0]
    return data

I downloaded locally the dataset as it is not heavy. Paths: 
'/Users/ezagury/Downloads/squad1.1/train-v1.1.json' 
'/Users/ezagury/Downloads/squad1.1/dev-v1.1.json

In [5]:
## Load dataset
squad = squad_json_to_dataframe('/Users/ezagury/Downloads/squad1.1/train-v1.1.json')

## get a list of contexts and associated ids 
documents = squad[['context', 'c_id']].drop_duplicates().reset_index(drop=True)

## Retrieve context using TF-IDF + Knn

### Model definition

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# defining the TF-IDF
tfidf_configs = {
    'lowercase': True,
    'analyzer': 'word',
    'stop_words': 'english',
    'binary': True,
    'max_df': 0.9,
    'max_features': 10_000
}
# defining the number of documents to retrieve
retriever_configs = {
    'n_neighbors': 1,
    'metric': 'cosine'
}


In [68]:
class tf_idf_retriever:
    def __init__(self):
    # defining our pipeline
        self.embedding = TfidfVectorizer(**tfidf_configs)
        self.retriever = NearestNeighbors(**retriever_configs)

    def fit(self, contexts):
        ''' Fit context embeddings to context ids
            Input : dict/dataframe in format {'context','c_id'}
        '''
        # let's train the model to retrieve the document id 'c_id'
        X = self.embedding.fit_transform(contexts['context'])
        self.retriever.fit(X, contexts['c_id'])

    def predict(self, question):
        ''' Predict predict k best contexts for each question
            Input : list/Series of str objects (questions)
            Output : array of context ids of shape len(question)*k 
        '''
        X = self.embedding.transform(question)
        y_pred = self.retriever.kneighbors(X, return_distance=False)
        return y_pred
    
    def vectorized(self, text): 
        ''' Return vectorized version of text, using the model vectorizer'''
        vector = self.embedding.transform([text])
        return self.embedding.inverse_transform(vector)


### Train/predict

In [62]:
model = tf_idf_retriever()
model.fit(documents)

In [63]:
whole = model.predict(squad['question'])

Let's make prediction by splitting dataset

In [21]:
X_train, X_test, y_train, y_test = train_test_split(squad[['context', 'c_id']], squad['question'], test_size=0.3, random_state=0)

In [23]:
## Fit on train set
model.fit(X_train)

## Predict over whole set
train = model.predict(y_train)
test = model.predict(y_test)

## Get real context ids
y_train = X_train['c_id']
y_test = X_test['c_id']

### Get precision

In [None]:
def precision(y_pred: np.array, y_true: pd.Series) -> float:
    ''' Input : vector of true context ids, vector of context id predictions
        Output : precision = number of right predictions / Total predictions
    '''
    if len(y_pred) > 0:
        return sum(y_pred.squeeze() == y_true.values)/len(y_true)
    else: return 0

def top_accuracy(y_true, y_pred) -> float:  ## Utile si on prédit plusieurs contexts pr 1 question
    right, count = 0, 0
    for y_t in y_true:
        count += 1
        if y_t in y_pred:
            right += 1
    return right / count if count > 0 else 0

In [59]:
precision( train, y_train)

4.8924476915800974e-05

In [60]:
precision( test, y_test)

0.0

In [64]:
precision(whole, squad.c_id)

0.43215105195264786

###  Appreciate vectorizer

In [72]:
model = tf_idf_retriever()
model.fit(documents)
model.vectorized('Where is the headquarters of the Congregation of the Holy Cross?')

[array(['holy', 'headquarters', 'cross', 'congregation'], dtype='<U18')]

## Using Word2Vect + knn

In [73]:
from gensim.parsing.preprocessing import preprocess_string

# create a corpus of tokens
corpus = documents['context'].tolist()
corpus = [preprocess_string(t) for t in corpus]

In [74]:
from gensim.models import Word2Vec
import gensim.downloader

# you can download a pretrained Word2Vec
# - or you can train your own model

# download a model
# 'glove-wiki-gigaword-300' (376.1 MB)
# 'word2vec-ruscorpora-300' (198.8 MB)
# 'word2vec-google-news-300' (1.6 GB)
vectorizer = gensim.downloader.load('word2vec-ruscorpora-300')

# train your own model
vectorizer = Word2Vec(sentences=corpus, vector_size=300, window=5, min_count=1, workers=4).wv

[=====---------------------------------------------] 11.8% 23.4/198.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



