In [24]:
import pandas as pd
import tqdm.auto
import numpy as np
import glob
import concurrent.futures
import multiprocessing
import requests
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
import pprint

In [None]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

In [2]:
#ELASTIC_HOST="np-database.c.np-training.internal"
ELASTIC_HOST="localhost"
ELASTIC_INDEX="stackoverflow"
ELASTIC_PORT=9200

In [101]:
def create_index(client,index:str, num_shards=3):
    """Creates an index in Elasticsearch if one isn't already there."""
    
    client.indices.delete(index)
    
    client.indices.create(
        index=index,
        body={
            "settings": {"number_of_shards": num_shards},
            # "mappings": {
            #     "properties": {
            #         "name": {"type": "text"},
            #         "borough": {"type": "keyword"},
            #         "cuisine": {"type": "keyword"},
            #         "grade": {"type": "keyword"},
            #         "location": {"type": "geo_point"},
            #     }
            # },
        },
        ignore=400,
    )


def generate_docs(df:pd.DataFrame):
    """Reads the file through csv.DictReader() and for each row
    yields a single document. This function is passed into the bulk()
    helper to create many documents in sequence.
    """
    
    
    for index, row in df.iterrows():
        doc = {**row} 
        
        doc['_id'] = doc["Id"]
        
        for k in list(doc.keys()):
            #print (k , doc[k])
            if type(doc[k]) !=list and (doc[k] ==None or  ( pd.isna( doc[k] )  )) :
                del doc[k]
                
        del doc['Id']
        yield doc
        


def fetch_results(client:Elasticsearch, query:str,  num_hits=5, fields = ["Title", "QuestionBody"]):
    
    resp = client.search(
        body={
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": fields,
                }
            }
        },
        size = num_hits
    )
    
    return resp
    

        

In [102]:
type(client)

elasticsearch.client.Elasticsearch

In [None]:
# requests.put(f"http://{ELASTIC_HOST}:{ELASTIC_PORT}/_template/index_defaults", json = 
#     {
#       "index_patterns": "*", 
#       "settings": {
#         "number_of_shards": 5
#       }
#     } 
# ).json()


In [None]:
# requests.delete(f"http://{ELASTIC_HOST}:{ELASTIC_PORT}/{ELASTIC_INDEX}").json()

In [None]:
# requests.delete(f"http://{ELASTIC_HOST}:{ELASTIC_PORT}/label").json()

In [5]:
!gsutil ls gs://np-training-tmp/stackoverflow/final/

gs://np-training-tmp/stackoverflow/final/posts.parquet
gs://np-training-tmp/stackoverflow/final/related_posts.parquet


In [6]:
df = pd.read_parquet("gs://np-training-tmp/stackoverflow/final/posts.parquet")
df['Tags']  = df['Tags'].apply(lambda x: x.tolist())

In [7]:
df.head()

Unnamed: 0,Id,AcceptedAnswerId,Title,QuestionBody,Tags,ViewCount,AnswerCount,CommentCount,Score,CreationDate,AnswerId,AcceptedAnswerBody
0,33760194,,Python How to burning discs with the monitorin...,I'm writing the programm on Python with module...,"[python, event-handling, progressmonitor]",491,0,2,0,2015-11-17 15:02:09.103,,
1,15020895,,Python int-byte efficient data structure,i am currently storing key-values of type int-...,"[python, data-structures]",155,0,3,1,2013-02-22 09:33:26.360,,
2,47234657,,converting word into other word keeping the or...,"def translate(string, translations):\n\n[CODE]...","[python, python-3.x]",48,2,1,-1,2017-11-11 05:23:34.343,,
3,37310210,,Camera Calibration with OpenCV - How to adjust...,I am working on a camera calibration program u...,"[python, python-2.7, opencv, camera, camera-ca...",8164,2,3,3,2016-05-18 21:14:34.110,,
4,70675292,,Python Same Period Last Year in Pandas with Gr...,I have following DataFrame:\nimport pandas as ...,"[python, pandas, group-by, offset, forecasting]",70,1,0,0,2022-01-12 01:19:53.640,,


In [8]:
df.iloc[0].to_dict()

{'Id': 33760194,
 'AcceptedAnswerId': nan,
 'Title': 'Python How to burning discs with the monitoring of progress with IMAPI2',
 'QuestionBody': "I'm writing the programm on Python with module for burning CD/DVD with IMAPI2.\nI used these examples: codeplex and comtypes doc.\n\n[CODE]\n\nCreate a custom class with method named as event of 'dataWriter':\n\n[CODE]\n\nAnd run this script:\n\n[CODE]\n\nI can write CD/DVD, but can't monitor a progress and I can't understand, why.\n\nDoes anyone have any idea?\n\nThanks a lot.\n",
 'Tags': ['python', 'event-handling', 'progressmonitor'],
 'ViewCount': 491,
 'AnswerCount': 0,
 'CommentCount': 2,
 'Score': 0,
 'CreationDate': Timestamp('2015-11-17 15:02:09.103000'),
 'AnswerId': nan,
 'AcceptedAnswerBody': None}

In [None]:
?Elasticsearch

In [9]:
client = Elasticsearch(
    [f'{ELASTIC_HOST}:{ELASTIC_PORT}']
    
)

In [10]:
create_index(client, index= ELASTIC_INDEX)

In [11]:
requests.get(f"http://{ELASTIC_HOST}:{ELASTIC_PORT}/_all/_settings").json()

{'stackoverflow': {'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}},
    'number_of_shards': '3',
    'provided_name': 'stackoverflow',
    'creation_date': '1666543930187',
    'number_of_replicas': '1',
    'uuid': '5baqO6eGRMipj_EbinW2LQ',
    'version': {'created': '8040399'}}}}}

In [18]:
len(df)

2661445

In [19]:
df_subset = df.head(5_000_000)
number_of_docs = len(df_subset)

In [20]:
progress = tqdm.auto.tqdm(unit="docs", total=number_of_docs )
successes = 0


for ok, action in streaming_bulk(
        client=client, index=ELASTIC_INDEX, actions=generate_docs(df_subset) ,
    ):
    progress.update(1)
    successes += ok
    
print("Indexed %d/%d documents" % (successes, number_of_docs))

  0%|          | 0/2661445 [00:00<?, ?docs/s]

Indexed 2661445/2661445 documents


In [None]:
with tqdm.auto.tqdm(total=number_of_docs , unit="docs" ) as pbar:
    successes = 0


    for ok, action in streaming_bulk(
            client=client, index=ELASTIC_INDEX, actions=generate_docs(df_subset) ,
        ):
        pbar.update(1)
        successes += ok


  0%|          | 0/2661445 [00:00<?, ?docs/s]

In [None]:
main()

# Evaluate

In [None]:
?client.get

In [None]:
client.get(ELASTIC_INDEX, 71026393)

In [35]:
# https://stackoverflow.com/questions/34147471/elasticsearch-how-to-search-for-a-value-in-any-field-across-all-types-in-one

resp = client.search(
    body={
        "query": {
            "multi_match": {
                "query": "pandas memmory issue",
                # "fields": ["Title", "QuestionBody"],
            }
        }
    },
    size=5,
    explain=True
)


In [164]:
resp;

In [None]:
resp

In [37]:
pdf_related = pd.read_parquet("gs://np-training-tmp/stackoverflow/final/related_posts.parquet")

In [38]:
pdf_related.head()

Unnamed: 0,PostId,PostTitle,RelatedPostIds,RelatedPostTitles,num_candidates
0,57348742,How do I simulate a Scrollbar in tkInter Canvas,"[57348742, 68340045]",[How do I simulate a Scrollbar in tkInter Canv...,2
1,3494593,Shading a kernel density plot between two points.,"[3494593, 14863744, 14094644, 16504452, 488531...",[Shading a kernel density plot between two poi...,16
2,37949409,Dictionary in a numpy array?,"[37949409, 47689224, 61517741]","[Dictionary in a numpy array?, How to access t...",3
3,51519086,How to remove tkinter - - - - line's when crea...,"[51519086, 55088055]",[How to remove tkinter - - - - line's when cre...,2
4,63107594,How to deal with multi-level column names down...,"[63107594, 63107603, 62966295, 68674235, 63124...",[How to deal with multi-level column names dow...,6


In [40]:
len (pdf_related)

33248

In [165]:
resp['hits']['hits'][0];

In [232]:
def format_resp(resp, row):
    payload = []
    query = row['PostTitle']
    for hit in resp['hits']['hits']:
        doc_id = int(hit['_id'])
        
        r = {
             'query': query
             , 'query_id' : row['PostId']
             ,'doc_id' : doc_id
             , 'is_relevant' : doc_id in row['RelatedPostIds']
             ,'score' : hit['_score']
             ,'doc_title' : hit['_source']['Title']


        }
        payload.append(r)    
    return payload

def fetch_as_relevancy_eval(row,num_hits=10):
    client = Elasticsearch(
    [f'{ELASTIC_HOST}:{ELASTIC_PORT}']
    
    )
    
    
    query = row['PostTitle']
    resp = fetch_results(client, query, num_hits=num_hits)
    payload = format_resp(resp, row)
    
    return pd.DataFrame(payload)
    

def evaluate_relevancy_hits(df,num_hits=10):
    
    payload = []
    for index, row in df.iterrows():

        payload_query = fetch_as_relevancy_eval(row)
        
        payload.extend(payload_query.to_dict(orient='records') )

    
    #return pd.DataFrame.from_records(payload)
    return pd.DataFrame(payload)
    



def evaluate_relevancy_hits2(df,num_hits=20):
    
    
    res = df.parallel_apply(fetch_as_relevancy_eval,num_hits=num_hits, axis = 1)

    return res
    

In [229]:
fetch_as_relevancy_eval(pdf_related.iloc[0].to_dict() )

Unnamed: 0,query,query_id,doc_id,is_relevant,score,doc_title
0,How do I simulate a Scrollbar in tkInter Canvas,57348742,57348742,True,38.083534,How do I simulate a Scrollbar in tkInter Canvas
1,How do I simulate a Scrollbar in tkInter Canvas,57348742,64181265,False,30.856358,Implement scrollbar in Canvas
2,How do I simulate a Scrollbar in tkInter Canvas,57348742,72430615,False,30.753239,keeping a horizontal scrollbar to a canvas wit...
3,How do I simulate a Scrollbar in tkInter Canvas,57348742,70458305,False,30.542326,How can I add Unscrollable Image to a frame/Ca...
4,How do I simulate a Scrollbar in tkInter Canvas,57348742,69547008,False,30.219063,Create resizable Tkinter frame inside of scrol...
5,How do I simulate a Scrollbar in tkInter Canvas,57348742,48437710,False,30.17255,"How to remove canvas, separator and scrollbar?"
6,How do I simulate a Scrollbar in tkInter Canvas,57348742,28631312,False,29.988377,Python Tkinter Scrollbar Shaky Scrolling
7,How do I simulate a Scrollbar in tkInter Canvas,57348742,61823535,False,29.85104,Display tkinter fonts
8,How do I simulate a Scrollbar in tkInter Canvas,57348742,64046744,False,29.427263,Using tk.Scrollbar to update images in tk.canvas
9,How do I simulate a Scrollbar in tkInter Canvas,57348742,23180982,False,28.94214,python tkinter canvas size using scrollbar


In [230]:
evaluate_relevancy_hits(pdf_related.iloc[0:2])

Unnamed: 0,query,query_id,doc_id,is_relevant,score,doc_title
0,How do I simulate a Scrollbar in tkInter Canvas,57348742,57348742,True,38.083534,How do I simulate a Scrollbar in tkInter Canvas
1,How do I simulate a Scrollbar in tkInter Canvas,57348742,64181265,False,30.856358,Implement scrollbar in Canvas
2,How do I simulate a Scrollbar in tkInter Canvas,57348742,72430615,False,30.753239,keeping a horizontal scrollbar to a canvas wit...
3,How do I simulate a Scrollbar in tkInter Canvas,57348742,70458305,False,30.542326,How can I add Unscrollable Image to a frame/Ca...
4,How do I simulate a Scrollbar in tkInter Canvas,57348742,69547008,False,30.219063,Create resizable Tkinter frame inside of scrol...
5,How do I simulate a Scrollbar in tkInter Canvas,57348742,48437710,False,30.17255,"How to remove canvas, separator and scrollbar?"
6,How do I simulate a Scrollbar in tkInter Canvas,57348742,28631312,False,29.988377,Python Tkinter Scrollbar Shaky Scrolling
7,How do I simulate a Scrollbar in tkInter Canvas,57348742,61823535,False,29.85104,Display tkinter fonts
8,How do I simulate a Scrollbar in tkInter Canvas,57348742,64046744,False,29.427263,Using tk.Scrollbar to update images in tk.canvas
9,How do I simulate a Scrollbar in tkInter Canvas,57348742,23180982,False,28.94214,python tkinter canvas size using scrollbar


In [233]:
r = evaluate_relevancy_hits2(pdf_related.head(1000) )

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=125), Label(value='0 / 125'))), HB…

In [234]:
df_res  = pd.concat(list(r) ,ignore_index = True)

In [235]:
df_res

Unnamed: 0,query,query_id,doc_id,is_relevant,score,doc_title
0,How do I simulate a Scrollbar in tkInter Canvas,57348742,57348742,True,38.083534,How do I simulate a Scrollbar in tkInter Canvas
1,How do I simulate a Scrollbar in tkInter Canvas,57348742,64181265,False,30.856358,Implement scrollbar in Canvas
2,How do I simulate a Scrollbar in tkInter Canvas,57348742,72430615,False,30.753239,keeping a horizontal scrollbar to a canvas wit...
3,How do I simulate a Scrollbar in tkInter Canvas,57348742,70458305,False,30.542326,How can I add Unscrollable Image to a frame/Ca...
4,How do I simulate a Scrollbar in tkInter Canvas,57348742,69547008,False,30.219063,Create resizable Tkinter frame inside of scrol...
...,...,...,...,...,...,...
19995,sys.exit doesn't work as expected after try:,42570052,211100,False,22.290020,Python's __import__ doesn't work as expected
19996,sys.exit doesn't work as expected after try:,42570052,43604615,False,22.290020,keras fit_generator doesn't work as expected
19997,sys.exit doesn't work as expected after try:,42570052,44541711,False,22.290020,python `else` doesn't work as expected
19998,sys.exit doesn't work as expected after try:,42570052,48871211,False,22.275202,Regex sometimes doesn't work as expected


In [243]:
query_id = 42570052

In [244]:
pdf_related [ pdf_related['PostId'] == query_id ].iloc[0].to_dict()

{'PostId': 42570052,
 'PostTitle': "sys.exit doesn't work as expected after try:",
 'RelatedPostIds': array([42570052, 54376877]),
 'RelatedPostTitles': array(["sys.exit doesn't work as expected after try:",
        'how to stop a program in try/except'], dtype=object),
 'num_candidates': 2}

In [245]:
df_res[ df_res.query_id==query_id]

Unnamed: 0,query,query_id,doc_id,is_relevant,score,doc_title
19980,sys.exit doesn't work as expected after try:,42570052,42570052,True,43.342487,sys.exit doesn't work as expected after try:
19981,sys.exit doesn't work as expected after try:,42570052,14180179,False,23.485918,`eventlet.spawn` doesn't work as expected
19982,sys.exit doesn't work as expected after try:,42570052,42082913,False,23.485918,tf.reshape doesn't work as expected
19983,sys.exit doesn't work as expected after try:,42570052,18854067,False,23.485918,Program doesn't work as expected
19984,sys.exit doesn't work as expected after try:,42570052,44356589,False,23.470901,misc.imshow() doesn't work as expected
19985,sys.exit doesn't work as expected after try:,42570052,30603190,False,23.470901,.dropna doesn't work as expected
19986,sys.exit doesn't work as expected after try:,42570052,11275606,False,23.470901,Celerybeat doesn't work as expected
19987,sys.exit doesn't work as expected after try:,42570052,54095003,False,23.470901,pyautogui.click() doesn't work as expected
19988,sys.exit doesn't work as expected after try:,42570052,64268972,False,23.470901,tf.random.normal() doesn't work as expected
19989,sys.exit doesn't work as expected after try:,42570052,16645761,False,23.470901,subprocess.call doesn't work as expected


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [248]:
z = df_res[ df_res.query_id==query_id]['is_relevant']
z

19980     True
19981    False
19982    False
19983    False
19984    False
19985    False
19986    False
19987    False
19988    False
19989    False
19990    False
19991    False
19992    False
19993    False
19994    False
19995    False
19996    False
19997    False
19998    False
19999    False
Name: is_relevant, dtype: bool

In [315]:
def metrics(result):
    
    result = list(result) 
    
    mrr = 0
    
    if True in result:
        first_index = list(z).index(True) 
        mrr = 1 /  (first_index + 1)
    
    res=  {
     "p@1" :  sum(result[:1])  
    , "p@5" :  sum(result[:5]) / 5 
    , "p@10" :  sum(result[:10])  / 10 
    , "mrr" :  mrr

        
        
    }
    return pd.Series(res)

In [316]:
metrics(z)

p@1     1.0
p@5     0.2
p@10    0.1
mrr     1.0
dtype: float64

In [317]:
#?df_res.groupby

In [319]:
df_agg_res  = df_res.groupby(['query_id'], as_index=False).apply (lambda x: metrics(x['is_relevant']))



In [None]:
df_agg_res.drop(columns='query_id').agg(np.mean)

p@1     0.9310
p@5     0.2312
p@10    0.1259
mrr     0.9980
dtype: float64