# Elasticsearch implementation

## Goals
* Understand how Elasticsearch works
* Utilize the following information: Student Names, Mentor Names, ReadMe summarizations, Report Summarizations, Raw Readmes, Raw Reports, Years, Project Title, Domain
* Keyword Search - Student Names, Mentor Names, Domain, Project Title
* Semantic Search - ReadMe Summarization, Report Summarization, Domain, Project Title
* Fuzzy Match / autocorrect
* Filtering

CSVs to use

* overall_data.csv - Year, Domain, Project Title
* mentors.csv - Mentor
* students.csv - Students
* github.csv - readme raw, readme summarized
* report_contents.csv - raw and processed text



Things to note - Don't forget about language breakdown


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch, helpers, exceptions
import pickle
# from sentence_transformers import SentenceTransformer

In [None]:
# Report whether PyTorch's cuDNN backend is enabled and whether a CUDA device
# is usable from this kernel.
# NOTE(review): none of the cells visible here move the model or its inputs to
# the GPU, so embeddings appear to be computed on CPU regardless — confirm.
print(torch.backends.cudnn.enabled)
print(torch.cuda.is_available()) # True means a CUDA device is available
# print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

In [None]:
# Hugging Face checkpoint used by get_embeddings() for every query embedding.
# The tokenizer and model are module-level globals that get_embeddings() reads.
# NOTE(review): the index mappings declare 768-dim dense_vector fields —
# presumably this model's hidden size; confirm before swapping checkpoints.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Project-level table. The first CSV column becomes the index; later cells
# iterate this frame and match that index against `project_id` in the other
# tables. Columns read later: project_title, domain, year_presented.
ovr_DF = pd.read_csv("../../data/overall_data.csv", index_col= 0)
ovr_DF.head(3)

In [None]:
# Mentor table. Later cells filter it by `project_id` and read the
# `mentor_name` and `ucsd_or_industry` columns.
mentor_DF = pd.read_csv("../../data/mentors.csv")
mentor_DF.head(3)

In [None]:
# Student table. Later cells filter it by `project_id` and read the `student`
# column.
students_DF = pd.read_csv("../../data/students.csv")
students_DF.head(3)

In [None]:
# GitHub table. Later cells filter it by `project_id` and read the
# `readme_summarized` column.
github_DF = pd.read_csv("../../data/github.csv")

# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: pandas treats that as chained assignment, which is
# deprecated and has no effect once copy-on-write is enabled.
# The placeholder names the README (this column holds README summaries, not
# report summaries) and matches the fallback used when a project has no row.
github_DF["readme_summarized"] = github_DF["readme_summarized"].fillna(
    "README not available"
)

# Preview last so the notebook displays the frame after the fill.
github_DF.head(5)

In [None]:
# Report table. Later cells filter it by `project_id` and read the
# `text_processed` column.
report_contents_DF = pd.read_csv("../../data/report_contents.csv")
report_contents_DF.head(5)

In [None]:
# Debug preview: for every project, print the fields that the indexing cell
# later sends to Elasticsearch. `i` is ovr_DF's index value, which the other
# tables reference through their `project_id` column.
for i, row in ovr_DF.iterrows():
    print(f"Project Title: {row['project_title']}")
    print(f"Domain: {row['domain']}")
    print(f"Year: {row['year_presented']}")

    # Mentors: a project may have several rows. unique() keeps first-seen order,
    # so the joined string is stable between runs (set() ordering is not).
    mentor_subset_DF = mentor_DF[mentor_DF['project_id'] == i]
    # Fill missing values first: str.join raises TypeError on a float NaN.
    industries = ",".join(
        mentor_subset_DF["ucsd_or_industry"].fillna("Not Specified").unique()
    )
    print(f"Industry: {industries}")

    mentors = ",".join(
        mentor_subset_DF["mentor_name"].fillna("Not Specified").unique()
    )
    print(f"Mentors: {mentors}")

    # Students
    student_subset_DF = students_DF[students_DF['project_id'] == i]
    students = ",".join(
        student_subset_DF["student"].fillna("Not Specified").unique()
    )
    print(f"Students: {students}")

    # README summary: used only when exactly one GitHub row matches.
    readme_rows = github_DF.loc[github_DF["project_id"] == i, "readme_summarized"]
    if len(readme_rows) == 1:
        # Take the cell value. str() on the Series itself would yield its repr
        # (index label, "Name: ...", "dtype: ...") instead of the text.
        readme_summary = str(readme_rows.iloc[0])
    else:
        readme_summary = "README not available"
    print(f"Readme Summary: {(readme_summary)}")

    # Report summary: same single-row rule, read from the report table.
    report_rows = report_contents_DF.loc[
        report_contents_DF["project_id"] == i, "text_processed"
    ]
    if len(report_rows) == 1:
        report_summary = str(report_rows.iloc[0])
    else:
        report_summary = "Report Summary not available"
    print(f"Report Summary: {(report_summary)}")

    print("-" * 75)
    

In [None]:
# Run this line below to start up an elastic search cluster
# docker run --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.11.0

In [None]:
# Connect to the local cluster on port 9200 (the port published by the docker
# command in the previous cell) and display the server info as a sanity check.
es = Elasticsearch("http://localhost:9200")
es.info().body

In [None]:
# delete model if already downloaded and deployed


In [None]:
# es.ml.put_

In [None]:
#From Hugging Face Tutorials
def cls_pooling(model_output):
    """Return position 0 along the second axis of ``last_hidden_state``.

    Adapted from the Hugging Face tutorials.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing (assumed to be batch x tokens x
            hidden — TODO confirm against the model in use).

    Returns:
        The slice ``last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

def get_embeddings(text_list):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: A string or list of strings; passed unchanged to the
            tokenizer with padding and truncation enabled.

    Returns:
        The result of ``cls_pooling`` on the model output. Callers in this
        notebook take ``[0]`` of it for a single input string. The forward
        pass runs under ``torch.no_grad()``, so the tensor carries no autograd
        graph; calling ``.detach()`` on it remains valid.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Inference only: skip autograd bookkeeping so each call does not retain
    # activations for a backward pass that never happens.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Smoke test: embed one sentence and display the first row of the result as a
# NumPy array.
get_embeddings("Test sentence").detach().numpy()[0]

In [None]:
# es.indices.delete(index='capstones')

In [None]:
# Schema for the "capstones" index. Every textual field is plain `text`
# (only the README summary uses the `english` analyzer) and each embedding
# field is a 768-dimensional dense_vector compared by cosine similarity.
plain_text = {"type": "text"}
embedding = {"type": "dense_vector", "dims": 768, "similarity": "cosine"}

mappings = {
    "properties": {
        "year_presented": dict(plain_text),
        "domain": dict(plain_text),
        "project_title": dict(plain_text),
        "project_title_vector": dict(embedding),
        "industry": dict(plain_text),
        "mentors": dict(plain_text),
        "members": dict(plain_text),
        "report_text_summarization": dict(plain_text),
        "readme_summarization": dict(plain_text, analyzer="english"),
        "readme_vector": dict(embedding),
        "report_vector": dict(embedding),
    }
}

es.indices.create(index="capstones", mappings=mappings)

In [None]:
# Precomputed embeddings, looked up below by ovr_DF's index value.
# Only load pickles you trust: unpickling can execute arbitrary code.
readme_vector_dict = pd.read_pickle("../../data/readme_vector_dict.pkl")
report_vector_dict = pd.read_pickle("../../data/report_vector_dict.pkl")
project_title_vector_dict = pd.read_pickle("../../data/project_title_vector_dict.pkl")

# Every document sent to Elasticsearch is also collected here so a later cell
# can persist the full set.
lst_es_data_dict = []

for i, row in ovr_DF.iterrows():
    # Raises KeyError if a project has no precomputed vector.
    project_title_vector = project_title_vector_dict[i]

    # Mentors: a project may have several rows. unique() keeps first-seen order,
    # so the joined string is stable between runs (set() ordering is not).
    mentor_subset_DF = mentor_DF[mentor_DF['project_id'] == i]
    # Fill missing values first: str.join raises TypeError on a float NaN.
    industries = ",".join(
        mentor_subset_DF["ucsd_or_industry"].fillna("Not Specified").unique()
    )
    mentors = ",".join(
        mentor_subset_DF["mentor_name"].fillna("Not Specified").unique()
    )

    # Students
    student_subset_DF = students_DF[students_DF['project_id'] == i]
    students = ",".join(
        student_subset_DF["student"].fillna("Not Specified").unique()
    )

    # README summary: used only when exactly one GitHub row matches.
    readme_rows = github_DF.loc[github_DF["project_id"] == i, "readme_summarized"]
    if len(readme_rows) == 1:
        # Take the cell value. str() on the Series itself would index its repr
        # (index label, "Name: ...", "dtype: ...") instead of the text.
        readme_summary = str(readme_rows.iloc[0])
    else:
        readme_summary = "README not available"
    readme_vector = readme_vector_dict[i]

    # Report summary: same single-row rule, read from the report table.
    report_rows = report_contents_DF.loc[
        report_contents_DF["project_id"] == i, "text_processed"
    ]
    if len(report_rows) == 1:
        report_summary = str(report_rows.iloc[0])
    else:
        report_summary = "Report Summary not available"
    report_vector = report_vector_dict[i]
    # NOTE(review): the pickled vectors were presumably generated from the
    # earlier summary strings; regenerate them if they must correspond to the
    # text indexed here — confirm.

    doc = {
        "year_presented": row['year_presented'],
        "domain": row["domain"],
        "project_title": row["project_title"],
        "project_title_vector": project_title_vector,
        "mentors": mentors,
        "industry": industries,
        "members": students,
        "report_text_summarization": report_summary,
        "readme_summarization": readme_summary,
        "readme_vector": readme_vector,
        "report_vector": report_vector,
        # Not declared in the explicit mappings; stored alongside the other
        # fields so each hit carries its own id.
        "project_id": i,
    }
    lst_es_data_dict.append(doc)

    # Use the project id as the document id so re-running the cell overwrites
    # documents instead of duplicating them.
    es.index(index="capstones", id=i, document=doc)

    # print(f"Project Title: {row['project_title']}")
    # print(f"Project Title: {row['project_title']}")
    # print(f"Project Title: {row['project_title']}")
    # print("-" * 75)
    

In [None]:
# Number of documents collected by the indexing loop.
len(lst_es_data_dict)

In [None]:
# Persist the indexed documents (vectors included) so they can be reloaded
# without rebuilding. Despite its name, `es_data_dict` is a DataFrame with one
# row per document.
es_data_dict = pd.DataFrame(lst_es_data_dict)
es_data_dict.to_pickle("../../data/es_data_DF.pkl")

In [None]:
# Reload the pickle written by the previous cell and display it to check the
# round trip.
es_data_DF = pd.read_pickle("../../data/es_data_DF.pkl")
es_data_DF

In [None]:
# for 

In [None]:
# Number of precomputed README vectors loaded from the pickle.
len(readme_vector_dict)

In [None]:
# Number of precomputed report vectors loaded from the pickle.
len(report_vector_dict)

In [None]:
# Number of precomputed project-title vectors loaded from the pickle.
len(project_title_vector_dict)

In [None]:
# file = open("../data/readme_vector_dict.pkl", 'wb')

# # dump information to that file
# pickle.dump(readme_vector_dict, file)

# # close the file
# file.close()

In [None]:
# file = open("../data/project_title_vector_dict.pkl", 'wb')

# # dump information to that file
# pickle.dump(project_title_vector_dict, file)

# # close the file
# file.close()

In [None]:
# resp = es.search(
#     index="capstones",
#     query={
#             "bool": {
#                 "must": [{
#                     "multi_match": {
#                         "query": "Social Analysis",
#                         "fields" : ["project_title", "domain^2"]
#                     }
#                 }, {
#                     "query": {
#                         "mentors": "Justin Eldridge",
#                         "fuzziness" : "AUTO"
#                     }
#                 }]
#             }
#     }
# )
# resp.body

In [None]:
# Semantic-only search: embed the query text and fetch the 10 nearest
# neighbours on the report vectors, considering 100 candidates.
crypto_query_vector = get_embeddings("Crypto currency and blockchain").detach().numpy()[0]
report_knn = {
    "field": "report_vector",
    "query_vector": crypto_query_vector,
    "k": 10,
    "num_candidates": 100,
}
response = es.search(index="capstones", knn=report_knn)

response.body

In [None]:
# Keyword search on the mentors field with automatic fuzziness. The name is
# misspelled (presumably on purpose) to exercise typo tolerance.
mentor_fuzzy_query = {
    "multi_match": {
        "query": "Jastin Eldrige",
        "fields": ["mentors"],
        "fuzziness": "AUTO",
    }
}
resp = es.search(index="capstones", query=mentor_fuzzy_query)
resp.body

In [None]:
# healthcare crypo

In [None]:
# Hybrid search: the same text drives a keyword match on mentors and project
# title, and a kNN lookup on the report vectors.
keyword_clause = {
    "multi_match": {
        "query": "Colin Jemmott recommender systems 2021",
        "fields": ["mentors", "project_title"],
        # fuzziness is intentionally left off here
    }
}
semantic_clause = {
    "field": "report_vector",
    "query_vector": get_embeddings("Colin Jemmott recommender systems 2021").detach().numpy()[0],
    "k": 10,
    "num_candidates": 100,
}
resp = es.search(index="capstones", query=keyword_clause, knn=semantic_clause)
resp.body

In [None]:
# Combined search: exact-phrase and fuzzy keyword matching over the text
# fields, plus kNN over all three embedding fields.
search_text = "cancer data"
search_fields = [
    "project_title^2",
    "domain^2",
    "year_presented",
    "industry^2",
    "mentors^3",
    "members^3",
    "readme_summarization",
]

# Embed once and reuse for all three kNN clauses.
search_vector = get_embeddings(search_text).detach().numpy()[0]

resp = es.search(
    index="capstones",
    # Two multi_match clauses cannot share one dict: a repeated "multi_match"
    # key keeps only the last value, which silently dropped the phrase clause.
    # Combine them under bool/should so a document matching either one
    # qualifies, and phrase matches score on top of loose matches.
    query={
        "bool": {
            "should": [
                {
                    # Elasticsearch rejects `fuzziness` on phrase-type
                    # multi_match, so the phrase clause carries none.
                    "multi_match": {
                        "query": search_text,
                        "type": "phrase",
                        "fields": search_fields,
                    }
                },
                {
                    "multi_match": {
                        "query": search_text,
                        "fields": search_fields,
                        "fuzziness": "AUTO",
                    }
                },
            ]
        }
    },
    knn=[
        {
            "field": vector_field,
            "query_vector": search_vector,
            "k": 10,
            "num_candidates": 100,
        }
        for vector_field in ("project_title_vector", "readme_vector", "report_vector")
    ],
    size=10,
)

# Print a compact summary of each returned hit.
hits = resp.body['hits']['hits']
print(f'Number of hits: {resp.body["hits"]["total"]["value"]}')
print('----------------')
for hit in hits:
    source = hit["_source"]
    print(f'Score: {hit["_score"]}')
    print(f'Project: {source["project_title"]}')
    print(f'Domain: {source["domain"]}')
    print(f'Students: {source["members"]}')
    print(f'Industry/UCSD: {source["industry"]}')
    print(f'Mentor: {source["mentors"]}')
    print('----------------')



# mappings = {
#         "properties": {
#             "year_presented": {"type": "text"},
#             "domain": {"type": "text"},
#             "project_title": {"type": "text"},
#             "project_title_vector": {"type" : "dense_vector", "dims" : 768, "similarity" : "cosine"},
#             "industry": {"type": "text"},
#             "mentors": {"type": "text"},
#             "members": {"type": "text"},
#             "report_text_summarization": {"type": "text"},
#             "readme_summarization": {"type": "text", "analyzer" : "english"},
#             "readme_vector": {"type" : "dense_vector", "dims" : 768, "similarity" : "cosine"},
#             "report_vector": {"type" : "dense_vector", "dims" : 768, "similarity" : "cosine"}
#     }
# }

In [None]:
# mappings = {
#         "properties": {
#             "year_presented": {"type": "integer"},
#             "domain": {"type": "text"},
#             "project_title": {"type": "text"},
#             "project_title_vector": {"type" : "dense_vector", "dims" : 768, "similarity" : "cosine"},
#             "industry": {"type": "text"},
#             "mentors": {"type": "text"},
#             "members": {"type": "text"},
#             "report_text_summarization": {"type": "text"},
#             "readme_summarization": {"type": "text", "analyzer" : "english"},
#             "readme_vector": {"type" : "dense_vector", "dims" : 768, "similarity" : "cosine"},
#             "report_vector": {"type" : "dense_vector", "dims" : 768, "similarity" : "cosine"}
#     }
# }

In [None]:
# Sample inputs for the search function planned below: a person-name query and
# a topic query.
example_query_1 = "Justin Eldridge"
example_query_2 = "Crypto Currency"

In [None]:
# Figure out how to multi better
# Add semantic manually

### Building the search query function