## Create a new Index

In [13]:
from Helper.timing import Timing
from Helper.ORM import fetch_records
from Pipeline.index.index import Index

# Build a new index from every "Corpus" record and save it under the three
# pickle paths below; the fetch, build and save are timed together.
save_targets = {
    "model_name": "Pipeline/index/Saved/wiki_model.pickle",
    "tfidf_name": "Pipeline/index/Saved/wiki_tfidf.pickle",
    "keys_name": "Pipeline/index/Saved/wiki_keys.pickle",
}

with Timing('Creating Index...'):
    records = fetch_records(model="Corpus")
    # NOTE(review): `documents` is not consumed anywhere in this cell (the
    # Index is built from `records`); confirm whether another cell needs it.
    documents = [record.text for record in records]
    index = Index(records)
    index.save(**save_targets)

# Report the number of columns of the index's TF-IDF matrix.
print(index.tfidf_matrix.shape[1])

Creating Index... elapsed time: 391.8957905769348
794562


## Load an existing Index

In [1]:
from Pipeline.index.index import Index

# Restore a previously saved index from its three pickle files instead of
# rebuilding it.
saved_paths = {
    "model_name": "Pipeline/index/Saved/Lower_Processed/wiki_model369721.pickle",
    "tfidf_name": "Pipeline/index/Saved/Lower_Processed/wiki_tfidf369721.pickle",
    "keys_name": "Pipeline/index/Saved/Lower_Processed/wiki_keys369721.pickle",
}
index = Index.load(**saved_paths)

# Report the number of columns of the loaded TF-IDF matrix.
print(index.tfidf_matrix.shape[1])



794562


# Evaluate the currently loaded index

In [10]:
from Pipeline.Evaluation.eval import evaluate
from Helper.timing import Timing


# Evaluate the `index` created/loaded by an earlier cell against the Wiki
# qrels and queries, and print the resulting metrics.
eval_options = dict(
    rel=1,
    qrels_path="Pipeline/Evaluation/Wiki/qrels_wiki",
    queries_path="Pipeline/Evaluation/Wiki/queries.csv",
    run_path="Pipeline/Evaluation/Wiki/run_wiki_lower",
    # NOTE(review): presumably False means the existing file at `run_path`
    # is reused rather than regenerated; confirm against `evaluate`.
    create_run_file_bool=False,
)

with Timing('Evaluating Documents...'):
    ev = evaluate(index, **eval_options)

    # Printed inside the timed block, so the metrics appear before the
    # elapsed-time line.
    print(ev)

{AP: 0.10026601487943489, R@10: 0.1406048526218518, P@10: 0.15907202216066288, RR: 0.5623883772429485}
Evaluating Documents... elapsed time: 0.1963956356048584


# Show Clustered Index Plots

In [None]:
from Clustering import cluster_index

# Run the project's clustering routine on the current index. Requires the
# `index` variable defined by one of the create/load cells above.
# Per this section's heading the call is expected to render cluster plots;
# its return value is not captured here.
cluster_index(index)

# Search for a certain term

In [3]:
from Helper.timing import Timing

# Query the current `index` for a single term and list every returned hit;
# the search and the printing are timed together.
with Timing('searching...'):
    top_documents = index.search('yanni')
    for hit in top_documents:
        # Each hit is a 3-item sequence; the third item is not displayed.
        doc_id, score, idx = hit
        print(f"Document {doc_id}: Similarity Score {score}")

Document 806300: Similarity Score 0.6787888635713526
Document 123839: Similarity Score 0.5850978131147955
Document 836567: Similarity Score 0.4476886417858691
Document 806075: Similarity Score 0.4391356840995299
Document 806326: Similarity Score 0.42755843764813245
Document 1793430: Similarity Score 0.3579832341169637
Document 806263: Similarity Score 0.2609891956629828
searching... elapsed time: 2.7566237449645996


In [9]:
# Prune the evaluation run file IN PLACE: keep only the result lines whose
# score is strictly greater than MIN_SCORE. This overwrites the file at
# `path`; re-running the cell is harmless (the surviving lines pass again).
path = 'Pipeline/Evaluation/Wiki/run_wiki_lower'
MIN_SCORE = 0.3  # lines scoring at or below this value are dropped
EXPECTED_FIELDS = 6  # whitespace-separated columns per line; the 5th is the score
# NOTE(review): six columns with the score in fifth place looks like the
# TREC run format (qid, Q0, docid, rank, score, tag) -- confirm.

# Read and filter everything first, so the file is only rewritten after every
# line has parsed successfully. `with` guarantees the handle is closed even
# if a malformed line raises.
res = []
with open(path, 'r') as f:
    for line_no, line in enumerate(f, start=1):
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing on them; they are simply not written back.
            continue
        if len(fields) != EXPECTED_FIELDS:
            raise ValueError(
                f"{path}:{line_no}: expected {EXPECTED_FIELDS} fields, "
                f"got {len(fields)}"
            )
        score = fields[4]
        if float(score) > MIN_SCORE:
            res.append(line)

# Opening in 'w' mode already truncates the file, so no separate
# truncate-and-close step is needed before writing the kept lines.
with open(path, 'w') as f:
    f.writelines(res)