## Create a new Index

In [13]:
from Helper.timing import Timing
from Helper.ORM import fetch_records
from Pipeline.index.index import Index

# Destination files for the freshly built index; the keys are the keyword
# arguments passed to Index.save.
save_targets = {
    "model_name": "Pipeline/index/Saved/wiki_model.pickle",
    "tfidf_name": "Pipeline/index/Saved/wiki_tfidf.pickle",
    "keys_name": "Pipeline/index/Saved/wiki_keys.pickle",
}

# Fetching the corpus, building the index and saving it all happen inside
# the Timing context.
with Timing('Creating Index...'):
    records = fetch_records(model="Corpus")
    # NOTE(review): `documents` is not used in this cell -- the Index is built
    # from `records`. Confirm whether another cell relies on it.
    documents = [rec.text for rec in records]
    index = Index(records)
    index.save(**save_targets)

# Second dimension of the TF-IDF matrix (presumably the vocabulary size --
# TODO confirm against Index).
print(index.tfidf_matrix.shape[1])

## Load existing Index

In [3]:
from Pipeline.index.index import Index

# Pickle files of a previously saved index; the keys are the keyword
# arguments passed to Index.load.
saved_index_files = {
    "model_name": "Pipeline/index/Saved/Wiki/countries_dates_ord/model369721.pickle",
    "tfidf_name": "Pipeline/index/Saved/Wiki/countries_dates_ord/tfidf369721.pickle",
    "keys_name": "Pipeline/index/Saved/Wiki/countries_dates_ord/keys369721.pickle",
}

index = Index.load(**saved_index_files)

# Second dimension of the TF-IDF matrix (presumably the vocabulary size --
# TODO confirm against Index).
print(index.tfidf_matrix.shape[1])

861513


# Evaluate the currently loaded index

In [None]:
from Pipeline.Evaluation.eval import evaluate
from Helper.timing import Timing


# Keyword arguments forwarded to evaluate(); `index` must already exist in
# the notebook namespace (created or loaded by an earlier cell).
evaluation_settings = {
    "rel": 1,
    "qrels_path": "Pipeline/Evaluation/Wiki/qrels_wiki",
    "queries_path": "Pipeline/Evaluation/Wiki/queries.csv",
    "run_path": "Pipeline/Evaluation/Wiki/run_wiki_lower_stop_html_countries_numerize",
    "create_run_file_bool": True,
}

# Both the evaluation and the printing of its result run inside the
# Timing context.
with Timing('Evaluating Documents...'):
    ev = evaluate(index, **evaluation_settings)
    print(ev)

# Show Clustered Index Plots

In [None]:
from Clustering import cluster_index

# Run the clustering helper on the index held in the notebook namespace
# (presumably renders the cluster plots named in the section heading --
# TODO confirm against Clustering.cluster_index).
cluster_index(index)

# Search for a certain term

In [3]:
from Helper.timing import Timing

# Query the index for a single term and list every hit; the search and the
# printing both run inside the Timing context.
with Timing('searching...'):
    top_documents = index.search('yanni')
    # Each hit unpacks into three values; only the first two are printed.
    for doc_id, score, _idx in top_documents:
        print(f"Document {doc_id}: Similarity Score {score}")

## Export all the features to a file

In [4]:
# Feature names as reported by the index's vectorizer.
feature_names = index.vectorizer.get_feature_names_out()

# Write one feature name per line. The encoding is pinned to UTF-8 so that
# terms containing non-ASCII characters are written identically on every
# platform instead of depending on the locale's default encoding (which can
# raise UnicodeEncodeError, e.g. with cp1252 on Windows).
# NOTE: the 'feature_names' directory must already exist.
with open('feature_names/countries_dates_ord_features.txt', 'w', encoding='utf-8') as f:
    # A single writelines call batches the output instead of issuing one
    # write per feature.
    f.writelines(f"{feature_name}\n" for feature_name in feature_names)
  