# Stage 1: Basic content search by tf-idf

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from scipy.sparse import save_npz, load_npz, csr_matrix

from scipy.spatial.distance import cosine

import preprocessing
import my_tfidf

In [None]:
# Every metadata column is read as a plain string; the mapping is handed to
# pandas as the ``dtype`` argument when the CSV is loaded.
dtypes = dict.fromkeys(
    [
        'cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
        'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
        'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
        'url', 's2_id', 'search_text', 'date',
    ],
    str,
)

In [None]:
# Load the metadata table and keep only the papers dated 2021.
path = 'results/final_models/metadata_2021.csv.gz'
data = pd.read_csv(path, sep='\t', dtype=dtypes)
data['date'] = pd.to_datetime(data['date'])
# Vectorized year comparison instead of calling a Python lambda per row;
# rows whose date is NaT compare False and are dropped, as before.
data = data[data['date'].dt.year == 2021]

# Restrict the frame to a fixed subset of columns.
data = data[['cord_uid', 'date', 'title', 'abstract', 'authors', 'doi',
             'url', 'pdf_json_files', 'pmc_json_files', 'search_text']]
# Text column that the (optional) vectorizer / term-document-matrix steps consume.
documents = data['search_text']
# Array of paper ids, passed to the search function as `index`.
index = data['cord_uid'].values

In [None]:
# # save to csv
# data.to_csv('results/final_models/metadata_2021.csv.gz', index=False, sep='\t', compression='gzip')

### Vectorize

In [None]:
# Directory prefix (note the trailing slash) that is concatenated with the
# model artifact file names in the cells below.
path = 'results/final_models/'

In [None]:
# # option 1: create vectorizer (uncomment desired option)
# vectorizer = my_tfidf.make_vectorizer(documents, pickle_path=path, save_files_prefix="_2021")

# option 2: load vectorizer from file
# The vectorizer must be loaded BEFORE it is pickled: the previous order
# dumped an undefined name on a fresh kernel (NameError) after `open(..., 'wb')`
# had already truncated the target file.
vectorizer = my_tfidf.load_vectorizer(path + 'vectorizer.pkl')

# Write a copy of the vectorizer under the streamlit file name.
# NOTE(review): this now saves the vectorizer loaded above; if the streamlit
# copy was meant to come from option 1 instead, move these two lines there.
with open('results/final_models/streamlit_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

In [None]:
# # option 1: create term-document matrix with vectorizer
# tdm = vectorizer.transform(documents)
# save_npz(path + 'streamlit_tdm.npz', tdm)

# option 2: read a previously saved sparse term-document matrix from disk
tdm = load_npz(f'{path}2021_tdm.npz')

### Run search on queries

In [None]:
def search_write_queries(queries, vectorizer, tdm, index, metadata, save_directory, num_top_results=5):
    """Run a tf-idf search for every query and write each result list to a text file.

    The results for the query at position ``i`` are written to a file named
    ``q{i}`` inside ``save_directory``. Each file starts with the query text,
    followed by one entry per result giving its ``cord_uid``, ``title`` and
    ``abstract``.

    Args:
        queries: Sequence of query strings.
        vectorizer: Passed through unchanged to ``my_tfidf.tfidf_search``.
        tdm: Passed through unchanged to ``my_tfidf.tfidf_search``.
        index: Passed through unchanged to ``my_tfidf.tfidf_search``.
        metadata: Passed through unchanged to ``my_tfidf.tfidf_search``.
        save_directory: Directory the result files are written to. It must
            already exist; a trailing path separator is optional.
        num_top_results: Forwarded to ``my_tfidf.tfidf_search`` as its
            ``num_top_results`` argument (previously a hard-coded 5 was sent
            regardless of this parameter).

    Returns:
        None. The function is used for its side effect of writing files.
    """

    def write_results(results_df, query, save_directory, filename):
        """Write ``query`` and the rows of ``results_df`` to ``save_directory/filename``."""
        path = os.path.join(save_directory, filename)
        # Explicit UTF-8 so titles/abstracts with non-ASCII characters are
        # written the same way on every platform.
        with open(path, 'w', encoding='utf-8') as file:
            file.write(query + '\n\n\n')
            # Iterate the frame that was passed in, not a variable captured
            # from the enclosing scope.
            for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
                file.write(f'Result {rank}: uid {row.cord_uid}\n\n{row.title}\n\n{row.abstract}\n\n\n')

    for i, query in enumerate(queries):
        results = my_tfidf.tfidf_search(query, vectorizer, tdm, index,
                                        metadata, num_top_results=num_top_results)
        write_results(results, query, save_directory, f'q{i}')

In [None]:
# Read the expert questions file and keep the `question` column as an array.
queries = pd.read_csv(
    'data/processed/questions_expert.csv',
    sep='\t',
    index_col=0,
)['question'].values

In [None]:
# Search every query and dump each result list to its own text file.
save_directory = 'results/final_models/tfidf_results/'
search_write_queries(
    queries=queries,
    vectorizer=vectorizer,
    tdm=tdm,
    index=index,
    metadata=data,
    save_directory=save_directory,
)