In [None]:
! pip install joblib
! pip install faiss-cpu

In [42]:
# Year range to process; main() iterates range(start_year, end_year),
# so end_year itself is not included.
start_year, end_year = 1895, 2020

In [77]:
import numpy as np
import csv
import os
import faiss
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

# --- Configuration -----------------------------------------------------
# Directory holding the per-year "<year>_vectors.csv" files.
path_vectors = 'D:/Users/Nicola Melluso/Work/MAG/NBER/data/papers_vectors/'
# CSV file that receives the similarity metrics (adjust as needed).
OUTPUT_PATH = '../data/metrics/papers_cosine.csv'
# Number of current-year rows handed to one worker call; lower it if
# memory is tight.
CHUNK_SIZE = 1000
# joblib worker count; -1 means "use all available cores".
N_JOBS = -1

def load_vectors_for_year(year, vectors_dir=None):
    """Load the paper ids and embedding vectors stored for one year.

    The file ``<year>_vectors.csv`` is read as tab-separated text: column 0
    is parsed as the integer paper id and columns 1-768 as the vector
    components.

    Args:
        year: Year whose vector file should be read.
        vectors_dir: Directory containing the per-year files. Defaults to
            the module-level ``path_vectors``.

    Returns:
        A ``(papers_ids, vectors)`` tuple. ``papers_ids`` is always a 1-D
        int64 array and ``vectors`` always a 2-D float32 array with one row
        per paper, even when the file contains a single row.
    """
    print(f'Reading {year}...')
    if vectors_dir is None:
        vectors_dir = path_vectors
    file_path = os.path.join(vectors_dir, f"{year}_vectors.csv")

    # ndmin pins the rank: without it np.loadtxt squeezes a one-row file to
    # a 1-D vector / 0-d id, which breaks vstack, faiss and zip downstream.
    vectors = np.loadtxt(
        file_path, delimiter='\t', usecols=range(1, 769),
        dtype=np.float32, ndmin=2,
    )
    # Ids are parsed in a separate pass as int64; routing them through the
    # float32 vector array would corrupt large identifiers.
    papers_ids = np.loadtxt(
        file_path, delimiter='\t', usecols=0, dtype=np.int64, ndmin=1,
    )
    return papers_ids, vectors

def calculate_similarity_for_chunk(chunk, prior_data):
    """Compute average and maximum cosine similarity of each row to prior data.

    Args:
        chunk: 2-D array, one query vector per row.
        prior_data: 2-D array of reference vectors with the same number of
            columns as ``chunk``.

    Returns:
        ``(avg_sim, max_sim)``: two 1-D float32 arrays with one entry per row
        of ``chunk``. ``avg_sim`` is the mean cosine similarity to *all* rows
        of ``prior_data``; ``max_sim`` is the cosine similarity to the single
        most similar row.
    """
    # astype() copies, so the in-place normalization below never touches the
    # caller's arrays; order='C' gives faiss the contiguous layout it needs.
    chunk = chunk.astype(np.float32, order='C')
    prior_data = prior_data.astype(np.float32, order='C')

    # After L2 normalization the inner product equals cosine similarity.
    faiss.normalize_L2(chunk)
    faiss.normalize_L2(prior_data)

    # Maximum: exact top-1 inner-product search over the prior vectors.
    index = faiss.IndexFlatIP(prior_data.shape[1])
    index.add(prior_data)
    similarities, _ = index.search(chunk, 1)
    max_sim = similarities[:, 0]

    # Average: the previous code averaged the top-1 result only, so the
    # "average" was always identical to the maximum. The mean of the inner
    # products with every prior vector equals the inner product with the
    # mean prior vector, which avoids materializing the full similarity
    # matrix. Accumulate in float64 to limit rounding error on large inputs.
    prior_centroid = prior_data.mean(axis=0, dtype=np.float64)
    avg_sim = (chunk @ prior_centroid).astype(np.float32)

    return avg_sim, max_sim


def calculate_avg_max_similarity(current_data, prior_data):
    """Calculate average and max cosine similarities, chunk by chunk.

    ``current_data`` is split into slices of ``CHUNK_SIZE`` rows; each slice
    is passed to ``calculate_similarity_for_chunk`` through joblib using
    ``N_JOBS`` workers, and the per-slice results are concatenated in order.

    Args:
        current_data: 2-D array of query vectors, one per row.
        prior_data: 2-D array of reference vectors.

    Returns:
        ``(avg_similarities, max_similarities)``: two 1-D arrays with one
        entry per row of ``current_data`` (both empty when there are no rows).
    """
    # No rows means no chunks, and np.concatenate([]) raises ValueError;
    # return empty results instead so an empty year does not abort the run.
    if len(current_data) == 0:
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32)

    results = Parallel(n_jobs=N_JOBS)(
        delayed(calculate_similarity_for_chunk)(current_data[i:i+CHUNK_SIZE], prior_data)
        for i in tqdm(range(0, len(current_data), CHUNK_SIZE))
    )
    avg_similarities = np.concatenate([res[0] for res in results])
    max_similarities = np.concatenate([res[1] for res in results])
    return avg_similarities, max_similarities

def initialize_output_file():
    """Create (or truncate) the results CSV and write its header row."""
    header = ['PaperId', 'cosine_max', 'cosine_avg']
    with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as out:
        csv.writer(out).writerow(header)

def save_to_csv(paper_ids, avg_similarities, max_similarities):
    """Append one ``PaperId, cosine_max, cosine_avg`` row per paper to the CSV.

    The three inputs are iterated in lockstep; note that the written column
    order (max before avg) differs from the argument order.
    """
    rows = (
        [pid, best, mean]
        for pid, mean, best in zip(paper_ids, avg_similarities, max_similarities)
    )
    with open(OUTPUT_PATH, 'a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows(rows)

def main():
    """Score every year's papers against the papers of the 5 preceding years.

    For each year in ``range(start_year, end_year)`` (``end_year`` excluded)
    the year's vectors are loaded and, once 5 earlier years are available,
    their average and maximum cosine similarity to the stacked vectors of
    those 5 years are appended to the output CSV. The first 5 years only
    seed the window and produce no rows.
    """
    window_years = 5
    # Vectors of the most recent `window_years` years *before* the current one.
    prior_years_data = []

    # Start from a fresh output file containing only the header.
    initialize_output_file()

    for year in tqdm(range(start_year, end_year)):
        papers_ids, current_year_data = load_vectors_for_year(year)

        # Compare only once a full window of prior years exists. The earlier
        # version put the current year inside the 5-slot window, so it
        # actually compared against 4 prior years.
        if len(prior_years_data) == window_years:
            prior_data = np.vstack(prior_years_data)
            avg_year_similarities, max_year_similarities = calculate_avg_max_similarity(
                current_year_data, prior_data
            )
            save_to_csv(papers_ids, avg_year_similarities, max_year_similarities)
            # Release the stacked copy before the next year's file is loaded.
            del prior_data

        # Slide the window: the current year becomes prior data for the next.
        prior_years_data.append(current_year_data)
        if len(prior_years_data) > window_years:
            prior_years_data.pop(0)

# Run the full pipeline only when executed as a script / notebook cell,
# not when this code is imported as a module.
if __name__ == "__main__":
    main()


  0%|          | 0/125 [00:00<?, ?it/s]

Reading 1895...
Reading 1896...


  0%|          | 0/20 [00:00<?, ?it/s]

Reading 1897...


  0%|          | 0/21 [00:00<?, ?it/s]

Reading 1898...


  0%|          | 0/21 [00:00<?, ?it/s]

Reading 1899...


  0%|          | 0/21 [00:00<?, ?it/s]

Reading 1900...


  0%|          | 0/22 [00:00<?, ?it/s]

Reading 1901...


  0%|          | 0/23 [00:00<?, ?it/s]

Reading 1902...


  0%|          | 0/23 [00:00<?, ?it/s]

Reading 1903...


  0%|          | 0/24 [00:00<?, ?it/s]

Reading 1904...


  0%|          | 0/24 [00:00<?, ?it/s]

Reading 1905...


  0%|          | 0/25 [00:00<?, ?it/s]

Reading 1906...


  0%|          | 0/26 [00:00<?, ?it/s]

Reading 1907...


  0%|          | 0/27 [00:00<?, ?it/s]

Reading 1908...


  0%|          | 0/29 [00:00<?, ?it/s]

Reading 1909...


  0%|          | 0/31 [00:00<?, ?it/s]

Reading 1910...


  0%|          | 0/32 [00:00<?, ?it/s]

Reading 1911...


  0%|          | 0/34 [00:00<?, ?it/s]

Reading 1912...


  0%|          | 0/34 [00:00<?, ?it/s]

Reading 1913...


  0%|          | 0/35 [00:00<?, ?it/s]

Reading 1914...


  0%|          | 0/34 [00:00<?, ?it/s]

Reading 1915...


  0%|          | 0/30 [00:00<?, ?it/s]

Reading 1916...


  0%|          | 0/29 [00:00<?, ?it/s]

Reading 1917...


KeyboardInterrupt: 