In [1]:
! pip install joblib
! pip install faiss-cpu


[notice] A new release of pip available: 22.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [1]:
import numpy as np
import csv
import os
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
import pandas as pd

# --- Configuration ---
# Directory that holds the per-year "<year>_vectors.csv" files.
path_vectors = '../data/vectors/'
# Rows of current-year vectors handed to each similarity task; lower it if memory is tight.
CHUNK_SIZE = 1_000
# CSV file that receives the per-paper cosine metrics.
OUTPUT_PATH = '../data/metrics/papers_cosine.csv'
# Number of joblib workers; -1 means "use every available core".
N_JOBS = -1

def load_vectors_for_year(year, vectors_dir=None):
    """Load the paper ids and embedding vectors stored for one year.

    The file ``<vectors_dir>/<year>_vectors.csv`` is read as tab-separated,
    purely numeric text. The first column is taken as the PaperId and every
    remaining column as one vector component.

    Parameters
    ----------
    year : int or str
        Year whose vector file should be read.
    vectors_dir : str, optional
        Directory containing the vector files. Defaults to the module-level
        ``path_vectors``.

    Returns
    -------
    papers_ids : numpy.ndarray of int64, shape (n_papers,)
        Identifier of each paper, in file order.
    vectors : numpy.ndarray of float32, shape (n_papers, n_dims)
        One vector per paper, aligned with ``papers_ids``.
    """
    print(f'Reading {year}...')

    if vectors_dir is None:
        vectors_dir = path_vectors
    file_path = os.path.join(vectors_dir, f"{year}_vectors.csv")

    # Parse as float64, not float32: float32 represents integers exactly only
    # up to 2**24, so larger PaperIds would be silently rounded to a different
    # id. float64 is exact up to 2**53. ndmin=2 keeps a single-row file 2-D so
    # the column slicing below still works.
    data = np.loadtxt(file_path, delimiter='\t', dtype=np.float64, ndmin=2)

    # First column -> ids (exact integers), remaining columns -> vectors.
    papers_ids = data[:, 0].astype(np.int64)
    # Down-cast the vector components to float32 to keep memory use low.
    vectors = data[:, 1:].astype(np.float32)

    return papers_ids, vectors

def _normalize_rows(matrix):
    """Scale every row to unit L2 norm; all-zero rows are returned as zeros."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # A zero vector has norm 0; dividing by it would give NaN. Dividing by 1
    # instead leaves the row as zeros, i.e. cosine similarity 0 with everything.
    norms[norms == 0] = 1
    return matrix / norms


def calculate_similarity_for_chunk(chunk, prior_data):
    """Compute mean and max cosine similarity of each chunk row to prior rows.

    Parameters
    ----------
    chunk : array-like, shape (n_chunk, n_dims)
        Vectors to score.
    prior_data : array-like, shape (n_prior, n_dims)
        Vectors every chunk row is compared against.

    Returns
    -------
    avg_dists : numpy.ndarray, shape (n_chunk,)
        Mean cosine similarity of each chunk row to all prior rows.
    max_dists : numpy.ndarray, shape (n_chunk,)
        Maximum cosine similarity of each chunk row to any prior row.

    Notes
    -----
    Zero vectors (in either input) are treated as having similarity 0 to
    everything, so a single zero row cannot turn the results into NaN. When
    ``prior_data`` has no rows the similarity is undefined and both outputs
    are filled with NaN.
    """
    if len(prior_data) == 0:
        # Nothing to compare against: np.max over an empty axis would raise.
        undefined = np.full(len(chunk), np.nan)
        return undefined, undefined.copy()

    # For unit-length rows the dot product equals the cosine similarity, so a
    # single matrix product yields the full (n_chunk, n_prior) similarity table.
    similarities = np.dot(_normalize_rows(chunk), _normalize_rows(prior_data).T)

    avg_dists = np.mean(similarities, axis=1)
    max_dists = np.max(similarities, axis=1)

    return avg_dists, max_dists


def calculate_avg_max_similarity(current_data, prior_data):
    """Compute per-row mean and max cosine similarity against ``prior_data``.

    ``current_data`` is split into consecutive slices of ``CHUNK_SIZE`` rows;
    each slice is scored by ``calculate_similarity_for_chunk`` in a joblib
    worker (``N_JOBS`` workers) and the per-slice results are concatenated in
    the original row order.

    Parameters
    ----------
    current_data : numpy.ndarray, shape (n_current, n_dims)
        Vectors to score.
    prior_data : numpy.ndarray, shape (n_prior, n_dims)
        Vectors every current row is compared against.

    Returns
    -------
    avg_similarities : numpy.ndarray, shape (n_current,)
    max_similarities : numpy.ndarray, shape (n_current,)
    """
    n_rows = len(current_data)
    if n_rows == 0:
        # No chunks would be produced, and np.concatenate raises on an empty
        # sequence, so return empty results directly.
        return np.empty(0), np.empty(0)

    chunk_starts = range(0, n_rows, CHUNK_SIZE)
    results = Parallel(n_jobs=N_JOBS)(
        delayed(calculate_similarity_for_chunk)(current_data[start:start + CHUNK_SIZE], prior_data)
        for start in tqdm(chunk_starts)
    )

    # Each result is an (avg, max) pair; regroup into one sequence per metric.
    avg_parts, max_parts = zip(*results)
    return np.concatenate(avg_parts), np.concatenate(max_parts)

def initialize_output_file():
    """Create (or truncate) the results CSV and write its header row."""
    header = ['PaperId', 'cosine_max', 'cosine_avg']
    with open(OUTPUT_PATH, mode='w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerow(header)

def save_to_csv(paper_ids, avg_similarities, max_similarities):
    """Append one row per paper to the results CSV.

    Rows are written in the column order ``PaperId, max, avg`` (matching the
    header written by ``initialize_output_file``). The three inputs are
    walked in lockstep, stopping at the shortest one.
    """
    # Note the swap: the inputs arrive as (avg, max) but the file stores (max, avg).
    rows = (
        [pid, best, mean]
        for pid, mean, best in zip(paper_ids, avg_similarities, max_similarities)
    )
    with open(OUTPUT_PATH, mode='a', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(rows)

C:\Users\u0152835\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.4SP5SUA7CBGXUEOC35YP2ASOICYYEQZZ.gfortran-win_amd64.dll
C:\Users\u0152835\Anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [2]:
# Year bounds for the processing loop.
start_year, end_year = 1895, 2020

In [None]:
# Number of preceding years each year's papers are compared against.
N_PRIOR_YEARS = 5

# Vectors of the most recently processed years, oldest first. While a year is
# being scored this holds ONLY earlier years; the current year is appended
# afterwards, so the comparison set is exactly the prior N_PRIOR_YEARS years.
rolling_data = []
# NOTE: range() excludes end_year itself; the last year processed is end_year - 1.
years = range(start_year, end_year)

# Start from a fresh output CSV that contains only the header row.
initialize_output_file()

for year in tqdm(years):
    papers_ids, current_year_data = load_vectors_for_year(year)

    # Score this year only once a full window of prior years is available;
    # the first N_PRIOR_YEARS years are loaded but produce no output rows.
    if len(rolling_data) == N_PRIOR_YEARS:
        # Stack the prior years into one (n_prior_papers, n_dims) matrix.
        prior_data = np.vstack(rolling_data)

        # Calculate cosine similarities of this year's papers to the prior window.
        avg_year_similarities, max_year_similarities = calculate_avg_max_similarity(
            current_year_data, prior_data
        )

        # Append this year's results to the CSV.
        save_to_csv(papers_ids, avg_year_similarities, max_year_similarities)

    # Slide the window: the current year becomes prior data for later years,
    # and the oldest year is dropped once the window is over-full.
    rolling_data.append(current_year_data)
    if len(rolling_data) > N_PRIOR_YEARS:
        rolling_data.pop(0)