In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from copy import deepcopy
from tqdm import tqdm
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    roc_auc_score,
    average_precision_score,
)
from sklearn.neighbors import NearestNeighbors, NearestCentroid
from sklearn.cluster import AgglomerativeClustering
import ast
import itertools
import vrme_lib

import vrme_hyperparam

In [11]:
def get_atcs(folder, embedding, n_bootstrap=5000, max_dist_threshold=0.1, k_num_neighbors=10):
    """Estimate the 2018-vs-2017 rating difference for one data folder and embedding file.

    Two estimates are printed through ``vrme_lib.print_results`` and returned:

    * ``"Naive "``: mean ``AVG_rating`` of the 2018 frame minus that of the
      2017 frame.
    * ``"VRM-E"``: the anchor-year papers are grouped by agglomerative
      clustering of their embeddings, every cluster centroid is matched to its
      nearest papers of the other year (cosine distance, capped by the value
      returned from ``vrme_hyperparam.find_max_cosine``), and each anchor
      paper's own rating is compared with the rating derived from its matches.

    Both estimates come with a 2.5% / 97.5% percentile-bootstrap interval.

    If the 2017 frame has more rows than the 2018 frame, the two years trade
    places for the matching step (the smaller group is always the anchor) and
    the sign of the VRM-E estimate is flipped back before reporting.

    Parameters
    ----------
    folder : str
        Sub-folder of ``data/`` to load. The name is split at its first
        underscore into a "date" part and a "data" part; a name without an
        underscore is used unchanged for both.
    embedding : str
        File name below ``data/embeddings/``.
    n_bootstrap : int, default 5000
        Number of bootstrap resamples; resample ``i`` uses ``random_state=i``.
    max_dist_threshold : float, default 0.1
        ``distance_threshold`` handed to the agglomerative clustering.
    k_num_neighbors : int, default 10
        Number of nearest neighbours requested per cluster centroid. It is
        capped at the number of candidate papers so very small folders do not
        make ``kneighbors`` raise.

    Returns
    -------
    list
        ``[embedding, date, data, naive_atc, naive_ci_lower, naive_ci_upper,
        vrme_atc, vrme_ci_lower, vrme_ci_upper, max_cosine]``.
    """
    # A single load is enough: the naive estimate below only reads the frames.
    _, _, df_submission_labels, df_embeddings_2017, df_embeddings_2018 = vrme_lib.load_data(
        path='data/' + folder + "/",
        embedding_path='data/embeddings/' + embedding,
    )

    date, separator, remainder = folder.partition('_')
    data = remainder if separator else folder
    atc_results = [embedding, date, data]

    # ---- naive difference of means --------------------------------------
    ratings_2017 = df_embeddings_2017.AVG_rating
    ratings_2018 = df_embeddings_2018.AVG_rating
    naive_atc = ratings_2018.mean() - ratings_2017.mean()

    bootstrap_naive_mean = []
    for seed in range(n_bootstrap):
        sample_2017 = ratings_2017.sample(n=len(ratings_2017), replace=True, random_state=seed)
        sample_2018 = ratings_2018.sample(n=len(ratings_2018), replace=True, random_state=seed)
        bootstrap_naive_mean.append(sample_2018.mean() - sample_2017.mean())

    naive_ci_lower = np.quantile(bootstrap_naive_mean, 0.025)
    naive_ci_upper = np.quantile(bootstrap_naive_mean, 0.975)

    vrme_lib.print_results(name="Naive ",
                           atc=naive_atc,
                           ci_lower_bound=naive_ci_lower,
                           ci_upper_bound=naive_ci_upper)
    atc_results.extend([naive_atc, naive_ci_lower, naive_ci_upper])

    # ---- make the smaller year the anchor group --------------------------
    flipped = len(df_embeddings_2017) > len(df_embeddings_2018)
    if flipped:
        def swap_conf_year(df):
            """Return a copy of ``df`` with conf_year 2017 and 2018 exchanged."""
            df_copy = df.copy()
            # Both masks are taken before any write, so no placeholder value
            # is needed and unrelated conf_year values are never touched.
            is_2017 = df_copy['conf_year'] == 2017
            is_2018 = df_copy['conf_year'] == 2018
            df_copy.loc[is_2017, 'conf_year'] = 2018
            df_copy.loc[is_2018, 'conf_year'] = 2017
            return df_copy

        df_submission_labels = swap_conf_year(df_submission_labels)
        # Relabel the rows and exchange the two names in one step.
        df_embeddings_2017, df_embeddings_2018 = (
            swap_conf_year(df_embeddings_2018),
            swap_conf_year(df_embeddings_2017),
        )
    else:
        # Columns are added below; work on a private copy so the writes never
        # land on (a possible slice of) a frame owned by the loader.
        df_embeddings_2017 = df_embeddings_2017.copy()

    # ---- cluster the anchor papers ---------------------------------------
    anchor_embedding_2017 = np.array(df_embeddings_2017.embedding.tolist())
    clustering = AgglomerativeClustering(
        n_clusters=None,
        metric='cosine',
        distance_threshold=max_dist_threshold,
        linkage="average",
    ).fit(anchor_embedding_2017)
    df_embeddings_2017['agg_cluster'] = clustering.labels_.tolist()

    # Maximum cosine distance for an accepted match, tuned per data set
    # (original note: "ensure causal overlap").
    HYPERPARAM_b_max_cosine = vrme_hyperparam.find_max_cosine(
        df_embeddings_2017, df_embeddings_2018, clustering, df_submission_labels
    )

    # ---- nearest neighbours among the non-anchor papers ------------------
    non_anchor_embedding_2018 = np.array(df_embeddings_2018.embedding.to_list())
    # kneighbors() refuses n_neighbors larger than the number of fitted samples.
    n_neighbors = min(k_num_neighbors, len(non_anchor_embedding_2018))
    # radius is only consulted by radius_neighbors(); kept as in the original.
    neigh = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', radius=0.3)
    neigh.fit(non_anchor_embedding_2018)

    # One centroid per agglomerative cluster of the anchor group.
    clf = NearestCentroid()
    clf.fit(anchor_embedding_2017, clustering.labels_)

    # cluster id -> (frame of matched non-anchor papers, their cosine distances)
    dict_agg_cluster_matches = {}
    for cluster_id in np.unique(clustering.labels_):
        # Cluster labels run 0..n_clusters-1, so they index centroids_ directly.
        distances, indices = neigh.kneighbors([clf.centroids_[cluster_id]])
        close_enough = distances[0] <= HYPERPARAM_b_max_cosine
        dict_agg_cluster_matches[cluster_id] = (
            df_embeddings_2018.iloc[indices[0][close_enough].tolist(), :],
            distances[0][close_enough].tolist(),
        )

    def lambda_get_2018_matches(row):
        """Return (titles, keywords, paper ids, cosine distances) matched to the row's cluster."""
        matched_papers, cos_distances = dict_agg_cluster_matches[row.agg_cluster]
        return (
            matched_papers.title.tolist(),
            matched_papers.keywords.values.tolist(),
            matched_papers.paper_id.tolist(),
            cos_distances,
        )

    df_embeddings_2017[['titles_2018', 'keywords_2018', 'id_2018', 'cos_dist_2018']] = df_embeddings_2017.apply(
        lambda_get_2018_matches, axis=1, result_type='expand'
    )
    df_embeddings_2017['num_knn_matches'] = df_embeddings_2017['titles_2018'].map(len)

    assert df_embeddings_2017.shape[0] == df_submission_labels[df_submission_labels['conf_year'] == 2017].shape[0]

    # ---- per-paper effect and its bootstrap ------------------------------
    df_embeddings_2017['match_ave_rating'] = df_embeddings_2017.apply(
        lambda row: vrme_lib.lambda_get_match_potential_outcomes(row, df_embeddings_2018), axis=1
    )
    df_embeddings_2017['diff_2018_2017'] = df_embeddings_2017['match_ave_rating'] - df_embeddings_2017['AVG_rating']

    # Anchor papers without a matched rating cannot contribute to the estimate.
    df_embeddings_2017 = df_embeddings_2017[df_embeddings_2017['match_ave_rating'].notnull()]

    # The mean of diff_2018_2017 over the remaining rows is the matching
    # estimate (marked "Eqn 8" in the original notebook).
    # NOTE(review): as before, the reported point estimate is the mean of the
    # bootstrap means rather than that direct mean - confirm this is intended.
    bootstrap_mean = []
    for seed in range(n_bootstrap):
        sample = df_embeddings_2017.diff_2018_2017.sample(
            n=df_embeddings_2017.shape[0], replace=True, random_state=seed
        )
        bootstrap_mean.append(sample.mean())

    vrme_atc = statistics.mean(bootstrap_mean)
    vrme_ci_lower = np.quantile(bootstrap_mean, 0.025)
    vrme_ci_upper = np.quantile(bootstrap_mean, 0.975)
    if flipped:
        # The years were exchanged for matching: negate the estimate, which
        # also exchanges the roles of the two interval bounds.
        vrme_atc, vrme_ci_lower, vrme_ci_upper = -vrme_atc, -vrme_ci_upper, -vrme_ci_lower

    vrme_lib.print_results(name="VRM-E",
                           atc=vrme_atc,
                           ci_lower_bound=vrme_ci_lower,
                           ci_upper_bound=vrme_ci_upper)
    atc_results.extend([vrme_atc, vrme_ci_lower, vrme_ci_upper])

    atc_results.append(HYPERPARAM_b_max_cosine)
    return atc_results
    
        

In [None]:
# Sweep every embedding file over every (date, data) folder and keep the rows
# returned by get_atcs instead of discarding them, so the results can be
# tabulated without parsing the printed output.
embeddings = ["df_embeddings.csv", "doc2vec100_embeddings.csv", "doc2vec1000_embeddings.csv", "bow_embeddings.csv"]

dates = ["reviewer" , "rebuttal", "decision"]

data = ["all_2018", "found_2018", "all_2017_found_2018", "all_2017_2018"]

# Column order mirrors the order in which get_atcs appends to its result list.
ATC_RESULT_COLUMNS = [
    "embedding", "date", "data",
    "naive_atc", "naive_ci_lower", "naive_ci_upper",
    "vrme_atc", "vrme_ci_lower", "vrme_ci_upper",
    "max_cosine",
]

atc_rows = []
for embedding in embeddings:
    print("--------------------------------------------------------")
    print(embedding)
    print("--------------------------------------------------------")
    # product() walks dates in the outer position and data in the inner one,
    # i.e. the same order as two nested loops.
    for date, data_s in itertools.product(dates, data):
        folder = date + "_" + data_s
        print("**" + folder + "**")
        atc_rows.append(get_atcs(folder, embedding))
        print("\n")

df_atc_results = pd.DataFrame(atc_rows, columns=ATC_RESULT_COLUMNS)
df_atc_results