In [1]:
import os
# Root directories of the CQFS result logs, one per dataset / ICM.
# NOTE(review): these are hard-coded absolute paths on a specific machine, and
# the stored output of a later cell shows `dr_cite_results` with a different
# value ('/mnt/bulky2/nsukhorukov/...'), so some saved outputs in this notebook
# were produced with other paths than the ones assigned here.
dr_xing_results = '/trinity/home/nikita.sukhorukov/CQFS/results/XingChallenge2017/ICM_all'
dr_tmd_results = '/trinity/home/nikita.sukhorukov/CQFS/results/TheMoviesDataset/ICM_metadata/'
dr_cite_results = '/trinity/home/nikita.sukhorukov/CQFS/results/CiteULike_a/ICM_title_abstract/'

In [2]:
# presumably the feature-selection percentages of interest — TODO confirm.
# NOTE(review): PS is not referenced by any cell visible in this section.
PS = [40, 60, 80, 95]

In [3]:
import re
# One pattern with a named group per metric (precision, recall, ndcg, map,
# coverage, gini, mil). The adjacent raw-string literals are concatenated into a
# single pattern and no flags are passed, so `.` does not match newlines: all
# seven labels have to appear on one line, in exactly this order.
# NOTE(review): REGEX_RESULTS is not referenced by any cell visible in this
# section; the parsing functions defined below build their own per-metric
# patterns instead.
REGEX_RESULTS = re.compile(
    r'.*PRECISION: (?P<precision>.*).*'
    r'.*RECALL: (?P<recall>.*).*'
    r'.*NDCG: (?P<ndcg>.*).*'
    r'.*MAP: (?P<map>.*).*'
    r'.*COVERAGE_ITEM: (?P<coverage>.*).*'
    r'.*DIVERSITY_GINI: (?P<gini>.*).*'
    r'.*DIVERSITY_MEAN_INTER_LIST: (?P<mil>.*).*'
)

In [4]:
import re

# Cutoffs assigned, in order, to the result lines that follow the final-test marker.
CUTOFF = [5, 10, 20, 50]

# Metric labels looked up in every results line.
_METRIC_NAMES = (
    'PRECISION', 'RECALL', 'NDCG', 'MAP', 'COVERAGE_USER',
    'COVERAGE_ITEM', 'DIVERSITY_GINI', 'DIVERSITY_MEAN_INTER_LIST',
)
_VALIDATION_MARKER = 'SearchBayesianSkopt: New best config found.'
_TEST_MARKER = 'SearchBayesianSkopt: Best config evaluated with evaluator_test with constructor data for final test'
# Decimal number with optional sign and exponent (e.g. 0.0978441, 1.5E-03).
_FLOAT_PATTERN = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'


def _parse_metrics(text):
    """Return ``{label: value}`` for every label of ``_METRIC_NAMES`` found in ``text``.

    Labels that do not occur as ``LABEL: <number>`` are left out instead of
    raising, because not every log contains every metric.
    """
    parsed = {}
    for metric in _METRIC_NAMES:
        # \b keeps a label from matching as the tail of a longer identifier.
        match = re.search(rf'\b{metric}: ({_FLOAT_PATTERN})', text)
        if match is not None:
            parsed[metric] = float(match.group(1))
    return parsed


def find_validation_results(fin):
    """Extract the validation metrics of the best configuration from a search log.

    Parameters
    ----------
    fin : iterable of str
        Lines of the log (an open text file or any other iterable of lines).

    Returns
    -------
    dict
        ``{metric label: float}`` parsed from the *last* line that starts with
        'SearchBayesianSkopt: New best config found.'; an empty dict when no
        such line exists. Metrics missing from that line are omitted.
    """
    metrics = {}
    for line in fin:
        if line.startswith(_VALIDATION_MARKER):
            # Every later "new best" line supersedes the previous one.
            metrics = _parse_metrics(line)
    return metrics


def find_test_results(fin):
    """Extract the final-test metrics, one dict per cutoff, from a search log.

    The lines directly after the final-test marker line are paired, in order,
    with the values of ``CUTOFF``.

    Parameters
    ----------
    fin : iterable of str
        Lines of the log (an open text file or any other iterable of lines).

    Returns
    -------
    dict
        ``{cutoff: {metric label: float}}``. Empty when the marker line is
        absent; contains fewer cutoffs when the log ends, or stops listing
        metrics, before ``len(CUTOFF)`` result lines were read.
    """
    lines = iter(fin)
    for line in lines:
        if line.startswith(_TEST_MARKER):
            break
    else:
        # The run never reached the final evaluation.
        return {}

    metrics = {}
    # zip() asks CUTOFF first, so at most len(CUTOFF) lines are consumed.
    for cutoff, results in zip(CUTOFF, lines):
        parsed = _parse_metrics(results.strip())
        if not parsed:
            # Not a results line: do not attach unrelated text to a cutoff.
            break
        metrics[cutoff] = parsed
    return metrics

In [5]:
import re
import numpy as np
# Experiment directory names such as 'a04r100d-15p005'. The four captured groups
# are the digit strings after 'a0', 'r', 'd' (optionally signed) and 'p0'.
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
REGEX_EXP_DIR = re.compile(r'^a0(\d+)r(\d+)d([+-]?\d+)p0(\d{2})$')

def get_baseline_results(dr, *, ICM_name):
    """Collect validation and test metrics of the baseline runs stored under ``dr``.

    The returned list contains, in this order: the plain 'ItemKNN CBF' run,
    the 'TFIDF' runs and the 'popular' runs, the latter two for the
    percentages 5, 20, 40, 60, 80 and 95. The CFeCBF runs are not included.

    Parameters
    ----------
    dr : str
        Root results directory of one dataset / ICM.
    ICM_name : str
        ICM name embedded in the log file name.

    Returns
    -------
    list of dict
        One dict per run with the keys 'path', 'name' and 'metrics', where
        'metrics' is ``{'validation': ..., 'test': ...}`` as produced by
        ``find_validation_results`` and ``find_test_results``.

    Raises
    ------
    FileNotFoundError
        If one of the expected log files does not exist.
    """
    log_name = f'ItemKNNCBFRecommender_{ICM_name}_cosine_SearchBayesianSkopt.txt'

    experiments = [
        {
            'path': os.path.join(dr, 'ItemKNNCBFRecommender', log_name),
            'name': 'ItemKNN CBF',
        }
    ]
    # The sub-directory name doubles as the prefix of the display name.
    for selector in ('TFIDF', 'popular'):
        experiments.extend(
            {
                'path': os.path.join(dr, selector, 'p%03d' % p, 'ItemKNNCBFRecommender', log_name),
                'name': f'{selector} {p}%',
            }
            for p in (5, 20, 40, 60, 80, 95)
        )

    for experiment in experiments:
        with open(experiment['path'], 'r') as fin:
            validation = find_validation_results(fin)
            # Rewind: the validation pass consumed the whole file.
            fin.seek(0)
            test = find_test_results(fin)
        experiment['metrics'] = {'validation': validation, 'test': test}

    return experiments

    

def get_baseline_results_artyom(dr, *, ICM_name):
    """Collect validation and test metrics of the baseline runs stored under ``dr``.

    Variant of ``get_baseline_results`` with a different set of runs. The
    returned list contains, in this order: the plain 'ItemKNN CBF' run, the
    'TFIDF' runs for the percentages 40, 60, 80 and 95, and the three CFeCBF
    runs (ItemKNN, PureSVD, RP3Beta). The 'popular' runs are not included.

    Parameters
    ----------
    dr : str
        Root results directory of one dataset / ICM.
    ICM_name : str
        ICM name embedded in the log file names.

    Returns
    -------
    list of dict
        One dict per run with the keys 'path', 'name' and 'metrics', where
        'metrics' is ``{'validation': ..., 'test': ...}`` as produced by
        ``find_validation_results`` and ``find_test_results``.

    Raises
    ------
    FileNotFoundError
        If one of the expected log files does not exist.
    """
    cbf_log_name = f'ItemKNNCBFRecommender_{ICM_name}_cosine_SearchBayesianSkopt.txt'
    cfecbf_log_name = f'CFW_D_Similarity_Cython_{ICM_name}_SearchBayesianSkopt.txt'

    experiments = [
        {
            'path': os.path.join(dr, 'ItemKNNCBFRecommender', cbf_log_name),
            'name': 'ItemKNN CBF',
        }
    ]
    experiments.extend(
        {
            'path': os.path.join(dr, 'TFIDF', 'p%03d' % p, 'ItemKNNCBFRecommender', cbf_log_name),
            'name': f'TFIDF {p}%',
        }
        for p in (40, 60, 80, 95)
    )
    # (collaborative model directory, display name) of each CFeCBF run.
    cfecbf_runs = (
        ('ItemKNNCFRecommender', 'CFeCBF ItemKNN'),
        ('PureSVDItemRecommender', 'CFeCBF PureSVD'),
        ('RP3betaRecommender', 'CFeCBF RP3Beta'),
    )
    experiments.extend(
        {
            'path': os.path.join(dr, 'CFW_D_Similarity_Cython', model_dir, cfecbf_log_name),
            'name': display_name,
        }
        for model_dir, display_name in cfecbf_runs
    )

    for experiment in experiments:
        with open(experiment['path'], 'r') as fin:
            validation = find_validation_results(fin)
            # Rewind: the validation pass consumed the whole file.
            fin.seek(0)
            test = find_test_results(fin)
        experiment['metrics'] = {'validation': validation, 'test': test}

    return experiments

    
def get_experiment_results(dr, *, ICM_name, recommender='ItemKNNCFRecommender'):
    """Collect validation and test metrics of all finished CQFS runs of one recommender.

    Every entry of ``<dr>/<recommender>`` whose name matches ``REGEX_EXP_DIR``
    is inspected. A run counts as done when its log contains the final-test
    marker line and as not done when the log exists without that line; entries
    without a log file are ignored and not counted. A one-line summary
    ``done + not_done = total (done ratio)`` is printed.

    Parameters
    ----------
    dr : str
        Root results directory of one dataset / ICM.
    ICM_name : str
        ICM name embedded in the log file name.
    recommender : str
        Name of the sub-directory of ``dr`` that holds the experiment folders.

    Returns
    -------
    list of dict
        One dict per finished run with the keys 'path', 'name', 'alpha',
        'rank', 'd', 'p', '%' and 'metrics', sorted by 'p'. Directory entries
        are visited in sorted name order, so the order among equal 'p' is
        reproducible.
    """
    test_marker = 'SearchBayesianSkopt: Best config evaluated with evaluator_test with constructor data for final test'
    log_name = f'ItemKNNCBFRecommender_{ICM_name}_cosine_SearchBayesianSkopt.txt'
    dr_cqfs = os.path.join(dr, recommender)
    # Short display name, e.g. 'PureSVDItemRecommender' -> 'PureSVD'.
    short_name = (
        recommender
        .replace("ItemRecommender", "")
        .replace("Recommender", "")
        .replace("CF", "")
        .replace("RP3beta", "RP3Beta")
    )

    done = 0
    not_done = 0
    experiments = []
    for file in sorted(os.listdir(dr_cqfs)):
        match = REGEX_EXP_DIR.match(file)
        if not match:
            continue

        a, r, d, p = match.groups()
        # 'a' and 'd' are stored in tenths in the directory name.
        a = int(a) / 10
        r = int(r)
        d = int(d) / 10
        p = int(p)
        path = os.path.join(dr_cqfs, file, 'cqfs_hsvd', 'ItemKNNCBFRecommender', log_name)
        if not os.path.exists(path):
            continue

        with open(path, 'r') as fin:
            if not any(line.startswith(test_marker) for line in fin):
                not_done += 1
                continue
            done += 1
            # Rewind before each parsing pass over the same handle.
            fin.seek(0)
            validation = find_validation_results(fin)
            fin.seek(0)
            test = find_test_results(fin)

        experiments.append({
            'path': path,
            'name': f'CQFS {short_name} {p}%',
            'alpha': a,
            'rank': r,
            'd': d,
            'p': p,
            '%': p,
            'metrics': {'validation': validation, 'test': test},
        })

    experiments.sort(key=lambda x: x['p'])
    total = done + not_done
    # Without any log there is no ratio to report; avoid dividing by zero.
    ratio = done / total if total else float('nan')
    print(f'{done} + {not_done} = {total} ({ratio})')
    return experiments

In [6]:
xing_baselines = get_baseline_results(dr_xing_results, ICM_name='ICM_all')

In [7]:
xing_experiments_svd = get_experiment_results(
    dr_xing_results, ICM_name='ICM_all',
    recommender='PureSVDItemRecommender'
)

189 + 276 = 465 (0.4064516129032258)


In [8]:
cite_baselines = get_baseline_results(dr_cite_results, ICM_name='ICM_title_abstract')

In [9]:
cite_experiments_svd = get_experiment_results(
    dr_cite_results, ICM_name='ICM_title_abstract',
    recommender='PureSVDItemRecommender'
)

391 + 113 = 504 (0.7757936507936508)


In [10]:
tmd_baselines = get_baseline_results(dr_tmd_results, ICM_name='ICM_metadata')

In [11]:
tmd_experiments_svd = get_experiment_results(
    dr_tmd_results, ICM_name='ICM_metadata',
    recommender='PureSVDItemRecommender'
)

361 + 503 = 864 (0.41782407407407407)


In [12]:
import json, os
# Bundle the parsed baseline and CQFS (PureSVD) results of the three datasets
# and write them to 'results.json' in the current working directory.
# NOTE: the per-cutoff 'test' dicts are keyed by ints (5, 10, 20, 50); JSON
# object keys are always strings, so they come back as '5', '10', ... when the
# file is loaded again.
results_json = {
    'cite': {
        'baseline': cite_baselines,
        # 'knn': cite_experiments_knn,
        'svd': cite_experiments_svd,
        # 'rp3b': cite_experiments_rp3b,
    },
    'tmd': {
        'baseline': tmd_baselines,
        # 'knn': tmd_experiments_knn,
        'svd': tmd_experiments_svd,
        # 'rp3b': tmd_experiments_rp3b,
    },
    'xing': {
        'baseline': xing_baselines,
        'svd': xing_experiments_svd
    }
}
with open('results.json', 'w') as fout:
    json.dump(results_json, fout)

In [14]:
cite_experiments_svd[2]

{'path': '/trinity/home/nikita.sukhorukov/CQFS/results/CiteULike_a/ICM_title_abstract/PureSVDItemRecommender/a04r100d-15p005/cqfs_hsvd/ItemKNNCBFRecommender/ItemKNNCBFRecommender_ICM_title_abstract_cosine_SearchBayesianSkopt.txt',
 'name': 'CQFS PureSVD 5%',
 'alpha': 0.4,
 'rank': 100,
 'd': -1.5,
 'p': 5,
 '%': 5,
 'metrics': {'validation': {'PRECISION': 0.0978441,
   'RECALL': 0.2615136,
   'NDCG': 0.1963239,
   'MAP': 0.1289849,
   'COVERAGE_USER': 0.8690326,
   'COVERAGE_ITEM': 0.3314127,
   'DIVERSITY_GINI': 0.1698469,
   'DIVERSITY_MEAN_INTER_LIST': 0.9882531},
  'test': {5: {'PRECISION': 0.1386489,
    'RECALL': 0.1148768,
    'NDCG': 0.1217012,
    'MAP': 0.105929,
    'COVERAGE_USER': 0.9760404,
    'COVERAGE_ITEM': 0.7983168,
    'DIVERSITY_GINI': 0.346182,
    'DIVERSITY_MEAN_INTER_LIST': 0.9947153},
   10: {'PRECISION': 0.1186969,
    'RECALL': 0.1877255,
    'NDCG': 0.162455,
    'MAP': 0.1031437,
    'COVERAGE_USER': 0.9760404,
    'COVERAGE_ITEM': 0.9107304,
    'DIVERS

In [26]:
dr_cite_results_artyom = '/mnt/bulky/anikitin/Developer/CQFS/results/CiteULike_a/ICM_title_abstract/'

In [27]:
cite_baselines_artyom = get_baseline_results_artyom(dr_cite_results_artyom, ICM_name='ICM_title_abstract')

In [28]:
cite_baselines = get_baseline_results(dr_cite_results, ICM_name='ICM_title_abstract')


In [29]:
cite_baselines

[{'path': '/mnt/bulky2/nsukhorukov/CQFS/results/CiteULike_a/ICM_title_abstract/popular/p020/ItemKNNCBFRecommender/ItemKNNCBFRecommender_ICM_title_abstract_cosine_SearchBayesianSkopt.txt',
  'name': 'popular 20%',
  'metrics': {'validation': {'PRECISION': 0.1134768,
    'RECALL': 0.3096767,
    'NDCG': 0.2372417,
    'MAP': 0.1613923,
    'COVERAGE_ITEM': 0.3219192,
    'DIVERSITY_GINI': 0.1700835,
    'DIVERSITY_MEAN_INTER_LIST': 0.9885168},
   'test': {5: {'PRECISION': 0.1706534,
     'RECALL': 0.1522832,
     'NDCG': 0.1596466,
     'MAP': 0.1387608,
     'COVERAGE_ITEM': 0.7832542,
     'DIVERSITY_GINI': 0.3452774,
     'DIVERSITY_MEAN_INTER_LIST': 0.9949173},
    10: {'PRECISION': 0.1387228,
     'RECALL': 0.234395,
     'NDCG': 0.2048676,
     'MAP': 0.1332815,
     'COVERAGE_ITEM': 0.9207245,
     'DIVERSITY_GINI': 0.4310629,
     'DIVERSITY_MEAN_INTER_LIST': 0.9925847},
    20: {'PRECISION': 0.1065707,
     'RECALL': 0.3357168,
     'NDCG': 0.250566,
     'MAP': 0.1397936,
     

In [30]:
len(cite_baselines_artyom)

8

In [37]:
cite_baselines[6]

{'path': '/mnt/bulky2/nsukhorukov/CQFS/results/CiteULike_a/ICM_title_abstract/random_s0/p040/ItemKNNCBFRecommender/ItemKNNCBFRecommender_ICM_title_abstract_cosine_SearchBayesianSkopt.txt',
 'name': 'random 40%',
 'metrics': {'validation': {'PRECISION': 0.1006268,
   'RECALL': 0.2768023,
   'NDCG': 0.2128156,
   'MAP': 0.1423997,
   'COVERAGE_ITEM': 0.3237109,
   'DIVERSITY_GINI': 0.1559972,
   'DIVERSITY_MEAN_INTER_LIST': 0.9867776},
  'test': {5: {'PRECISION': 0.1545958,
    'RECALL': 0.1374889,
    'NDCG': 0.1451903,
    'MAP': 0.1237805,
    'COVERAGE_ITEM': 0.7811758,
    'DIVERSITY_GINI': 0.323503,
    'DIVERSITY_MEAN_INTER_LIST': 0.9943174},
   10: {'PRECISION': 0.1263381,
    'RECALL': 0.2121832,
    'NDCG': 0.1866683,
    'MAP': 0.1195884,
    'COVERAGE_ITEM': 0.9118171,
    'DIVERSITY_GINI': 0.3973434,
    'DIVERSITY_MEAN_INTER_LIST': 0.9915396},
   20: {'PRECISION': 0.0966131,
    'RECALL': 0.3033097,
    'NDCG': 0.2276502,
    'MAP': 0.1250831,
    'COVERAGE_ITEM': 0.9833729

In [31]:
len(cite_baselines)

10

In [32]:
cite_baselines_artyom

[{'path': '/mnt/bulky/anikitin/Developer/CQFS/results/CiteULike_a/ICM_title_abstract/ItemKNNCBFRecommender/ItemKNNCBFRecommender_ICM_title_abstract_cosine_SearchBayesianSkopt.txt',
  'name': 'ItemKNN CBF',
  'metrics': {'validation': {'PRECISION': 0.1281095,
    'RECALL': 0.3638505,
    'NDCG': 0.2816181,
    'MAP': 0.1972879,
    'COVERAGE_ITEM': 0.3363799,
    'DIVERSITY_GINI': 0.1868928,
    'DIVERSITY_MEAN_INTER_LIST': 0.989447},
   'test': {5: {'PRECISION': 0.2053525,
     'RECALL': 0.1855613,
     'NDCG': 0.1976089,
     'MAP': 0.1794579,
     'COVERAGE_ITEM': 0.8425008,
     'DIVERSITY_GINI': 0.3959207,
     'DIVERSITY_MEAN_INTER_LIST': 0.995404},
    10: {'PRECISION': 0.1634367,
     'RECALL': 0.2793788,
     'NDCG': 0.2492682,
     'MAP': 0.1725762,
     'COVERAGE_ITEM': 0.9576195,
     'DIVERSITY_GINI': 0.4835576,
     'DIVERSITY_MEAN_INTER_LIST': 0.9932751},
    20: {'PRECISION': 0.1231082,
     'RECALL': 0.3906961,
     'NDCG': 0.2997592,
     'MAP': 0.1795838,
     'COVERA

In [33]:
dr_cite_results

'/mnt/bulky2/nsukhorukov/CQFS/results/CiteULike_a/ICM_title_abstract/'

In [34]:
# cite_experiments_knn = get_experiment_results(
#     dr_cite_results, ICM_name='ICM_title_abstract',
#     recommender='ItemKNNCFRecommender'
# )
cite_experiments_svd = get_experiment_results(
    dr_cite_results, ICM_name='ICM_title_abstract',
    recommender='PureSVDItemRecommender'
)
# cite_experiments_rp3b = get_experiment_results(
#     dr_cite_results, ICM_name='ICM_title_abstract',
#     recommender='RP3betaRecommender'
# )

0.1 100 -1.0 60
0.1 400 -1.0 80
0.1 100 -0.4 40
0.9 100 0.0 80
0.1 100 0.0 95
0.9 400 -0.5 40
0.1 100 -0.4 60
0.1 100 -1.5 80
0.3 400 0.5 30
0.3 100 -0.5 30
0.3 200 0.5 20
0.1 100 -0.4 95
0.1 100 -0.7 20
0.3 100 0.0 80
0.9 100 -0.5 80
0.1 100 -2.0 20
0.1 200 -1.0 40
0.9 100 -1.5 80
0.5 200 0.0 30
0.5 400 -1.5 60
0.9 200 -0.5 80
0.9 400 -0.5 95
0.1 400 -1.0 40
0.9 400 -1.5 80
0.3 100 -0.5 60
0.3 400 -1.5 60
0.9 400 0.5 60
0.1 100 -2.0 80
0.3 400 0.0 20
0.3 100 -0.5 80
0.3 400 -0.5 80
0.1 400 0.0 80
0.3 200 -1.5 30
0.1 400 -0.7 40
0.1 200 0.0 20
0.9 400 -0.5 30
0.3 400 0.0 40
0.9 100 -0.5 95
0.9 200 -0.5 60
0.9 400 0.0 95
0.1 200 0.0 30
0.1 200 -0.4 60
0.1 400 0.0 20
0.3 400 -0.5 95
0.1 200 -2.0 40
0.3 400 -1.5 30
0.3 200 0.5 95
0.9 100 0.5 95
0.3 400 -1.5 20
0.1 400 0.5 95
0.1 200 -1.5 30
0.3 400 0.0 80
0.1 400 -2.0 20
0.3 100 -1.5 40
0.1 100 0.0 60
0.5 400 0.5 30
0.9 100 -0.5 60
0.5 200 0.5 40
0.5 100 -1.5 30
0.3 100 -1.5 95
0.3 100 -0.5 95
0.9 200 -1.5 40
0.3 200 0.5 30
0.1 100 -0.7 8

In [35]:
cite_baselines

[{'path': '/mnt/bulky2/nsukhorukov/CQFS/results/CiteULike_a/ICM_title_abstract/popular/p020/ItemKNNCBFRecommender/ItemKNNCBFRecommender_ICM_title_abstract_cosine_SearchBayesianSkopt.txt',
  'name': 'popular 20%',
  'metrics': {'validation': {'PRECISION': 0.1134768,
    'RECALL': 0.3096767,
    'NDCG': 0.2372417,
    'MAP': 0.1613923,
    'COVERAGE_ITEM': 0.3219192,
    'DIVERSITY_GINI': 0.1700835,
    'DIVERSITY_MEAN_INTER_LIST': 0.9885168},
   'test': {5: {'PRECISION': 0.1706534,
     'RECALL': 0.1522832,
     'NDCG': 0.1596466,
     'MAP': 0.1387608,
     'COVERAGE_ITEM': 0.7832542,
     'DIVERSITY_GINI': 0.3452774,
     'DIVERSITY_MEAN_INTER_LIST': 0.9949173},
    10: {'PRECISION': 0.1387228,
     'RECALL': 0.234395,
     'NDCG': 0.2048676,
     'MAP': 0.1332815,
     'COVERAGE_ITEM': 0.9207245,
     'DIVERSITY_GINI': 0.4310629,
     'DIVERSITY_MEAN_INTER_LIST': 0.9925847},
    20: {'PRECISION': 0.1065707,
     'RECALL': 0.3357168,
     'NDCG': 0.250566,
     'MAP': 0.1397936,
     

In [36]:
import json, os
# Rebinds `results_json` to a CiteULike-only bundle and writes it to
# 'results_wpopular.json'.
# NOTE(review): both baseline lists contain an 'ItemKNN CBF' entry and
# 'TFIDF 40%/60%/80%/95%' entries, so the concatenated list has repeated
# 'name' values that differ only in 'path'.
# NOTE: integer cutoff keys of the 'test' dicts are written as strings by json.
results_json = {
    'cite': {
        'baseline': cite_baselines + cite_baselines_artyom,
        # 'knn': cite_experiments_knn,
        'svd': cite_experiments_svd,
        # 'rp3b': cite_experiments_rp3b,
    }
}
with open('results_wpopular.json', 'w') as fout:
    json.dump(results_json, fout)

In [None]:
results_json

In [None]:
def style_negative(v, props=''):
    """Return the CSS colour declaration for ``v``.

    Negative values give 'color:red;'; every other value (including zero)
    gives 'color:green'. ``props`` is accepted but not used.
    """
    if v < 0:
        return 'color:red;'
    return 'color:green'

In [None]:
# Build the "Reported" tables for Xing from 'xing_paper.csv'.
# NOTE(review): `pd`, `metrics`, `metrics_rel` and `xing_selections_from_authors`
# are not defined in any cell visible in this section, so this cell depends on
# state created elsewhere.
df_xing_paper_orig = pd.read_csv('xing_paper.csv', index_col=0)
df_xing_paper = df_xing_paper_orig.copy()
# From the second index label onward, replace each metric by its relative
# change with respect to the 'ItemKNN CBF' row (value / baseline - 1).
# presumably 'ItemKNN CBF' is the first row of the CSV — TODO confirm.
df_xing_paper.loc[df_xing_paper.index[1]:, metrics] = (
    df_xing_paper.loc[df_xing_paper.index[1]:, metrics] / df_xing_paper.loc['ItemKNN CBF', metrics] - 1
)
# First row without its last two columns.
# NOTE(review): this and the `.iloc[1:]` result below are slices that are
# modified afterwards; pandas may emit SettingWithCopyWarning — consider `.copy()`.
df_xing_paper_base = df_xing_paper.iloc[:1, :-2]
df_xing_paper_base.insert(0, 'Source', 'Reported')
display(df_xing_paper_base)
df_xing_paper = df_xing_paper.iloc[1:]
# Relative changes expressed in percent, rounded to one decimal.
df_xing_paper[metrics] = (df_xing_paper[metrics] * 100).astype(float).round(1)
df_xing_paper = df_xing_paper.rename(columns=dict(zip(metrics, metrics_rel)))
df_xing_paper.insert(0, 'Selected, %', '-')
df_xing_paper.insert(0, 'Source', 'Reported')
# 'Selected, %' for the four CQFS ItemKNN rows: rounded mean of the
# corresponding selection entry, times 100.
df_xing_paper.loc[['CQFS ItemKNN 40%', 'CQFS ItemKNN 60%', 'CQFS ItemKNN 80%', 'CQFS ItemKNN 95%'], 'Selected, %'] = [
    int(np.round(np.mean(xing_selections_from_authors['a1b0001s1000p040']) * 100)),
    int(np.round(np.mean(xing_selections_from_authors['a1b00001s10p060']) * 100)),
    int(np.round(np.mean(xing_selections_from_authors['a1b0001s100p080']) * 100)),
    int(np.round(np.mean(xing_selections_from_authors['a1b00001s1000p095']) * 100)),
]
display(df_xing_paper)


df_xing_paper_all = df_xing_paper_orig.copy()
df_xing_paper_all.insert(0, 'Selected, %', '-')
df_xing_paper_all.insert(0, 'Source', 'Reported')
df_xing_paper_all.loc[['CQFS ItemKNN 40%', 'CQFS ItemKNN 60%', 'CQFS ItemKNN 80%', 'CQFS ItemKNN 95%'], 'Selected, %'] = [
    int(np.round(np.mean(xing_selections_from_authors['a1b0001s1000p040']) * 100)),
    int(np.round(np.mean(xing_selections_from_authors['a1b00001s10p060']) * 100)),
    int(np.round(np.mean(xing_selections_from_authors['a1b0001s100p080']) * 100)),
    int(np.round(np.mean(xing_selections_from_authors['a1b00001s1000p095']) * 100)),
]
df_xing_paper_all

In [None]:
df_xing_baseline_test = parse_test_results(xing_baselines)
df_xing_cqfstt_test = parse_test_results(xing_experiments_knn)
df_xing_test = pd.concat([
    df_xing_baseline_test[df_xing_baseline_test['@n'] == 10],
    df_xing_cqfstt_test[df_xing_cqfstt_test['@n'] == 10],
]).drop(columns=['@n', 'a'])
df_xing_test.loc[df_xing_test.index[1]:, metrics] = (
    df_xing_test.loc[df_xing_test.index[1]:, metrics] / df_xing_test.loc['ItemKNN CBF', metrics] - 1
)
display(df_xing_test)
# df_xing_test_base = df_xing_test.iloc[:1, 1:-2]
# df_xing_test_base.insert(0, 'Source', 'Replicated')
# display(df_xing_test_base)
# df_xing_test = df_xing_test.iloc[1:]
# df_xing_test[metrics] = (df_xing_test[metrics] * 100).astype(float).round(1)
# df_xing_test = df_xing_test.rename(columns={metric: metric + ', %' for metric in metrics})
# df_xing_test.insert(0, 'Source', 'Replicated')
# display(df_xing_test)
# df_xing_test.to_csv('xing_test_paper_params.csv')

df_xing_baseline_test_all = parse_test_results(xing_baselines)
df_xing_cqfstt_test_all = parse_test_results(xing_experiments_knn)
df_xing_test_all = pd.concat([
    df_xing_baseline_test_all[df_xing_baseline_test_all['@n'] == 10],
    df_xing_cqfstt_test_all[df_xing_cqfstt_test_all['@n'] == 10],
]).drop(columns=['@n', 'a'])
df_xing_test_all.insert(0, 'Source', 'Reproduced')
df_xing_test_all

In [None]:
df_xing_baseline_test = parse_test_results(xing_baselines)
df_xing_cqfstt_knn_validation = parse_validation_results(xing_experiments_knn)
df_xing_cqfstt_svd_validation = parse_validation_results(xing_experiments_svd)
df_xing_cqfstt_rp3b_validation = parse_validation_results(xing_experiments_rp3b)
df_xing_cqfstt_knn_test = parse_test_results(xing_experiments_knn)
df_xing_cqfstt_svd_test = parse_test_results(xing_experiments_svd)
df_xing_cqfstt_rp3b_test = parse_test_results(xing_experiments_rp3b)

In [None]:
df_xing_test_orig

In [None]:
df_xing_test_orig.sort_values(by='Precision', ascending=False)

In [None]:
# Build the "Replicated" Xing tables at cutoff 10 from the parsed logs.
# NOTE(review): `parse_test_results`, `parse_validation_results`, `pd`,
# `metrics`, `xing_experiments_knn` and `xing_experiments_rp3b` are not defined
# in any cell visible in this section (only `xing_experiments_svd` is).
# NOTE(review): the first seven assignments repeat an earlier cell verbatim.
df_xing_baseline_test = parse_test_results(xing_baselines)
df_xing_cqfstt_knn_validation = parse_validation_results(xing_experiments_knn)
df_xing_cqfstt_svd_validation = parse_validation_results(xing_experiments_svd)
df_xing_cqfstt_rp3b_validation = parse_validation_results(xing_experiments_rp3b)
df_xing_cqfstt_knn_test = parse_test_results(xing_experiments_knn)
df_xing_cqfstt_svd_test = parse_test_results(xing_experiments_svd)
df_xing_cqfstt_rp3b_test = parse_test_results(xing_experiments_rp3b)
# Keep only the rows with '@n' == 10, then drop the '@n' and 'a' columns.
df_xing_test_orig = pd.concat([
    df_xing_baseline_test[df_xing_baseline_test['@n'] == 10],
    df_xing_cqfstt_knn_test[df_xing_cqfstt_knn_test['@n'] == 10],
    df_xing_cqfstt_svd_test[df_xing_cqfstt_svd_test['@n'] == 10],
    df_xing_cqfstt_rp3b_test[df_xing_cqfstt_rp3b_test['@n'] == 10],
]).drop(columns=['@n', 'a'])
df_xing_test = df_xing_test_orig.copy()
# From the second index label onward, replace each metric by its relative
# change with respect to the 'ItemKNN CBF' row (value / baseline - 1).
df_xing_test.loc[df_xing_test.index[1]:, metrics] = (
    df_xing_test.loc[df_xing_test.index[1]:, metrics] / df_xing_test.loc['ItemKNN CBF', metrics] - 1
)
# NOTE(review): the two `.iloc` results below are slices that are modified
# afterwards; pandas may emit SettingWithCopyWarning — consider `.copy()`.
df_xing_test_base = df_xing_test.iloc[:1, 1:-2]
df_xing_test_base.insert(0, 'Source', 'Replicated')
display(df_xing_test_base)
df_xing_test = df_xing_test.iloc[1:]
# Relative changes expressed in percent, rounded to one decimal.
df_xing_test[metrics] = (df_xing_test[metrics] * 100).astype(float).round(1)
df_xing_test = df_xing_test.rename(columns={metric: metric + ', %' for metric in metrics})
# 'beta' and 's' are turned into strings here; `join_test_validation` casts
# them back to numbers before merging.
df_xing_test[['beta', 's']] = df_xing_test[['beta', 's']].astype(str)
df_xing_test.insert(0, 'Source', 'Replicated')
display(df_xing_test)
# df_xing_test.to_csv('xing_test_paper_params.csv')


df_xing_baseline_test_all = parse_test_results(xing_baselines)
df_xing_cqfstt_knn_test_all = parse_test_results(xing_experiments_knn)
df_xing_cqfstt_svd_test_all = parse_test_results(xing_experiments_svd)
df_xing_cqfstt_rp3b_test_all = parse_test_results(xing_experiments_rp3b)

df_xing_test_all = pd.concat([
    df_xing_baseline_test_all[df_xing_baseline_test_all['@n'] == 10],
    df_xing_cqfstt_knn_test_all[df_xing_cqfstt_knn_test_all['@n'] == 10],
    df_xing_cqfstt_svd_test_all[df_xing_cqfstt_svd_test_all['@n'] == 10],
    df_xing_cqfstt_rp3b_test_all[df_xing_cqfstt_rp3b_test_all['@n'] == 10],
]).drop(columns=['@n', 'a'])
df_xing_test_all.insert(0, 'Source', 'Reproduced')
df_xing_test_all

In [None]:
df_xing_cqfstt_knn_validation_best = get_validation_best(df_xing_cqfstt_knn_validation, metric='Precision')
df_xing_cqfstt_svd_validation_best = get_validation_best(df_xing_cqfstt_svd_validation, metric='Precision')
df_xing_cqfstt_rp3b_validation_best = get_validation_best(df_xing_cqfstt_rp3b_validation, metric='Precision')

df_xing_cqfstt_knn_test_best = join_test_validation(
    df_xing_cqfstt_knn_validation_best,
    df_xing_test,
)
df_xing_cqfstt_svd_test_best = join_test_validation(
    df_xing_cqfstt_svd_validation_best,
    df_xing_test,
)
df_xing_cqfstt_rp3b_test_best = join_test_validation(
    df_xing_cqfstt_rp3b_validation_best,
    df_xing_test,
)
pd.concat([df_xing_cqfstt_knn_test_best, df_xing_cqfstt_svd_test_best, df_xing_cqfstt_rp3b_test_best])

In [None]:
df = pd.concat([
    df_xing_paper_all.set_index(['beta', 's', 'Source'], append=True),
    df_xing_test_all.set_index(['beta', 's', 'Source'], append=True),
]).sort_index(ascending=[False, False, False, True])
display(df)
df.to_csv('xing_complete.csv')

In [None]:
display(
    pd.concat([
        df_xing_paper_base,
        df_xing_test_base,
    ]).set_index(['Source'], append=True)
)

df_xing_paper_test_merged = pd.concat([
    df_xing_paper.set_index(['beta', 's', 'Source'], append=True),
    df_xing_test.set_index(['beta', 's', 'Source'], append=True),
])
df_xing_paper_test_merged = df_xing_paper_test_merged.sort_index().sort_index(ascending=[True, False, False, False])
df_xing_paper_test_merged

In [None]:
# NOTE(review): this cell is an exact duplicate of the previous cell.
# Show the reported and replicated baseline rows together, indexed additionally
# by 'Source'.
display(
    pd.concat([
        df_xing_paper_base,
        df_xing_test_base,
    ]).set_index(['Source'], append=True)
)

# Stack the reported and replicated tables on a 4-level index
# (row label, 'beta', 's', 'Source').
df_xing_paper_test_merged = pd.concat([
    df_xing_paper.set_index(['beta', 's', 'Source'], append=True),
    df_xing_test.set_index(['beta', 's', 'Source'], append=True),
])
# The result of the first sort_index() is immediately re-sorted with
# per-level directions: first level ascending, the other three descending.
df_xing_paper_test_merged = df_xing_paper_test_merged.sort_index().sort_index(ascending=[True, False, False, False])
df_xing_paper_test_merged

In [None]:
df_tmd_paper_orig = pd.read_csv('tmd_paper.csv', index_col=0)
df_tmd_paper = df_tmd_paper_orig.copy()
df_tmd_paper.loc[df_tmd_paper.index[1]:, metrics] = (
    df_tmd_paper.loc[df_tmd_paper.index[1]:, metrics] / df_tmd_paper.loc['ItemKNN CBF', metrics] - 1
)
df_tmd_paper_base = df_tmd_paper.iloc[:1, :-2]
df_tmd_paper_base.insert(0, 'Source', 'Reported')
display(df_tmd_paper_base)
df_tmd_paper = df_tmd_paper.iloc[1:]
df_tmd_paper[metrics] = (df_tmd_paper[metrics] * 100).astype(float).round(1)
df_tmd_paper = df_tmd_paper.rename(columns=dict(zip(metrics, metrics_rel)))
df_tmd_paper.insert(0, 'Selected, %', '-')
df_tmd_paper.insert(0, 'Source', 'Reported')
df_tmd_paper.loc[
    [
        'CQFS ItemKNN 20%', 'CQFS ItemKNN 30%', 'CQFS ItemKNN 40%', 'CQFS ItemKNN 60%', 'CQFS ItemKNN 80%', 'CQFS ItemKNN 95%',
        'CQFS PureSVD 20%', 'CQFS PureSVD 30%', 'CQFS PureSVD 40%', 'CQFS PureSVD 60%', 'CQFS PureSVD 80%', 'CQFS PureSVD 95%',
        'CQFS RP3Beta 20%', 'CQFS RP3Beta 30%', 'CQFS RP3Beta 40%', 'CQFS RP3Beta 60%', 'CQFS RP3Beta 80%', 'CQFS RP3Beta 95%',
    ],
    'Selected, %'
] = '-'
# ] = [
#     int(np.round(np.mean(tmd_selections_from_authors['a1b0001s1000p040']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b00001s10p060']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b0001s100p080']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b00001s1000p095']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b0001s1000p040']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b00001s10p060']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b0001s100p080']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b00001s1000p095']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b0001s1000p040']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b00001s10p060']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b0001s100p080']) * 100)),
#     int(np.round(np.mean(tmd_selections_from_authors['a1b00001s1000p095']) * 100)),
# ]
display(df_tmd_paper)


df_tmd_paper_all = df_tmd_paper_orig.copy()
df_tmd_paper_all.insert(0, 'Selected, %', '-')
df_tmd_paper_all.insert(0, 'Source', 'Reported')
df_tmd_paper_all.loc[
    [
        'CQFS ItemKNN 20%', 'CQFS ItemKNN 30%', 'CQFS ItemKNN 40%', 'CQFS ItemKNN 60%', 'CQFS ItemKNN 80%', 'CQFS ItemKNN 95%',
        'CQFS PureSVD 20%', 'CQFS PureSVD 30%', 'CQFS PureSVD 40%', 'CQFS PureSVD 60%', 'CQFS PureSVD 80%', 'CQFS PureSVD 95%',
        'CQFS RP3Beta 20%', 'CQFS RP3Beta 30%', 'CQFS RP3Beta 40%', 'CQFS RP3Beta 60%', 'CQFS RP3Beta 80%', 'CQFS RP3Beta 95%',
    ],
    'Selected, %'
] = '-'
# df_tmd_paper_all

In [None]:
df_tmd_baseline_test = parse_test_results(tmd_baselines)
df_tmd_cqfstt_knn_validation = parse_validation_results(tmd_experiments_knn)
df_tmd_cqfstt_svd_validation = parse_validation_results(tmd_experiments_svd)
df_tmd_cqfstt_rp3b_validation = parse_validation_results(tmd_experiments_rp3b)
df_tmd_cqfstt_knn_test = parse_test_results(tmd_experiments_knn)
df_tmd_cqfstt_svd_test = parse_test_results(tmd_experiments_svd)
df_tmd_cqfstt_rp3b_test = parse_test_results(tmd_experiments_rp3b)
df_tmd_test_orig = pd.concat([
    df_tmd_baseline_test[df_tmd_baseline_test['@n'] == 10],
    df_tmd_cqfstt_knn_test[df_tmd_cqfstt_knn_test['@n'] == 10],
    df_tmd_cqfstt_svd_test[df_tmd_cqfstt_svd_test['@n'] == 10],
    df_tmd_cqfstt_rp3b_test[df_tmd_cqfstt_rp3b_test['@n'] == 10],
]).drop(columns=['@n', 'a'])
df_tmd_test = df_tmd_test_orig.copy()
df_tmd_test.loc[df_tmd_test.index[1]:, metrics] = (
    df_tmd_test.loc[df_tmd_test.index[1]:, metrics] / df_tmd_test.loc['ItemKNN CBF', metrics] - 1
)
df_tmd_test_base = df_tmd_test.iloc[:1, 1:-2]
df_tmd_test_base.insert(0, 'Source', 'Replicated')
display(df_tmd_test_base)
df_tmd_test = df_tmd_test.iloc[1:]
df_tmd_test[metrics] = (df_tmd_test[metrics] * 100).astype(float).round(1)
df_tmd_test = df_tmd_test.rename(columns={metric: metric + ', %' for metric in metrics})
df_tmd_test[['beta', 's']] = df_tmd_test[['beta', 's']].astype(str)
df_tmd_test.insert(0, 'Source', 'Replicated')
display(df_tmd_test)
# df_tmd_test.to_csv('tmd_test_paper_params.csv')


df_tmd_baseline_test_all = parse_test_results(tmd_baselines)
df_tmd_cqfstt_knn_test_all = parse_test_results(tmd_experiments_knn)
df_tmd_cqfstt_svd_test_all = parse_test_results(tmd_experiments_svd)
df_tmd_cqfstt_rp3b_test_all = parse_test_results(tmd_experiments_rp3b)

df_tmd_test_all = pd.concat([
    df_tmd_baseline_test_all[df_tmd_baseline_test_all['@n'] == 10],
    df_tmd_cqfstt_knn_test_all[df_tmd_cqfstt_knn_test_all['@n'] == 10],
    df_tmd_cqfstt_svd_test_all[df_tmd_cqfstt_svd_test_all['@n'] == 10],
    df_tmd_cqfstt_rp3b_test_all[df_tmd_cqfstt_rp3b_test_all['@n'] == 10],
]).drop(columns=['@n', 'a'])
df_tmd_test_all.insert(0, 'Source', 'Reproduced')
df_tmd_test_all

In [None]:
df_tmd_cqfstt_knn_validation_best = get_validation_best(df_tmd_cqfstt_knn_validation, metric='Precision')
df_tmd_cqfstt_svd_validation_best = get_validation_best(df_tmd_cqfstt_svd_validation, metric='Precision')
df_tmd_cqfstt_rp3b_validation_best = get_validation_best(df_tmd_cqfstt_rp3b_validation, metric='Precision')

df_tmd_cqfstt_knn_test_best = join_test_validation(
    df_tmd_cqfstt_knn_validation_best,
    df_tmd_test,
)
df_tmd_cqfstt_svd_test_best = join_test_validation(
    df_tmd_cqfstt_svd_validation_best,
    df_tmd_test,
)
df_tmd_cqfstt_rp3b_test_best = join_test_validation(
    df_tmd_cqfstt_rp3b_validation_best,
    df_tmd_test,
)
pd.concat([df_tmd_cqfstt_knn_test_best, df_tmd_cqfstt_svd_test_best, df_tmd_cqfstt_rp3b_test_best])

In [None]:
df = pd.concat([
    df_tmd_paper_all.set_index(['beta', 's', 'Source'], append=True),
    df_tmd_test_all.set_index(['beta', 's', 'Source'], append=True),
]).sort_index(ascending=[False, False, False, True])
display(df)
df.to_csv('tmd_complete.csv')

In [None]:
display(
    pd.concat([
        df_tmd_paper_base,
        df_tmd_test_base,
    ]).set_index(['Source'], append=True)
)

df_tmd_paper_test_merged = pd.concat([
    df_tmd_paper.set_index(['beta', 's', 'Source'], append=True),
    df_tmd_test.set_index(['beta', 's', 'Source'], append=True),
])
# df_tmd_paper_test_merged.sort_index().iloc[25:].index
df_tmd_paper_test_merged = df_tmd_paper_test_merged.sort_index(ascending=[True, False, False, False])
df_tmd_paper_test_merged

In [None]:
df_cite_paper_orig = pd.read_csv('cite_paper.csv', index_col=0)
df_cite_paper = df_cite_paper_orig.copy()
df_cite_paper.loc[df_cite_paper.index[1]:, metrics] = (
    df_cite_paper.loc[df_cite_paper.index[1]:, metrics] / df_cite_paper.loc['ItemKNN CBF', metrics] - 1
)
df_cite_paper_base = df_cite_paper.iloc[:1, :-2]
df_cite_paper_base.insert(0, 'Source', 'Reported')
display(df_cite_paper_base)
df_cite_paper = df_cite_paper.iloc[1:]
df_cite_paper[metrics] = (df_cite_paper[metrics] * 100).astype(float).round(1)
df_cite_paper = df_cite_paper.rename(columns=dict(zip(metrics, metrics_rel)))
df_cite_paper.insert(0, 'Selected, %', '-')
df_cite_paper.insert(0, 'Source', 'Reported')
df_cite_paper.loc[
    ['CQFS ItemKNN 40%', 'CQFS ItemKNN 60%', 'CQFS ItemKNN 80%', 'CQFS ItemKNN 95%'],
    'Selected, %'
] = '-'
# ] = [
#     int(np.round(np.mean(cite_selections_from_authors['a1b0001s1000p040']) * 100)),
#     int(np.round(np.mean(cite_selections_from_authors['a1b00001s10p060']) * 100)),
#     int(np.round(np.mean(cite_selections_from_authors['a1b0001s100p080']) * 100)),
#     int(np.round(np.mean(cite_selections_from_authors['a1b00001s1000p095']) * 100)),
# ]
display(df_cite_paper)


df_cite_paper_all = df_cite_paper_orig.copy()
df_cite_paper_all.insert(0, 'Selected, %', '-')
df_cite_paper_all.insert(0, 'Source', 'Reported')
df_cite_paper_all.loc[
    ['CQFS ItemKNN 40%', 'CQFS ItemKNN 60%', 'CQFS ItemKNN 80%', 'CQFS ItemKNN 95%'],
    'Selected, %'
] = '-'
df_cite_paper_all

In [None]:
df_cite_baseline_test = parse_test_results(cite_baselines)
df_cite_cqfstt_test_knn = parse_test_results(cite_experiments_knn)
df_cite_cqfstt_test_svd = parse_test_results(cite_experiments_svd)
df_cite_cqfstt_test_rp3b = parse_test_results(cite_experiments_rp3b)

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
def get_validation_best(df_cqfstt, metric):
    """Keep, for every index label, the rows whose `metric` equals that label's maximum.

    Args:
        df_cqfstt: DataFrame whose index identifies the group (rows sharing an index
            label are compared with each other).
        metric: name of the column to maximise.

    Returns:
        The subset of `df_cqfstt` rows attaining the per-label maximum; ties are all kept.
    """
    # The string alias 'max' selects pandas' own group-wise maximum; passing the builtin
    # `max` emits a FutureWarning on recent pandas versions.
    best_per_label = df_cqfstt.groupby(df_cqfstt.index)[metric].transform('max')
    return df_cqfstt[best_per_label == df_cqfstt[metric]]

def join_test_validation(df_vali, df_test):
    """Select the test rows whose (index label, beta, s) also appear in `df_vali`.

    Both frames are expected to be indexed by algorithm name and to carry 'beta' and 's'
    columns. In `df_test` the placeholder '-' in those two columns is replaced by 0
    before the columns are cast to float64 / int64. Neither input is modified.

    Args:
        df_vali: validation rows to keep (typically the per-algorithm best ones).
        df_test: test rows to filter.

    Returns:
        DataFrame with the columns of `df_test`, indexed by algorithm name, containing
        only the rows matched by an inner merge on algorithm, beta and s.
    """
    df_vali = df_vali.copy()
    df_test = df_test.copy()
    df_vali['alg'] = df_vali.index
    df_test['alg'] = df_test.index
    df_vali = df_vali.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    for column, dtype in (('beta', np.float64), ('s', np.int64)):
        df_test.loc[df_test[column] == '-', column] = 0
        # Rebind the column: `df.loc[:, col] = ...` writes into the existing (object)
        # column on pandas >= 2.0, so the cast would otherwise be lost.
        df_test[column] = df_test[column].astype(dtype)

    merged = pd.merge(df_test, df_vali, on=['alg', 'beta', 's'], suffixes=('', '_vali'))
    merged.index = list(merged['alg'])
    # Drop the helper column and every '*_vali' column brought in by the merge.
    return merged.loc[:, [col for col in df_test.columns if col != 'alg']]

In [None]:
# Parse test and validation metrics for the baselines and the three experiment lists.
df_cite_baseline_test = parse_test_results(cite_baselines)
df_cite_cqfstt_knn_test = parse_test_results(cite_experiments_knn)
df_cite_cqfstt_svd_test = parse_test_results(cite_experiments_svd)
df_cite_cqfstt_rp3b_test = parse_test_results(cite_experiments_rp3b)
df_cite_cqfstt_knn_validation = parse_validation_results(cite_experiments_knn)
df_cite_cqfstt_svd_validation = parse_validation_results(cite_experiments_svd)
df_cite_cqfstt_rp3b_validation = parse_validation_results(cite_experiments_rp3b)

# Keep only the cutoff-10 rows and drop the cutoff / alpha columns.
df_cite_test_orig = pd.concat([
    df_cite_baseline_test[df_cite_baseline_test['@n'] == 10],
    df_cite_cqfstt_knn_test[df_cite_cqfstt_knn_test['@n'] == 10],
    df_cite_cqfstt_svd_test[df_cite_cqfstt_svd_test['@n'] == 10],
    df_cite_cqfstt_rp3b_test[df_cite_cqfstt_rp3b_test['@n'] == 10],
]).drop(columns=['@n', 'a'])

# Express every row after the first relative to the 'ItemKNN CBF' baseline row.
df_cite_test = df_cite_test_orig.copy()
df_cite_test.loc[df_cite_test.index[1]:, metrics] = (
    df_cite_test.loc[df_cite_test.index[1]:, metrics] / df_cite_test.loc['ItemKNN CBF', metrics] - 1
)

# Explicit copies: both frames are modified below, which on a bare iloc slice triggers
# SettingWithCopyWarning and may or may not write through.
df_cite_test_base = df_cite_test.iloc[:1, 1:-2].copy()
df_cite_test_base.insert(0, 'Source', 'Replicated')
display(df_cite_test_base)

df_cite_test = df_cite_test.iloc[1:].copy()
df_cite_test[metrics] = (df_cite_test[metrics] * 100).astype(float).round(1)
df_cite_test = df_cite_test.rename(columns={metric: metric + ', %' for metric in metrics})
df_cite_test.insert(0, 'Source', 'Replicated')
display(df_cite_test)
# df_cite_test.to_csv('cite_test_paper_params.csv')


# Absolute (not relative) cutoff-10 test metrics for all runs.
df_cite_baseline_test_all = parse_test_results(cite_baselines)
df_cite_cqfstt_knn_test_all = parse_test_results(cite_experiments_knn)
df_cite_cqfstt_svd_test_all = parse_test_results(cite_experiments_svd)
df_cite_cqfstt_rp3b_test_all = parse_test_results(cite_experiments_rp3b)

df_cite_test_all = pd.concat([
    df_cite_baseline_test_all[df_cite_baseline_test_all['@n'] == 10],
    df_cite_cqfstt_knn_test_all[df_cite_cqfstt_knn_test_all['@n'] == 10],
    df_cite_cqfstt_svd_test_all[df_cite_cqfstt_svd_test_all['@n'] == 10],
    df_cite_cqfstt_rp3b_test_all[df_cite_cqfstt_rp3b_test_all['@n'] == 10],
]).drop(columns=['@n', 'a'])
# NOTE(review): this table is labelled 'Reproduced' while the relative tables above use
# 'Replicated' -- confirm which label is intended before merging them.
df_cite_test_all.insert(0, 'Source', 'Reproduced')
df_cite_test_all

In [None]:
# For each experiment list, keep the validation rows with the highest Precision per
# index label ...
df_cite_cqfstt_knn_validation_best = get_validation_best(df_cite_cqfstt_knn_validation, metric='Precision')
df_cite_cqfstt_svd_validation_best = get_validation_best(df_cite_cqfstt_svd_validation, metric='Precision')
df_cite_cqfstt_rp3b_validation_best = get_validation_best(df_cite_cqfstt_rp3b_validation, metric='Precision')

# ... and pick the test rows of df_cite_test that share (algorithm, beta, s) with them.
df_cite_cqfstt_knn_test_best = join_test_validation(
    df_cite_cqfstt_knn_validation_best,
    df_cite_test,
)
df_cite_cqfstt_svd_test_best = join_test_validation(
    df_cite_cqfstt_svd_validation_best,
    df_cite_test,
)
df_cite_cqfstt_rp3b_test_best = join_test_validation(
    df_cite_cqfstt_rp3b_validation_best,
    df_cite_test,
)
# Display the three selections stacked in one table.
pd.concat([df_cite_cqfstt_knn_test_best, df_cite_cqfstt_svd_test_best, df_cite_cqfstt_rp3b_test_best])

In [None]:
# Build a GroupBy keyed on every column except 'beta' and 's'; the object is only displayed.
# NOTE(review): `df` is assigned in the following cell, so this relies on earlier kernel
# state, and set() ordering makes the order of the grouping keys arbitrary.
df.groupby(by=list(set(df.columns).difference(['beta', 's'])))

In [None]:
# Stack reported and reproduced absolute results; (algorithm, beta, s, Source) becomes a
# four-level index, sorted with a per-level direction, then displayed and written to CSV.
df = pd.concat([
    df_cite_paper_all.set_index(['beta', 's', 'Source'], append=True),
    df_cite_test_all.set_index(['beta', 's', 'Source'], append=True),
]).sort_index(ascending=[False, False, False, True])
display(df)
df.to_csv('cite_complete.csv')

In [None]:
# Baseline rows (reported vs. replicated) side by side.
display(
    pd.concat([
        df_cite_paper_base,
        df_cite_test_base,
    ]).set_index(['Source'], append=True)
)

# Relative-change rows (reported vs. replicated) under a four-level index.
df_cite_paper_test_merged = pd.concat([
    df_cite_paper.set_index(['beta', 's', 'Source'], append=True),
    df_cite_test.set_index(['beta', 's', 'Source'], append=True),
])
# NOTE(review): the sorted frame is only displayed, not assigned back, and the first
# sort_index() is immediately followed by a second sort with explicit directions.
df_cite_paper_test_merged.sort_index().sort_index(ascending=[True, False, False, False])

In [None]:
import pandas as pd

def parse_test_results(experiments):
    """Flatten the per-cutoff test metrics of several experiments into one DataFrame.

    Args:
        experiments: iterable of dicts with a 'name', a 'metrics' -> 'test' mapping of
            ``cutoff -> {METRIC_NAME: value}``, and optionally 'alpha', 'beta', 's' and
            '%' (each reported as '-' when missing).

    Returns:
        DataFrame with one row per (experiment, cutoff), indexed by experiment name.
        Columns are 'Selected, %', the metric columns (renamed to their display names
        where a mapping exists), '@n' (the cutoff), 'a', 'beta' and 's'. The seven mapped
        metric columns are cast to float. An empty frame with those columns is returned
        when there is nothing to parse.
    """
    map_columns = {
        'PRECISION': 'Precision',
        'RECALL': 'Recall',
        'NDCG': 'NDCG',
        'MAP': 'MAP',
        'COVERAGE_ITEM': 'I. Cov.',
        'DIVERSITY_GINI': 'Gini',
        'DIVERSITY_MEAN_INTER_LIST': 'MIL',
    }
    metrics = list(map_columns.values())

    # Collect plain records and build the frame once instead of concatenating one small
    # DataFrame per experiment.
    rows = []
    names = []
    for experiment in experiments:
        for cutoff, values in experiment['metrics']['test'].items():
            row = {'Selected, %': experiment.get('%', '-')}
            # Metric names without a display mapping (e.g. COVERAGE_USER) are kept as-is.
            row.update((map_columns.get(key, key), value) for key, value in values.items())
            row['@n'] = cutoff
            row['a'] = experiment.get('alpha', '-')
            row['beta'] = experiment.get('beta', '-')
            row['s'] = experiment.get('s', '-')
            rows.append(row)
            names.append(experiment['name'])

    if not rows:
        return pd.DataFrame(columns=['Selected, %', *metrics, '@n', 'a', 'beta', 's'])

    df = pd.DataFrame(rows, index=names)
    df[metrics] = df[metrics].astype(float)
    return df

def parse_validation_results(experiments):
    """Collect the validation metrics of several experiments into one DataFrame.

    Args:
        experiments: iterable of dicts with a 'name', a 'metrics' -> 'validation' mapping
            of ``METRIC_NAME -> value``, and optionally 'alpha', 'beta', 's' and '%'
            (each reported as '-' when missing).

    Returns:
        DataFrame with one row per experiment, indexed by experiment name. Columns are
        'Selected, %', the metric columns (renamed to their display names where a
        mapping exists), 'a', 'beta' and 's'. The seven mapped metric columns are cast
        to float. An empty frame with those columns is returned for empty input.
    """
    map_columns = {
        'PRECISION': 'Precision',
        'RECALL': 'Recall',
        'NDCG': 'NDCG',
        'MAP': 'MAP',
        'COVERAGE_ITEM': 'I. Cov.',
        'DIVERSITY_GINI': 'Gini',
        'DIVERSITY_MEAN_INTER_LIST': 'MIL',
    }
    metrics = list(map_columns.values())

    # One record per experiment; the frame is built (and cast) once at the end.
    rows = []
    names = []
    for experiment in experiments:
        row = {'Selected, %': experiment.get('%', '-')}
        # Metric names without a display mapping (e.g. COVERAGE_USER) are kept as-is.
        row.update(
            (map_columns.get(key, key), value)
            for key, value in experiment['metrics']['validation'].items()
        )
        row['a'] = experiment.get('alpha', '-')
        row['beta'] = experiment.get('beta', '-')
        row['s'] = experiment.get('s', '-')
        rows.append(row)
        names.append(experiment['name'])

    if not rows:
        return pd.DataFrame(columns=['Selected, %', *metrics, 'a', 'beta', 's'])

    df = pd.DataFrame(rows, index=names)
    df[metrics] = df[metrics].astype(float)
    return df

In [None]:
# Display names of the metric columns, and the matching names used for the columns that
# hold relative changes in percent.
metrics = ['Precision', 'Recall', 'NDCG', 'MAP', 'I. Cov.', 'Gini', 'MIL']
metrics_rel = [metric + ', %' for metric in metrics]

In [None]:
parse_validation_results(cite_experiments)

In [None]:
df_test = parse_test_results(cite_experiments)

In [None]:
df_test

In [None]:
df_validation = parse_validation_results(experiments)

In [None]:
df_validation.astype(str)

In [None]:
df_test[df_test['@n'] == 10].drop(columns=['@n'])

In [None]:
df_validation.loc[['ItemKNN CBF', 'CQFSTT 65%', 'Random 65%']]

In [None]:
df_test[df_test['@n']==10].loc[['ItemKNN CBF', 'CQFSTT 65%', 'Random 65%']].drop(columns='@n')

In [None]:
df_test[df_test['@n'] == 10]

In [None]:
np.round(10**np.linspace(-3, -2, 5, endpoint=False)[1:], 5)

In [None]:
import scipy as sp
import numpy as np
from scipy.stats import ortho_group

In [None]:
# Sample the quadratic form er^T A er, where er is the first basis vector multiplied by a
# random matrix drawn with ortho_group.rvs(3), for a diagonal A with entries (2, 2, -1).
A = np.array([[2,0, 0], [0, 2, 0], [0, 0, -1]])
e = np.zeros(3)
e[0] = 1.0

# NOTE(review): no random seed is set, so `vals` differs between runs.
vals = []
for i in range(10000):
    er = ortho_group.rvs(3) @ e
    vals.append(er.T @ A @ er)

In [None]:
import re
from ipypb import track
import scipy as sp
import scipy.sparse
from zipfile import ZipFile

def get_fpms(dr):
    """Load the 'FPM' array of every `a<digits>b<digits>` entry found in `dr`.

    Args:
        dr: directory whose entries are scanned.

    Returns:
        dict mapping each matching entry name to the 'FPM' array stored in
        `<dr>/<entry>/FPM.zip`.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^a\d+b\d+$')
    fpms = {}
    for file in os.listdir(dr):
        if not pattern.match(file):
            continue
        # Close the archive as soon as the array has been read.
        with np.load(os.path.join(dr, file, 'FPM.zip')) as archive:
            fpms[file] = archive['FPM']
    return fpms


def get_fpms_ke(dr):
    """Rebuild `FPM_K + beta * FPM_E` for every `a<digits>b<digits>...` entry in `dr`.

    Beta is decoded from the entry name. Elsewhere in this notebook the names are written
    as ``'a%sb%s' % (a, str(b).replace('.', ''))``, so the digits after 'b' are the
    decimal representation of beta with the point removed: the first digit is the integer
    part and the rest is the fraction ('0001' -> 0.001, '000206' -> 0.00206, '1' -> 1.0).
    Betas >= 10 or written in scientific notation cannot be recovered from such a name.

    Args:
        dr: directory containing the experiment entries plus 'FPM_K.zip' and 'FPM_E.zip'.

    Returns:
        dict mapping each matching entry name to its dense float64 FPM.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^a\d+b(\d+)')
    betas = {}
    for file in os.listdir(dr):
        match = pattern.match(file)
        if not match:
            continue
        digits = match.group(1)
        # Re-insert the decimal point that was stripped when the name was built. The
        # previous formula (mantissa * 10 ** number_of_leading_zeros) scaled in the wrong
        # direction, e.g. 'a1b0001' produced 1000 instead of 0.001.
        betas[file] = float(f'{digits[0]}.{digits[1:]}')

    dataIO = DataIO(dr)
    FPM_K = dataIO.load_data('FPM_K.zip')['FPM_K'].astype(np.float64)
    FPM_E = dataIO.load_data('FPM_E.zip')['FPM_E'].astype(np.float64)

    return {
        file: FPM_K + FPM_E * beta
        for file, beta in betas.items()
    }


def get_selections(dr):
    """Load the 'selection' array of every `a..b..s..p..` entry found in `dr`.

    Args:
        dr: directory whose entries are scanned.

    Returns:
        dict mapping each matching entry name to the 'selection' array stored in
        `<dr>/<entry>/cqfs_tt/cqfs_tt.zip`.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^a\d+b\d+s\d+p\d+$')
    selections = {}
    for file in os.listdir(dr):
        if not pattern.match(file):
            continue
        # Close the archive as soon as the array has been read.
        with np.load(os.path.join(dr, file, 'cqfs_tt', 'cqfs_tt.zip')) as archive:
            selections[file] = archive['selection']
    return selections


def get_selections_from_authors(dr, topology='qpu_pegasus'):
    """Load the 'selection' array stored under a given sub-folder of every experiment.

    Args:
        dr: directory whose entries are scanned for `a..b..s..p..` names.
        topology: name of the sub-folder (and of the zip file inside it) to read.

    Returns:
        dict mapping each matching entry name to the 'selection' array stored in
        `<dr>/<entry>/<topology>/<topology>.zip`.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^a\d+b\d+s\d+p\d+$')
    selections = {}
    for file in os.listdir(dr):
        if not pattern.match(file):
            continue
        # Close the archive as soon as the array has been read.
        with np.load(os.path.join(dr, file, topology, f'{ topology }.zip')) as archive:
            selections[file] = archive['selection']
    return selections


def get_timings(dr):
    """Read the stored response time of every `a..b..s..p..` entry found in `dr`.

    Args:
        dr: directory whose entries are scanned.

    Returns:
        dict mapping each matching entry name to the float parsed from the
        'response_time.json' member of `<dr>/<entry>/cqfs_tt/timings.zip`.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^a\d+b\d+s\d+p\d+$')
    timings = {}
    for file in os.listdir(dr):
        if not pattern.match(file):
            continue
        # Close the archive as soon as the value has been read.
        with np.load(os.path.join(dr, file, 'cqfs_tt', 'timings.zip')) as archive:
            timings[file] = float(archive['response_time.json'])
    return timings


def get_bqms_from_authors(dr):
    """Load the 'BQM' array of every `a..b..s..p..` entry found in `dr`.

    Args:
        dr: directory whose entries are scanned.

    Returns:
        dict mapping each matching entry name to the 'BQM' array stored in
        `<dr>/<entry>/BQM.zip`.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^a\d+b\d+s\d+p\d+$')
    bqms = {}
    for file in os.listdir(dr):
        if not pattern.match(file):
            continue
        # Close the archive as soon as the array has been read.
        with np.load(os.path.join(dr, file, 'BQM.zip')) as archive:
            bqms[file] = archive['BQM']
    return bqms


def get_fpms_from_authors(dr):
    """Load the sparse FPM of every `a..b..s..p..` entry in `dr` and densify it.

    Args:
        dr: directory whose entries are scanned.

    Returns:
        dict mapping each matching entry name to the dense form (``.todense()``) of the
        scipy sparse matrix stored as member 'FPM.npz' of `<dr>/<entry>/FPM.zip`.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'^a\d+b\d+s\d+p\d+$')
    ret = {}
    for file in os.listdir(dr):
        if not pattern.match(file):
            continue
        path = os.path.join(dr, file, 'FPM.zip')
        # The sparse matrix is an .npz nested inside the outer zip archive.
        with ZipFile(path) as fzip:
            with fzip.open('FPM.npz') as fin:
                ret[file] = sp.sparse.load_npz(fin).todense()
    return ret

In [None]:
np.mean(list(get_timings('/mnt/bulky/anikitin/Developer/CQFS/results/XingChallenge2017/ICM_all/ItemKNNCFRecommender/').values()))

In [None]:
xing_timings_knn = get_timings(os.path.join(dr_xing_results, 'ItemKNNCFRecommender'))
tmd_timings_knn = get_timings(os.path.join(dr_tmd_results, 'ItemKNNCFRecommender'))
cite_timings_knn = get_timings(os.path.join(dr_cite_results, 'ItemKNNCFRecommender'))

In [None]:
# dr_fpm_knn = os.path.join(dr_results, 'ItemKNNCFRecommender/')
# dr_tmd_results_knn = os.path.join(dr_tmd_results, 'ItemKNNCFRecommender/')
dr_cite_results_knn = os.path.join(dr_cite_results, 'ItemKNNCFRecommender/')

In [None]:
import tqdm

In [None]:
# For 100 exponents between 0 and -6, form FPM = a * FPM_K + 10**exponent * FPM_E and
# record which share of the total absolute mass of FPM is carried by positive entries.
frac_pos = []
a = 1
bs = np.linspace(0, -6, 100)
for val in tqdm.tqdm(bs):
    FPM = a * FPM_K + 10 ** val * FPM_E
    frac_pos.append(np.sum(FPM[FPM > 0]) / np.sum(np.abs(FPM)))
frac_pos = np.array(frac_pos)

In [None]:
FPM = a * FPM_K + 0 * FPM_E

In [None]:
np.sum(FPM > 0) / np.sum(FPM != 0), np.sum(FPM < 0) / np.sum(FPM != 0), 

In [None]:
plt.plot(10**bs, frac_pos)
plt.title('IPM = K + bE', fontsize=14)
plt.ylabel('sum(pos) / sum(abs(all))', fontsize=14)
plt.xlabel('b', fontsize=14)
plt.xscale('log')
plt.show()

In [None]:
dr_authors = os.path.join(dr_results, '../from_authors')

In [None]:
bqms_from_authors = get_bqms_from_authors(dr_authors)
selections_from_authors = get_selections_from_authors(dr_authors)

In [None]:
np.sum(fpms_knn['a1b0001'] > 0) / len(fpms_knn['a1b0001'])**2

In [None]:
bqm_authors_k = bqms_from_authors['a1b0001s1000p040']
fpm_ours = fpms_knn['a1b0001']

In [None]:
def reldiffnorm(x1, x2):
    """Return ||x1 - x2|| / ||x1|| using numpy's default norm."""
    difference_norm = np.linalg.norm(x1 - x2)
    reference_norm = np.linalg.norm(x1)
    return difference_norm / reference_norm

def bqm_from_fpm(fpm, *, k, s):
    """Turn a square FPM into a normalised upper-triangular BQM matrix.

    The off-diagonal part of `fpm` is folded into the upper triangle (entry (i, j) with
    i < j receives fpm[i, j] + fpm[j, i]). On top of that a penalty term is added:
    s * (1 - 2k) on the diagonal and 2s on every strictly-upper entry. The result is
    divided by its largest absolute entry.

    Args:
        fpm: square 2-D numpy array.
        k: keyword-only; enters the diagonal penalty as -2 * k * s.
        s: keyword-only; strength multiplying the penalty term.

    Returns:
        Float array of the same shape as `fpm`.
    """
    size = len(fpm)
    identity = np.eye(size)

    diagonal_part = np.diag(np.diag(fpm))
    off_diagonal = fpm - diagonal_part
    folded = diagonal_part + np.triu(off_diagonal + off_diagonal.T)

    penalised = folded - 2 * k * s * identity + s * (2 * np.triu(np.ones((size, size))) - identity)
    # Normalise by the largest magnitude among the entries.
    scale = max(-penalised.min(), penalised.max())
    penalised /= scale
    return penalised

In [None]:
import dimod
# Cross-check bqm_from_fpm against the BQM that dimod builds from the same FPM plus a
# k-combinations constraint of strength s, after dimod's own normalisation.
p = 0.4
s = 1000
vartype = 'BINARY'

fpm = fpms_knn['a1b0001']
F = len(fpm)
# NOTE(review): k = p * F is a float here -- confirm dimod accepts a non-integer k.
k = p * F

fpm_dimod = dimod.as_bqm(fpm, vartype)
bqm_dimod = dimod.generators.combinations(F, k=k, strength=s, vartype=vartype)
bqm_dimod = dimod.AdjVectorBQM(bqm_dimod)
bqm_dimod.update(fpm_dimod)
bqm_dimod.normalize()

bqm = bqm_from_fpm(fpm, k=k, s=s)

# Both constructions must agree entry-wise (up to np.isclose tolerance).
assert np.isclose(bqm, bqm_dimod.to_numpy_matrix()).all()

In [None]:
np.linalg.norm(fpms_knn['a1b0001']), np.linalg.norm(fpms_from_authors['a1b0001s1000p040'])

In [None]:
{
    k: np.sum(v > 0) / (len(v) ** 2) * 100
    for k,v in bqms_from_authors.items()
}

In [None]:
np.sum(fpms_from_authors['a1b0001s1000p040'] > 0) / fpms_from_authors['a1b0001s1000p040'].shape[0] ** 2

In [None]:
np.sum(fpms_from_authors['a1b0001s1000p040'] > 0) / fpms_from_authors['a1b0001s1000p040'].shape[0] ** 2

In [None]:
fpms_from_authors

In [None]:
{
    k: int((sum(v) / len(v)) * 100)
    for k,v in selections_from_authors.items()
}

In [None]:
dr_fpm_knn = os.path.join(dr_results, 'ItemKNNCFRecommender/')
dr_tmd_results_knn = os.path.join(dr_tmd_results, 'ItemKNNCFRecommender/')

In [None]:
dr_tmd_results_knn

In [None]:
fpms_knn = get_fpms(dr_fpm_knn)

In [None]:
# For every FPM, count the eigenvalues with positive / negative real part and with
# positive / negative imaginary part, then plot the real-part counts.
pos_real = []
neg_real = []
pos_imag = []
neg_imag = []
# fpms_knn is a dict keyed by folder name; iterate over the matrices, not the keys.
for fpm in fpms_knn.values():
    eigs = np.linalg.eigvals(fpm)

    pos_real.append(np.count_nonzero(eigs.real > 0))
    neg_real.append(np.count_nonzero(eigs.real < 0))

    pos_imag.append(np.count_nonzero(eigs.imag > 0))
    neg_imag.append(np.count_nonzero(eigs.imag < 0))

plt.plot(pos_real, '-o', color='orange')
plt.plot(neg_real, '-o', color='blue')
plt.show()

In [None]:
df_paper = pd.read_csv('paper.csv', index_col=0)

In [None]:
df_paper

In [None]:
df_validation

In [None]:
df_test[df_test['@n'] == 10]

In [None]:
df_test[df_test['@n'] == 10][metrics] - df_paper

In [None]:
selections['a1b000206s0p000']

In [None]:
f'{12:03d}'

In [None]:
# For each beta, write a random selection with the same number of selected features as
# the stored 'a1b{beta}s0p000' selection into a sibling folder whose name carries that
# share as a percentage of len(FPM), then reload to verify the round trip.
# NOTE(review): np.random is not seeded, so the random selections are not reproducible.
selections = get_selections(dr_fpm_knn)

for beta in [0.00206, 0.00231, 0.00292, 0.00334]:
    # Folder names carry beta with the decimal point removed.
    beta = str(beta).replace('.', '')
    count = sum(selections[f'a1b{beta}s0p000'])
    p = int(np.round(count * 100 / len(FPM)))
    selection_data_io = DataIO(
        os.path.join(dr_fpm_knn, f'a1b{beta}s0p{p:03d}', 'cqfs_tt/')
    )
    
    selection_random = np.zeros(len(FPM), dtype=bool)
    selection_random[np.random.choice(len(FPM), count, replace=False)] = True
    selection_data_io.save_data('cqfs_tt', {'selection': selection_random})

    # Re-scan the results folder and check the saved selection is read back unchanged.
    new_selections = get_selections(dr_fpm_knn)
    assert all(new_selections[f'a1b{beta}s0p{p:03d}'] == selection_random)

In [None]:
sum(selections_knn['a1b000251s0p065'])

In [None]:
sum(selections_knn['a1b000251s0p000']) / len(FPM) * 100

In [None]:
import tntorch as tn
import torch
import operator
torch.set_default_dtype(torch.float64)
DTYPE = torch.float64
DEVICE = torch.device('cpu')


class CQFSTTSampler:
    """Sampler that builds a tntorch tensor from an FPM plus a penalty term and returns
    the minimiser found by `tn.argmin`.

    NOTE(review): a second class with the same name is defined further down in the
    notebook; whichever cell ran last determines what `CQFSTTSampler` refers to.
    """

    # Stored by __init__ but not read anywhere in this class.
    _tol: float

    def __init__(self, tol: float = 1e-4):
        """Store the tolerance `tol`."""
        self._tol = tol

    def _optimize(self, W, s, k, eps=1e-6):
        """Build the target tensor `term1 + term2` from `W`, `s`, `k` and minimise it.

        Args:
            W: square 2-D torch tensor (asserted below).
            s: scalar multiplying the second term.
            k: scalar entering the second term through (-k)**2 and (1 - k)**2.
            eps: tolerance passed to `tn.reduce` and `tn.round_tt`.

        Returns:
            Whatever `tn.argmin` returns for the target tensor.
        """
        N = W.shape[0]
        assert W.shape[1] == N

        # First term: x @ W @ x^T
        # One tensor per position n, built from a [1, 2, 1] core that is non-zero only at
        # physical index 1, with an extra mode of size N that is zeroed everywhere except
        # at n. Presumably this selects x_n in slot n of the last mode -- TODO confirm
        # against tntorch's unsqueeze/repeat semantics.
        ts = []
        for n in range(N):
            c = torch.zeros([1, 2, 1], dtype=DTYPE)
            c[0, 1, 0] = 1
            t = tn.Tensor([c])[:, None]
            t.cores[1] = t.cores[1].repeat(1, N, 1)
            t.cores[1][:, :n, :] = 0
            t.cores[1][:, n + 1:, :] = 0
            t = tn.unsqueeze(t, list(range(n)) + list(range(n + 1, N)))
            t = t.repeat(*[2] * n, 1, *[2] * (N - n - 1), 1)
            ts.append(t)
        t = sum(ts)
        # Contract the last core with each column of W in turn, then sum the results.
        ts = []
        for n in range(N):
            trow = t.clone()
            trow.cores[-1] = torch.einsum('ijl,jk->ikl', trow.cores[-1], W[:, n:n + 1])
            ts.append(trow)
        term1 = tn.reduce(ts, operator.add, eps=eps)[..., 0]

        # Second term
        # Rank-2 cores that start from an identity over the rank indices with two entries
        # overwritten by (-k)**2 and (1 - k)**2; the result is scaled by s.
        # NOTE(review): presumably this encodes the "select k features" penalty --
        # confirm the exact expression it represents.
        c = torch.eye(2, 2, dtype=DTYPE)[:, None, :].repeat(1, 2, 1)
        c[1, 0, 0] = (-k) ** 2
        c[1, 1, 0] = (1 - k) ** 2
        xs = tn.Tensor([c[-1:, :, :]] + [c] * (N - 2) + [c[:, :, 0:1]])
        term2 = s * tn.round_tt(xs, eps=eps)

        # Minimize tensor
        target = tn.round_tt(term1 + term2, eps=eps)
        return tn.argmin(target, rmax=40, max_iter=10, verbose=True)

    def sample(self, *, FPM, s, k):
        """Minimise the target built from `FPM`, `s`, `k`.

        Returns:
            dict mapping each position i to the i-th entry of the minimiser.
        """
        FPM = torch.tensor(FPM, dtype=DTYPE)
        f = self._optimize(FPM, s, k)
        return {
            i: f[i]
            for i in range(len(f))
        }

In [None]:
sampler = CQFSTTSampler()

In [None]:
percentages = [40, 60, 80, 95]
alphas = [1]
# betas = [1, 1e-1, 1e-2, 1e-3, 1e-4]
betas = [0.00257]
combination_strengths = [1, 10, 100, 1000, 10000]

In [None]:
selection = {0: 1, 1: 1, 2: 0, 3: 1, 4: 0, 5: 1, 6: 1, 7: 1, 8: 1, 9: 1, 10: 0, 11: 0, 12: 0, 13: 0, 14: 1, 15: 1, 16: 1, 17: 0, 18: 1, 19: 1, 20: 0, 21: 0, 22: 1, 23: 1, 24: 0, 25: 1, 26: 0, 27: 1, 28: 1, 29: 0, 30: 1, 31: 1, 32: 1, 33: 1, 34: 0, 35: 1, 36: 1, 37: 0, 38: 1, 39: 1, 40: 1, 41: 0, 42: 1, 43: 0, 44: 0, 45: 1, 46: 1, 47: 1, 48: 1, 49: 0, 50: 1, 51: 0, 52: 1, 53: 0, 54: 0, 55: 0, 56: 0, 57: 1, 58: 0, 59: 0, 60: 0, 61: 1, 62: 0, 63: 1, 64: 1, 65: 1, 66: 1, 67: 0, 68: 0, 69: 1, 70: 1, 71: 1, 72: 0, 73: 0, 74: 1, 75: 1, 76: 1, 77: 1, 78: 1}

In [None]:
from itertools import product

# Run the sampler over the whole (percentage, alpha, beta, strength) grid and keep every
# selection keyed by its parameter tuple.
results = {}
for p, a, b, s in product(percentages, alphas, betas, combination_strengths):
    # Folder-style key, only needed for the precomputed alternative noted below.
    key = 'a%sb%s' % (a, str(b).replace('.', ''))
    FPM = a * FPM_K + b * FPM_E
    # Alternative: use the precomputed matrix, FPM = fpms_knn[key]
    # Number of features to select, taken from the FPM that is actually sampled
    # (previously derived from an unrelated global `fpm`).
    k = p * len(FPM) / 100
    selection = sampler.sample(FPM=FPM, s=s, k=k)
    results[(p, a, b, s)] = selection
    print(selection)

In [None]:
np.linalg.eig(fpms_knn['a1b1'])

In [None]:
r = np.random.randint(0, 2, size=len(fpms_knn['a1b1']))
r.T.dot(fpms_knn['a1b1']).dot(r)

In [None]:
eigs = np.linalg.eig(fpms_knn['a1b1'])[0]

In [None]:
plt.plot(np.linalg.svd(fpms_knn['a1b1'])[1])

In [None]:
eigs.real.round(3)

In [None]:
plt.plot(np.linalg.eig(fpms_knn['a1b1'])[0])

In [None]:
sampler.sample(FPM=fpms_knn['a1b1'], s=0, k=0)

In [None]:
import tntorch as tn
import torch
import operator
torch.set_default_dtype(torch.float64)
DTYPE = torch.float64

In [None]:
def optimize(W, eps=1e-6):
    """Build a tntorch tensor from the square matrix `W` and return its `tn.argmin`.

    Args:
        W: square 2-D torch tensor (asserted below).
        eps: tolerance passed to `tn.reduce` and `tn.round_tt`.

    Returns:
        Whatever `tn.argmin` returns for the rounded tensor.
    """
    N = W.shape[0]
    assert W.shape[1] == N

    # First term: x @ W @ x^T
    # One tensor per position n, built from a [1, 2, 1] core that is non-zero only at
    # physical index 1, with an extra mode of size N zeroed everywhere except at n.
    # NOTE(review): the core is created without an explicit dtype, so it follows torch's
    # default dtype at the time of the call.
    ts = []
    for n in range(N):
        c = torch.zeros([1, 2, 1])
        c[0, 1, 0] = 1
        t = tn.Tensor([c])[:, None]
        t.cores[1] = t.cores[1].repeat(1, N, 1)
        t.cores[1][:, :n, :] = 0
        t.cores[1][:, n + 1:, :] = 0
        t = tn.unsqueeze(t, list(range(n)) + list(range(n + 1, N)))
        t = t.repeat(*[2] * n, 1, *[2] * (N - n - 1), 1)
        ts.append(t)
    t = sum(ts)
    # Contract the last core with each column of W in turn, then sum the results.
    ts = []
    for n in range(N):
        trow = t.clone()
        trow.cores[-1] = torch.einsum('ijl,jk->ikl', trow.cores[-1], W[:, n:n+1])
        ts.append(trow)
    term1 = tn.reduce(ts, operator.add, eps=eps)[..., 0]
    
    # Minimize tensor
    target = tn.round_tt(term1, eps=eps)
    return tn.argmin(target, rmax=100, verbose=True)

In [None]:
FPM = FPM_K + 1e-4 * FPM_E
BQM = bqm_from_fpm(FPM, k=0.6 * len(FPM), s=1e2)

In [None]:
plt.plot(s_E)
plt.yscale('log')
plt.show()

In [None]:
plt.plot(s_K)
plt.yscale('log')
plt.show()

In [None]:
s[0]/s[-10]

In [None]:
plt.plot(s)
# plt.xlim([-10, 100])
plt.show()

In [None]:
selection = optimize(torch.Tensor(BQM))

In [None]:
t.to(device=DEVICE).cores

In [None]:
ns = [10, 20, 30, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
A = torch.randn(ns[-1], ns[-1])

In [None]:
from time import monotonic

# Time optimize() on the leading n x n block of A for each size in `ns`.
# NOTE(review): optimize is defined in this notebook as optimize(W, eps=1e-6); the call
# below passes three positional arguments -- confirm which signature was intended.
times = []
for n in track(ns):
    # Start from minus the clock reading so that adding it again yields the elapsed time.
    t = -monotonic()
    optimize(A[:n, :n], 0, 0)
    t += monotonic()
    print(f'{n}: {t/60:.2f} minutes elapsed' )
    times.append(t)

In [None]:
r1 = np.random.randn(20)
r2 = np.random.randn(20)
r1[0] = 1e5
r2[0] = 1e5
A = np.outer(r1, r2) + 1e-3 * np.random.randn(20, 20)

In [None]:
A = torch.tensor(A, dtype=torch.float64)

In [None]:
optimize(A, 0, 0)

In [None]:
for key, fpm in sorted(fpms_knn.items(), key=lambda x: x[0], reverse=True):
    print(key + ':', np.sum(fpm > 0) / fpm.size)

In [None]:
from data.DataLoader import TheMoviesDatasetLoader, TheMoviesDatasetReader, CiteULike_aLoader, CiteULike_aReader
from recsys.Data_manager.DataSplitter_Cold_items import DataSplitter_Cold_items
from recsys.Base.DataIO import DataIO

data_loader = CiteULike_aLoader()
data_loader.load_data()

ICM_cite = data_loader.get_ICM_train_from_name('ICM_title_abstract')

In [None]:
dataIO = DataIO(dr_cite_results_from_authors + '/')
K_cite = dataIO.load_data('K.zip')['K'].astype(np.float64)
E_cite = dataIO.load_data('E.zip')['E'].astype(np.float64)

In [None]:
# Project the CiteULike K and E matrices onto feature space with the CiteULike ICM loaded
# above as `ICM_cite` (the bare name `ICM` is assigned further down from the Xing loader).
FPM_K_cite = ICM_cite.T @ K_cite @ ICM_cite
FPM_E_cite = ICM_cite.T @ E_cite @ ICM_cite

In [None]:
dataIO.save_data('FPM_K', {'FPM_K': FPM_K_cite.todense()})
dataIO.save_data('FPM_E', {'FPM_E': FPM_E_cite.todense()})

In [None]:
from data.DataLoader import XingChallenge2017Loader
data_loader = XingChallenge2017Loader()
data_loader.load_data()

ICM = data_loader.get_ICM_train_from_name('ICM_all').astype(np.float64)

In [None]:
from recsys.Base.DataIO import DataIO
base_folder_path = "results/XingChallenge2017/ICM_all/ItemKNNCFRecommender/"

In [None]:

dataIO = DataIO(base_folder_path)

In [None]:
FPM_K = dataIO.load_data('FPM_K.zip')['FPM_K'].astype(np.float64)
FPM_E = dataIO.load_data('FPM_E.zip')['FPM_E'].astype(np.float64)

In [None]:
K = dataIO.load_data('K.zip')['K'].astype(np.float64)
E = dataIO.load_data('E.zip')['E'].astype(np.float64)

In [None]:
K.nnz, E.nnz, K.nnz / E.nnz

In [None]:
K.dtype, E.dtype, ICM.dtype

In [None]:
a = 1
b = np.round(K.nnz / E.nnz, 3)

In [None]:
# FPM built from the unweighted sum IPM = K + E: FPM_paper = ICM^T (IPM ICM).
IPM = K + E
IPM.eliminate_zeros()
# Use the explicit matrix-product operator: `*` means a matrix product only for scipy's
# sparse *matrix* classes and is element-wise for sparse arrays and dense ndarrays.
IFPM = IPM @ ICM
FPM_paper = ICM.T @ IFPM
FPM_paper.eliminate_zeros()

In [None]:
FPM_paper_dense = np.array(FPM_paper.todense())

In [None]:
FPM_K = ICM.T @ (K @ ICM)
FPM_E = ICM.T @ (E @ ICM)

FPM_K = np.array(FPM_K.todense())
FPM_E = np.array(FPM_E.todense())

In [None]:
K.nnz

In [None]:
E.nnz / E.shape[0]**2 * 100

In [None]:
K.nnz / K.shape[0]**2 * 100

In [None]:
E.size * 4 / 1024 / 1024 / 1024

In [None]:
FPM_KE = 1.0 * FPM_K + 1.0 * FPM_E

In [None]:
FPM = fpms_knn['a1b1']

In [None]:
FPM = a * FPM_K + b * FPM_E

In [None]:
import matplotlib.pyplot as plt

In [None]:
FPM = a * FPM_K + 10 ** bs[-1] * FPM_E

In [None]:
np.sum(FPM > 0)

In [None]:
for key, fpm in sorted(fpms_knn.items(), key=lambda x: x[0], reverse=True):
    print(key + ':', np.sum(fpm > 0) / fpm.size)

In [None]:
np.sum(FPM_paper > 0) / FPM_paper.size

In [None]:
for b in [1, 1e-1, 1e-2, 1e-3, 1e-4]:
    FPM = a * FPM_K + b * FPM_E
    print(np.sum(FPM > 0) / FPM.size)

In [None]:
plt.rcParams['text.usetex'] = False

In [None]:
# For 10000 exponents between 0 and -4, form FPM = a * FPM_K + 10**exponent * FPM_E and
# record the fraction of its entries that are strictly positive, then plot it against
# b = 10**exponent on a logarithmic x axis.
frac_pos = []
a = 1
bs = np.linspace(0, -4, 10000)
for val in bs:
    FPM = a * FPM_K + 10 ** val * FPM_E
    # Dividing by both dimensions in turn equals dividing by the number of entries.
    frac_pos.append(np.sum(FPM > 0) / FPM.shape[0] / FPM.shape[1])
frac_pos = np.array(frac_pos)
    
plt.plot(10**bs, frac_pos)
plt.title('IPM = K + bE', fontsize=14)
plt.ylabel('Fraction of positive entries', fontsize=14)
plt.xlabel('b', fontsize=14)
plt.xscale('log')
plt.show()

In [None]:
# Betas whose fraction of positive FPM entries is closest to 0.3, 0.4, 0.6 and 0.7
# (0.5 is intentionally left out of this list).
betas_fraction = [
    10 ** bs[np.argmin(np.abs(frac_pos - target))]
    for target in (0.3, 0.4, 0.6, 0.7)
]

In [None]:
len(selections['a1b000206s0p000'])

In [None]:
selections_knn['a1b000398s0p000']

In [None]:
np.round(betas_fraction, 5)

In [None]:
beta_02 = 10**bs[np.argmin(np.abs(frac_pos - 0.2))]
beta_05 = 10**bs[np.argmin(np.abs(frac_pos - 0.5))]
np.round(beta_05, 5)

In [None]:
0.00257

In [None]:
FPM = FPM_K + 1e-3 * FPM_E

p = 0.8
s = 10**2
# Compare (in millions) the penalty magnitude s * ((1 - p) * F)**2 with the total of the
# FPM entries; `p` is used here instead of repeating the literal 0.8.
s * (len(FPM) * (1 - p))**2 / 10**6, FPM.sum() / 10**6

In [None]:
# Vector norms of E taken along axis=1.
# NOTE(review): the name says "col" but axis=1 reduces over columns, i.e. yields one norm
# per row -- confirm which was intended.
E_norm_col = sp.sparse.linalg.norm(E, axis=1)

In [None]:
# Vector norms of K taken along axis=1.
# NOTE(review): the name says "col" but axis=1 reduces over columns, i.e. yields one norm
# per row -- confirm which was intended.
K_norm_col = sp.sparse.linalg.norm(K, axis=1)

In [None]:
plt.hist(K_norm_col[K_norm_col != 0], bins=100)
plt.hist(E_norm_col[E_norm_col != 0], bins=100)
plt.show()

In [None]:
# Histogram of the difference between the non-zero E norms and the non-zero K norms.
# NOTE(review): the two masks are computed independently, so the operands are only
# aligned (or even equally long) if E and K have zero norms at the same positions.
plt.hist(E_norm_col[E_norm_col != 0] - K_norm_col[K_norm_col != 0], bins=100)

In [None]:
FPM = FPM_K + 0.00253 * FPM_E

In [None]:
np.sum(FPM > 0) / FPM.size

In [None]:
FPM[FPM > 0]

In [None]:
np.linalg.norm(FPM_K[FPM_K > 0]) / np.linalg.norm(FPM_E[FPM_E > 0])

In [None]:
selections = get_selections(dr_fpm_knn)

In [None]:
np.sum(selections[f'a1b{str(bs[1]).replace(".","")}s0p000'])

In [None]:
bs = [0.00158, 0.00251, 0.00398, 0.00631]

In [None]:
class CQFSTTSampler:
    """Sampler that minimises x @ BQM @ x^T with tntorch and returns a boolean mask.

    NOTE(review): this re-defines the `CQFSTTSampler` class declared earlier in the
    notebook with a different `sample` signature (BQM only, no s / k).
    """

    # Stored by __init__ but not read anywhere in this class.
    _tol: float

    def __init__(self, tol: float = 1e-4):
        """Store the tolerance `tol`."""
        self._tol = tol

    def _optimize_fixed_k_func(self, *args, W, k):
        """Objective for `_optimize_fixed_k`.

        Stacks `args` into a matrix x, zeroes every column whose sum differs from `k`
        (in place, via the indicator), and returns the diagonal of x^T W x.
        """
        x = torch.vstack(args)
        indicator = (x.sum(axis=0, keepdim=True) == k).to(DTYPE)
        x *= indicator
        return (x.T @ W @ x).diag()

    def _optimize_fixed_k(self, W, k, eps=1e-6):
        """Call `tn.argmin` on `_optimize_fixed_k_func` over a {0, 1} domain per variable.

        `eps` is accepted but not used here. Not called by `sample` (the call there is
        commented out). NOTE(review): relies on `partial` and `DEVICE` being defined at
        module level before this method runs.
        """
        return tn.argmin(
            function=partial(self._optimize_fixed_k_func, W=W, k=k),
            rmax=100, max_iter=10, verbose=True, device=DEVICE,
            domain=[torch.tensor([0, 1])] * len(W),
        )

    def _optimize(self, W, eps=1e-6):
        """Build the tensor for x @ W @ x^T and return its `tn.argmin`.

        Args:
            W: square 2-D torch tensor (asserted below).
            eps: tolerance passed to `tn.reduce` and `tn.round_tt`.
        """
        N = W.shape[0]
        assert W.shape[1] == N

        # First term: x @ W @ x^T
        # One tensor per position n, built from a [1, 2, 1] core that is non-zero only at
        # physical index 1, with an extra mode of size N zeroed everywhere except at n.
        ts = []
        for n in range(N):
            c = torch.zeros([1, 2, 1])
            c[0, 1, 0] = 1
            t = tn.Tensor([c])[:, None]
            t.cores[1] = t.cores[1].repeat(1, N, 1)
            t.cores[1][:, :n, :] = 0
            t.cores[1][:, n + 1:, :] = 0
            t = tn.unsqueeze(t, list(range(n)) + list(range(n + 1, N)))
            t = t.repeat(*[2] * n, 1, *[2] * (N - n - 1), 1)
            ts.append(t)
        t = sum(ts)
        # Contract the last core with each column of W in turn, then sum the results.
        ts = []
        for n in range(N):
            trow = t.clone()
            trow.cores[-1] = torch.einsum('ijl,jk->ikl', trow.cores[-1],
                                          W[:, n:n + 1])
            ts.append(trow)
        term1 = tn.reduce(ts, operator.add, eps=eps)[..., 0]

        # Testing purposes
        # val = torch.randint(0, 2, [1000, N])
        # yhat = term1[list(val.t())].torch()
        # y = torch.einsum('ij,jk,ik->i', val.double(), W, val.double())
        # print(torch.linalg.norm(y-yhat) / torch.linalg.norm(y))

        # Second term
        # Disabled in this version: only the quadratic term is minimised.
#         c = torch.eye(2, 2)[:, None, :].repeat(1, 2, 1)
#         c[1, 0, 0] = (-k) ** 2
#         c[1, 1, 0] = (1 - k) ** 2
#         xs = tn.Tensor([c[-1:, :, :]] + [c] * (N - 2) + [c[:, :, 0:1]])
#         term2 = s * tn.round_tt(xs, eps=eps)

        # Minimize tensor
        target = tn.round_tt(term1, eps=eps)
        return tn.argmin(target, rmax=100, max_iter=10, verbose=True)

    def sample(self, *, BQM):
        """Minimise x @ BQM @ x^T and return the minimiser as a boolean numpy array."""
        BQM = torch.tensor(BQM, dtype=DTYPE, device=DEVICE)
        f = self._optimize(BQM)
#         f = self._optimize_fixed_k(FPM, k)
        return np.array(f).astype(bool)
#         return {
#             i: f[i]
#             for i in range(len(f))
#         }

In [None]:
import tntorch as tn
import torch
import operator
from functools import partial

torch.set_default_dtype(torch.float64)
DTYPE = torch.float64
DEVICE = torch.device('cpu')

In [None]:
sampler = CQFSTTSampler()

In [None]:
selections_ours = {
    key: sampler.sample(BQM=BQM)
    for key, BQM in bqms_from_authors.items()
}

In [None]:
# For every BQM provided by the authors, print the objective value s^T BQM s reached by
# the authors' selection and by our sampler's selection.
for key, BQM in bqms_from_authors.items():
    s_a = selections_from_authors[key]
    s_o = selections_ours[key]
    
    print(key)
    print('Authors: ', s_a.T @ BQM @ s_a)
    print('Ours:    ', s_o.T @ BQM @ s_o)
    print('----')

In [None]:
selections_ours


In [None]:
selections_from_authors

In [None]:
FPM = torch.randn(80, 80)

In [None]:
# NOTE(review): this call uses the (FPM, s, k) keywords of the earlier CQFSTTSampler
# definition; the most recent definition in this notebook only accepts `BQM` -- confirm
# which class `sampler` was built from.
sampler.sample(FPM=FPM, s=0, k=10)