In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity


# Directory (relative to the notebook) that holds the pickled data and model
# artifacts opened by the loading cells further down.
ml_path = './learning_data/'

# Load Data

In [2]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only point
# ml_path at files that come from a trusted source.
# Paths are built with os.path.join so they remain valid whether or not
# ml_path ends with a path separator.

# Raw test inputs (its `.input` column is vectorized later on).
with open(os.path.join(ml_path, 'X_test_raw.pkl'), 'rb') as f:
    X_test_raw = pickle.load(f)
# Test labels; only the number of rows is used in this notebook.
with open(os.path.join(ml_path, 'y_test.pkl'), 'rb') as f:
    y_test = pickle.load(f)
# Training labels, used as the neighbour matrix for the similarity step.
# NOTE(review): presumably a dense samples x topics 0/1 matrix - confirm.
with open(os.path.join(ml_path, 'y_train.pkl'), 'rb') as f:
    y_train = pickle.load(f)
# Topic names, indexed by column position of the score matrices.
with open(os.path.join(ml_path, 'topics.pkl'), 'rb') as f:
    topics = pickle.load(f)

# Number of test samples, used as the denominator in the failure report.
test_samples_count = y_test.shape[0]

# Load LR Model

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# model files that come from a trusted source.
# os.path.join keeps the paths valid whether or not ml_path ends with a
# path separator.

# Fitted text vectorizer used to transform the raw test inputs.
with open(os.path.join(ml_path, 'tfidf_vectorizer.pkl'), 'rb') as f:
    tfidf_vectorizer_lr = pickle.load(f)
# Fitted classifier; its predict_proba is used to score every topic.
with open(os.path.join(ml_path, 'ovr_clf.pkl'), 'rb') as f:
    ovr_clf = pickle.load(f)

# Helper function to get the top-k results for n samples in an n*m matrix

In [4]:
def get_top_k(values, k):
    """Build a mask marking the k largest entries of every row.

    Parameters
    ----------
    values : numpy.ndarray
        2-D array of scores, one row per sample.
    k : int
        Number of entries to mark per row. If ``k`` exceeds the number of
        columns, every column is marked. If ``k <= 0``, nothing is marked.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``values`` holding 1 at the
        positions of each row's k largest values and 0 everywhere else.
    """
    results = np.zeros_like(values)
    if k <= 0:
        # Without this guard ``[-0:]`` would slice the *whole* row and mark
        # every entry, and a negative k would drop the largest entries instead.
        return results
    for i in range(values.shape[0]):
        # argsort is ascending, so the last k positions are the k largest.
        top_indexes = values[i, :].argsort()[-k:]
        results[i, top_indexes] = 1.0
    return results

# Helper function to get the recommendations and their scores in a dict format

In [5]:
def get_topic_names(recoms, k, topics=topics):
    """Turn one row of scores into a list of (topic, score) pairs.

    Every position of ``recoms`` with a strictly positive score is looked up
    in ``topics``; the pairs are ordered from highest to lowest score. When
    fewer than ``k`` positions are positive, the list is padded with
    ``('no-more-recoms', 0)`` entries until it has ``k`` items. A list that
    already has ``k`` or more items is returned as is.

    Parameters
    ----------
    recoms : 1-D array of scores, one per topic.
    k : minimum length of the returned list.
    topics : sequence mapping a column index to its topic name.

    Returns
    -------
    list of (topic, score) tuples.
    """
    scored = [(topics[idx], recoms[idx]) for idx in np.nonzero(recoms > 0)[0]]
    # list.sort is stable, so equal scores keep their column order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    shortfall = k - len(scored)
    if shortfall > 0:
        scored.extend([('no-more-recoms', 0)] * shortfall)
    return scored

def vectorized_get_topic_names(recoms_array, k, topics=topics):
    """Apply ``get_topic_names`` to every row of a 2-D score matrix.

    Parameters
    ----------
    recoms_array : 2-D array of scores, one row per sample.
    k : number of (topic, score) pairs expected per row.
    topics : sequence mapping a column index to its topic name; it is
        forwarded to ``get_topic_names`` for every row.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(n_samples, k, 2)`` holding the topic name and
        the stringified score of each recommendation. An input without rows
        yields an empty array of shape ``(0, k, 2)``.
    """
    # Rows are collected first and converted in a single np.array call so the
    # string dtype is wide enough for every entry. np.apply_along_axis sizes
    # its output buffer from the first row only, which silently truncates
    # longer topic names or score strings in later rows.
    rows = [get_topic_names(row, k, topics) for row in np.asarray(recoms_array)]
    if not rows:
        return np.empty((0, k, 2), dtype=str)
    return np.array(rows)

# Set K

In [6]:
# Number of top-scoring LR predictions kept per test sample.
k_ml = 3
# Number of similarity-based recommendations kept per test sample.
k_topfilter = 2
# Number of most similar training rows kept per test sample.
k_similarity = 25
# Tag embedded in the names of the saved result files. It is a hand-written
# string, not derived from the values above, so it must be kept in sync manually.
# NOTE(review): presumably encodes sqrt(k_similarity)_k_ml_k_topfilter - confirm.
k_saved_files = '5_3_2'

# Results from LR (Top-k)

In [7]:
# Vectorize the raw test inputs with the loaded vectorizer and score every
# topic with the loaded classifier's predict_proba.
X_test_lr = tfidf_vectorizer_lr.transform(X_test_raw.input.values)
lr_predictions = ovr_clf.predict_proba(X_test_lr)
# Integer 0/1 mask marking the k_ml highest-scoring topics of each sample.
lr_top_k = get_top_k(lr_predictions, k_ml).astype(int)

# Keep the scores at the top-k positions only (all other entries become 0),
# then convert each row into (topic, score) pairs.
lr_results = lr_predictions*lr_top_k
lr_recoms = vectorized_get_topic_names(lr_results, k_ml)

# Similarity (Top-k^2)

In [8]:
# Cosine similarity between each test sample's top-k topic mask and every row
# of y_train (result has one row per test sample, one column per training row).
sims_values = cosine_similarity(lr_top_k, y_train)
# Mask marking the k_similarity most similar training rows per test sample.
sims_top_k2 = get_top_k(sims_values, k_similarity)
# Similarity values of the kept neighbours; every other entry is 0.
sims_filtered = sims_top_k2 * sims_values

# Make Recommendations (Top-k)

In [9]:
# Similarity-weighted average of the neighbours' training labels:
# ratings = (sims_filtered @ y_train) / (row sums of sims_filtered).
# A test sample whose kept similarities are all 0 has a zero row sum; the
# guarded division leaves its ratings at 0 instead of producing 0/0 = NaN
# (and a RuntimeWarning). Such rows still yield no recommendations downstream.
# NOTE(review): assumes y_train is a dense numeric array - confirm.
neighbour_weight_sums = sims_filtered.sum(axis=1)[:, None]
weighted_topic_sums = np.matmul(sims_filtered, y_train)
topfilter_ratings = np.divide(
    weighted_topic_sums,
    neighbour_weight_sums,
    out=np.zeros(weighted_topic_sums.shape, dtype=float),
    where=neighbour_weight_sums != 0,
)
# Do not recommend topics that the LR model already returned for the sample.
topfilter_ratings[np.where(lr_results>0)] = 0
# Mask of the k_topfilter highest remaining ratings per sample.
topfilter_top_k = get_top_k(topfilter_ratings, k_topfilter)

# Keep only the selected ratings and convert each row into (topic, score) pairs.
topfilter_results = topfilter_ratings * topfilter_top_k
topfilter_recoms = vectorized_get_topic_names(topfilter_results, k_topfilter)

# Report on Failures

In [10]:
# Number of strictly positive ratings available for each test sample.
positives_per_sample = (topfilter_ratings > 0).sum(axis=1)

# failed_k: samples with fewer than k_topfilter positive ratings.
# failed_0: samples without any positive rating (a subset of failed_k).
failed_k = []
failed_0 = []
for i, n_positive in enumerate(positives_per_sample):
    if n_positive < k_topfilter:
        failed_k.append(i)
    if n_positive == 0:
        failed_0.append(i)

print(f'\nTopfilter failed to make {k_topfilter} recommendations for {np.round(len(failed_k)*100/test_samples_count, 2)}% of the {test_samples_count} test repositoried.')
print(f'\nTopfilter made NO recommendations for {np.round(len(failed_0)*100/test_samples_count, 2)}% of the {test_samples_count} test repositoried.')


Topfilter failed to make 2 recommendations for 43.16% of the 29050 test repositoried.

Topfilter made NO recommendations for 30.06% of the 29050 test repositoried.


# Save Results

In [11]:
# Make sure the output directory exists. exist_ok=True tolerates a directory
# that is already there, while genuine failures (e.g. missing permissions)
# still raise instead of being silently swallowed.
os.makedirs('./results/', exist_ok=True)

# LR-based recommendations for the test set.
with open(f'./results/lr_topfilter_top{k_saved_files}_test_results_ml_based.pkl','wb') as f:
    pickle.dump(lr_recoms, f)
# Similarity-based (topfilter) recommendations for the test set.
with open(f'./results/lr_topfilter_top{k_saved_files}_test_results_sim_based.pkl','wb') as f:
    pickle.dump(topfilter_recoms, f)