In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier


# Directory the notebook reads its pickled inputs from (test split, topic list,
# TF-IDF vectorizer and classifier).
ml_path = './learning_data/'
# Directory the notebook reads the graph CSV and the per-topic numeric CSV from.
sedkgraph_path = './sedkgraph_data/'

# Load Data

In [2]:
def _read_pickle(file_name):
    """Unpickle ``file_name`` located in the ``ml_path`` directory."""
    with open(ml_path + file_name, 'rb') as handle:
        return pickle.load(handle)


# Test inputs, test labels and the topic list that prediction columns are
# mapped through.
X_test_raw = _read_pickle('X_test_raw.pkl')
y_test = _read_pickle('y_test.pkl')
topics = _read_pickle('topics.pkl')

# Graph relations (lhs/rhs columns) and per-topic numeric features.
sedkgraph = pd.read_csv(sedkgraph_path + 'sedkgraph.csv')
topics_numeric = pd.read_csv(sedkgraph_path + 'topics_numerical_data.csv')

# Number of test samples; used as the denominator in the failure report.
test_samples_count = y_test.shape[0]

# Calculate simple rate =
### 0.5*(popularity_softened+rel_count_softened)

In [3]:
# Simple rate: half the sum of the two softened per-topic signals.
topics_numeric['rate'] = (
    topics_numeric['popularity_softened']
    .add(topics_numeric['rel_count_softened'])
    .mul(0.5)
)

# Load LR Model

In [4]:
# Pickled text vectorizer, applied to the raw test inputs further down.
with open(ml_path + 'tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer_lr = pickle.load(vectorizer_file)

# Pickled classifier whose predict_proba output drives the LR recommendations.
with open(ml_path + 'ovr_clf.pkl', 'rb') as classifier_file:
    ovr_clf = pickle.load(classifier_file)

# Helper function to get the top-k results

In [5]:
def get_top_k(probs, k):
    """Build a 0/1 mask that marks the ``k`` largest entries of every row.

    Parameters
    ----------
    probs : numpy.ndarray
        2-D array of scores, one row per sample.
    k : int
        Number of entries to mark per row. Values larger than the row length
        mark the whole row; values ``<= 0`` mark nothing.

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype of ``probs`` holding 1 at the selected
        positions and 0 elsewhere.
    """
    results = np.zeros_like(probs)
    if k <= 0:
        # ``argsort()[-0:]`` is the *whole* row, so without this guard k == 0
        # would mark every entry instead of none.
        return results

    for i in range(probs.shape[0]):
        # The order of the selected columns is irrelevant for a mask.
        top_columns = probs[i, :].argsort()[-k:]
        results[i, top_columns] = 1.0
    return results

# Helper functions to get the recommendations and their scores as (topic, score) pairs

In [6]:
def get_topic_names(recoms, k, topics=topics):
    """Return one sample's recommendations as ``(topic, score)`` pairs.

    Every entry of ``recoms`` that is greater than zero is looked up in
    ``topics`` by its position, and the pairs are ordered from the highest to
    the lowest score. If fewer than ``k`` entries qualify, the list is filled
    up to length ``k`` with ``('no-more-recoms', 0)`` placeholders.
    """
    positive_positions = np.nonzero(recoms > 0)[0]
    scored_topics = [(topics[pos], recoms[pos]) for pos in positive_positions]
    scored_topics.sort(key=lambda pair: pair[1], reverse=True)

    missing = k - len(scored_topics)
    if missing > 0:
        scored_topics.extend([('no-more-recoms', 0)] * missing)
    return scored_topics

def vectorized_get_topic_names(recoms_array, k, topics=topics):
    """Apply ``get_topic_names`` to every row of a 2-D score array.

    Parameters
    ----------
    recoms_array : array-like, 2-D
        One row of per-topic scores per sample.
    k : int
        Minimum number of ``(topic, score)`` pairs per sample (shorter results
        are padded by ``get_topic_names``).
    topics : sequence
        Topic labels indexed by column position; forwarded to
        ``get_topic_names``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_pairs, 2)``. numpy stores the mixed
        ``(topic, score)`` tuples as strings, so scores come back as text.
    """
    recoms_array = np.asanyarray(recoms_array)

    # Collect all rows first and convert once: np.apply_along_axis allocates
    # its output with the dtype of the *first* row's result, which silently
    # truncates longer topic names appearing in later rows.
    per_row = [get_topic_names(row, k, topics) for row in recoms_array]
    if not per_row:
        return np.empty((0, k, 2), dtype=str)
    return np.array(per_row)

# Set K

In [7]:
# Number of recommendations taken from the LR model and from KGRec.
k_ml = 3
k_kgrec = 2
# Tag embedded in the result file names. Derived from the two settings above
# so the file names cannot drift out of sync when either k is changed.
# NOTE(review): assumes the tag pattern is "<total>_<ml>_<kgrec>" (5 = 3 + 2) - confirm.
k_saved_files = f'{k_ml + k_kgrec}_{k_ml}_{k_kgrec}'

# Results from LR (Top-k)

In [8]:
# Vectorise the raw test inputs and get one probability per topic and sample.
X_test_lr = tfidf_vectorizer_lr.transform(X_test_raw.input.values)
lr_predictions = ovr_clf.predict_proba(X_test_lr)

# 0/1 mask of each sample's k_ml highest probabilities.
lr_top_k = get_top_k(probs=lr_predictions, k=k_ml).astype(int)

# Zero out everything outside the top-k, then name the surviving entries.
lr_results = np.multiply(lr_predictions, lr_top_k)
lr_recoms = vectorized_get_topic_names(recoms_array=lr_results, k=k_ml)

# Helper functions to get KGRec recommendations

In [9]:
# Topics that are never scored as KGRec candidates.
exclude_from_recom = ['programming-language']


def flatten(x):
    """Concatenate the items of every sub-iterable of ``x`` into one flat list."""
    flat = []
    for sublist in x:
        flat.extend(sublist)
    return flat

def get_recom_scores(already_assigned,candidates, topics_numeric=topics_numeric, exclude_from_recom=exclude_from_recom):
    """Score candidate topics and normalise the scores by the best one.

    A candidate's raw score is the sum of ``already_assigned[recom]`` over
    every ``recom`` whose candidate list contains it, multiplied by the
    candidate's ``rate`` from ``topics_numeric``.

    Parameters
    ----------
    already_assigned : dict
        Maps an already recommended topic to its numeric score.
    candidates : dict
        Maps an already recommended topic to the list of candidate topics
        associated with it.
    topics_numeric : pandas.DataFrame
        Must provide the columns ``topic`` and ``rate``.
    exclude_from_recom : iterable
        Topics that are never scored.

    Returns
    -------
    list of (topic, float)
        Candidates ordered by descending score, each score divided by the
        highest one. Ties are broken by topic name so the order is
        reproducible. Empty when there is no candidate; left unnormalised when
        the highest score is 0.

    Raises
    ------
    KeyError
        If a candidate has no row in ``topics_numeric``.
    """
    excluded = set(exclude_from_recom)

    # Sum, per candidate, the scores of the assigned topics that point at it.
    # ``set(candidate_sublist)`` keeps the "contained in the list" semantics
    # even if a list holds the same candidate twice.
    raw_scores = {}
    for recom, candidate_sublist in candidates.items():
        for candidate in set(candidate_sublist).difference(excluded):
            raw_scores[candidate] = raw_scores.get(candidate, 0) + already_assigned[recom]

    if not raw_scores:
        return []

    # One vectorised filter instead of a full boolean-mask scan per candidate
    # (and no reliance on float() of a one-row Series).
    relevant = topics_numeric[topics_numeric['topic'].isin(list(raw_scores))]
    rate_by_topic = dict(zip(relevant['topic'], relevant['rate']))

    result = []
    for candidate, score in raw_scores.items():
        if candidate not in rate_by_topic:
            raise KeyError(f'no rate available for candidate topic {candidate!r}')
        result.append((candidate, float(score * float(rate_by_topic[candidate]))))

    # Highest score first; the name is a deterministic tie-breaker because the
    # candidates originate from sets whose iteration order varies between runs.
    result.sort(key=lambda pair: (-pair[1], str(pair[0])))

    max_score = result[0][1]
    if max_score == 0:
        # Nothing to normalise by; avoid a ZeroDivisionError.
        return result
    return [(candidate, score / max_score) for candidate, score in result]

def get_kgrec_recoms(already_assigned, k, sedkgraph=sedkgraph, topics_numeric=topics_numeric):
    """Recommend up to ``k`` additional topics for one sample from the graph.

    Parameters
    ----------
    already_assigned : iterable of (topic, score)
        Topics already recommended for the sample; scores may be numbers or
        their string form (they are passed through ``float``).
    k : int
        Number of recommendations to return.
    sedkgraph : pandas.DataFrame
        Relation table with ``lhs`` and ``rhs`` columns; a topic's neighbours
        are taken from both directions.
    topics_numeric : pandas.DataFrame
        Per-topic table forwarded to ``get_recom_scores``.

    Returns
    -------
    numpy.ndarray
        The first ``k`` ``(topic, score)`` rows, padded with
        ``('no-more-recoms', 0)`` when fewer candidates exist. numpy stores the
        mixed rows as strings.
    """
    already_assigned = {pair[0]: float(pair[1]) for pair in already_assigned}
    assigned_topics = set(already_assigned)

    candidates = {}
    for recom, score in already_assigned.items():
        if score == 0:
            # Zero-score entries (e.g. padding placeholders) contribute no
            # candidates. Skipping instead of stopping avoids silently
            # dropping later topics when the input is not sorted by score.
            continue
        neighbours = set(sedkgraph[sedkgraph.rhs == recom].lhs.values)
        neighbours.update(sedkgraph[sedkgraph.lhs == recom].rhs.values)
        candidates[recom] = list(neighbours.difference(assigned_topics))

    # Forward the caller's table; previously the argument was ignored and the
    # module-level default was always used.
    recom_scores = get_recom_scores(already_assigned, candidates, topics_numeric=topics_numeric)

    missing = k - len(recom_scores)
    if missing > 0:
        recom_scores = recom_scores + [('no-more-recoms', 0)] * missing

    return np.array(recom_scores)[:k]

def prep_already_assigned(already_assigned_array):
    """Pad ragged recommendation lists to a common length and stack them.

    Every inner list is extended with ``('no-more-recoms', 0)`` placeholders
    until it is as long as the longest one, then the result is converted to a
    numpy array.
    """
    target_len = max(len(recs) for recs in already_assigned_array)
    padded = [
        recs + [('no-more-recoms', 0)] * (target_len - len(recs))
        for recs in already_assigned_array
    ]
    return np.array(padded)

def vectorized_get_kgrec_recoms(already_assigned_array, k, sedkgraph=sedkgraph, topics_numeric=topics_numeric):
    """Run ``get_kgrec_recoms`` for every sample.

    Parameters
    ----------
    already_assigned_array : numpy.ndarray or list of lists
        Per-sample ``(topic, score)`` pairs, shape
        ``(n_samples, n_assigned, 2)``. Anything that is not already an
        ndarray is padded and stacked with ``prep_already_assigned`` first.
    k : int
        Number of graph-based recommendations per sample.
    sedkgraph, topics_numeric : pandas.DataFrame
        Forwarded to ``get_kgrec_recoms``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, k, 2)`` holding ``(topic, score)`` rows as
        strings.
    """
    if not isinstance(already_assigned_array, np.ndarray):
        already_assigned_array = prep_already_assigned(already_assigned_array)

    # Collect all samples first and convert once: np.apply_along_axis sizes its
    # output from the *first* sample's result, which silently truncates longer
    # topic names in later samples. The explicit loop also lets the sedkgraph
    # and topics_numeric arguments actually reach get_kgrec_recoms.
    per_sample = [
        get_kgrec_recoms(sample, k, sedkgraph, topics_numeric)
        for sample in already_assigned_array
    ]
    if not per_sample:
        return np.empty((0, k, 2), dtype=str)
    return np.array(per_sample)

# Results from KGRec (Top-k)

In [10]:
# Extend every sample's LR recommendations with k_kgrec graph-based ones.
kgrec_recoms = vectorized_get_kgrec_recoms(already_assigned_array=lr_recoms, k=k_kgrec)

# Report on Failures

In [11]:
failed_k = []  # samples that received at least one placeholder recommendation
failed_0 = []  # samples whose recommendations are placeholders only

for sample_idx, sample_recoms in enumerate(kgrec_recoms):
    distinct_topics = {pair[0] for pair in sample_recoms}
    if 'no-more-recoms' not in distinct_topics:
        continue
    failed_k.append(sample_idx)
    # Only the placeholder is present, so nothing real was recommended.
    if len(distinct_topics) == 1:
        failed_0.append(sample_idx)

print(f'\nKGRec failed to make {k_kgrec} recommendations for {np.round(len(failed_k)*100/test_samples_count, 2)}% of the {test_samples_count} test repositoried.')
print(f'\nKGRec made NO recommendations for {np.round(len(failed_0)*100/test_samples_count, 2)}% of the {test_samples_count} test repositoried.')


KGRec failed to make 2 recommendations for 0.02% of the 29050 test repositoried.

KGRec made NO recommendations for 0.0% of the 29050 test repositoried.


# Save Results

In [12]:
# Create the output directory if it does not exist yet. ``exist_ok=True``
# tolerates re-runs, while real failures (e.g. missing permissions) are no
# longer hidden by a bare ``except`` - which previously also swallowed the
# NameError caused by ``os`` never being imported.
os.makedirs('./results/', exist_ok=True)

# ML-based (LR) recommendations.
with open(f'./results/lr_kgrec_top{k_saved_files}_test_results_ml_based.pkl','wb') as f:
    pickle.dump(lr_recoms, f)
# Graph-based (KGRec) recommendations.
with open(f'./results/lr_kgrec_top{k_saved_files}_test_results_graph_based.pkl','wb') as f:
    pickle.dump(kgrec_recoms, f)