In [1]:
import time
import os
import pandas as pd
import numpy as np
from collections import Counter
from itertools import chain
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from keras.layers import Input, Dense
from keras.models import Model
from keras import backend as K
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
pd.options.mode.chained_assignment = None 

Using TensorFlow backend.


In [2]:
TRAINING_DIR = os.getcwd()
vectorfile = os.path.join(TRAINING_DIR, 'course_vecs.tsv')
infofile = os.path.join(TRAINING_DIR, 'course_info.tsv')
textcolumn = 'course_description'
# num_top_words = 10
# use_idf = True
# tf_bias = .5
# num_epochs = 5
# max_df = 0.0028

---

In [23]:
def get_vocab(dataframe, column):
    print("[INFO] Getting vocab...")

    dataframe[column] = dataframe[column].fillna('')
    
    # max_df_param = 0.0028  # 1.0 # 0.0036544883

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(1,1), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    unigrams = vectorizer.get_feature_names()
    print('[UNIGRAMS] Number of unigrams: %d' % (len(unigrams)))
    
    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(2,2), max_features=max(1, int(len(unigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    bigrams = vectorizer.get_feature_names()
    print('[BIGRAMS] Number of bigrams: %d' % (len(bigrams)))

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(3,3), max_features=max(1, int(len(bigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    trigrams = vectorizer.get_feature_names()
    print('[TRIGRAMS] Number of trigrams: %d' % (len(trigrams)))

    vocab = np.concatenate((unigrams, bigrams, trigrams))
    vocab_list = list(vocab)
    removed_numbers_list = [word for word in vocab_list if not any(char.isdigit() for char in word)]
    vocab = np.array(removed_numbers_list)
#     pd.DataFrame(vocab).to_csv(outputfile+'_vocab.tsv', sep = '\t', encoding='utf-8', index = False)
    return vocab

In [24]:
def to_bag_of_words(dataframe, column, vocab):
    """Input: raw dataframe, text column, and vocabulary.
    Returns a sparse matrix of the bag of words representation of the column."""
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab, use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column].values.astype('U'))
    if tf_bias == -999:
        return X
    return (X.multiply(1/X.count_nonzero())).power(-tf_bias)

In [25]:
def logistic_regression(X, Y):
    print('[INFO] Performing logistic regression...')

    inputs = Input(shape=(X.shape[1],))
#     print('input shape: ', X.shape[1])  # 300 = number of cols in the feature matrix?
#     print('vocab size: ', vocabsize) # 2400 = len(get_vocab(raw_frame, textcolumn)) = num words parsed from description corpus
#     x = Dense(30, activation='sigmoid')(inputs)
#     predictions = Dense(vocabsize, activation='softmax')(x)
    predictions = Dense(vocabsize, activation='softmax')(inputs)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    model.fit(X, Y, epochs=num_epochs)
    weights = model.layers[1].get_weights()[0]
    biases = model.layers[1].get_weights()[1]
    weights_frame = pd.DataFrame(weights)
    biases_frame = pd.DataFrame(biases)
    return(weights_frame, biases)

In [26]:
def clean_descrip_title(row):
    punc_remover = str.maketrans('', '', string.punctuation)
    lowered = row['descrip_title'].lower()
    lowered_removed_punc = lowered.translate(punc_remover)
    cleaned_set = set(lowered_removed_punc.split())
    return cleaned_set

def recall_keywords(row):
    return row['description_title_set'].intersection(row['keywords_set'])

In [97]:
def test(x=1):
    print(x)
test(5)

5


---

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
vec_frame = pd.read_csv(vectorfile, sep = '\t') # Vector space representation of each user, all numeric
raw_frame = pd.read_csv(rawfile, sep = '\t') # Course information

nonempty_indices = np.where(raw_frame[textcolumn].notnull() == True)[0]
filtered_vec_frame = vec_frame.iloc[nonempty_indices,:]
filtered_raw_frame = raw_frame.iloc[nonempty_indices,:]

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(filtered_vec_frame, filtered_raw_frame, test_size=0.2, random_state=42)

print(X_train.shape[0], X_test.shape[0])

In [None]:
vocab = get_vocab(Y_train, textcolumn) # get_vocab(raw_frame, textcolumn) 
vocab_frame = pd.DataFrame(vocab)
    
vocabsize = len(vocab)

# Convert the textcolumn of the raw dataframe into bag of words representation
Y_train_BOW = to_bag_of_words(Y_train, textcolumn, vocab)
Y_train_BOW = Y_train_BOW.toarray()
Y_train_BOW

In [None]:
(weights_frame, biases) = logistic_regression(X_train.iloc[:,1:], Y_train_BOW)

In [None]:
softmax_frame = X_test.iloc[:,1:].dot(weights_frame.values) + biases
print('[INFO] Sorting classification results...')
sorted_frame = np.argsort(softmax_frame,axis=1).iloc[:,-num_top_words:]

predicted_keyword_list = []
for i in range(num_top_words):
    new_col = vocab_frame.iloc[sorted_frame.iloc[:,i],0] # get the ith top vocab word for each entry
    predicted_keyword_list.extend(new_col.values)
    Y_test['predicted_word_' + str(num_top_words-i)] = new_col.values


In [None]:
Y_test.head(3)

In [None]:
from collections import Counter

keyword_counter = Counter(predicted_keyword_list)

keyword_counter.most_common(10)

In [None]:
num_possible_keywords = Y_test.shape[0] * num_top_words
num_predicted_keywords = len(keyword_counter.keys())

In [None]:
assert sum(keyword_counter.values()) == Y_test.shape[0] * num_top_words,\
'Total number of predicted keywords should equal number of courses * number of predicted keywords per course.'

In [None]:
unif_keyword_vector = np.repeat(num_possible_keywords / num_predicted_keywords, num_predicted_keywords)
unif_keyword_vector

In [None]:
predicted_keyword_vector = np.array(list(keyword_counter.values()))
predicted_keyword_vector

In [None]:
assert unif_keyword_vector.shape == predicted_keyword_vector.shape

In [None]:
from scipy.spatial.distance import cosine

def cosine_similarity(x, y):
    return 1 - cosine(x,y)

cosine_similarity(predicted_keyword_vector, unif_keyword_vector)

In [None]:
cosine_similarity([1,-1], [1,1])

---

## Grid Search

In [3]:
def predict(course_vecs, course_descipts, trained_weights, trained_biases, num_words_per_course):
    """
    lalalal
    
    """
    df_with_keywords = course_descipts.copy()
    softmax_frame = course_vecs.iloc[:,1:].dot(trained_weights.values) + trained_biases # make predictions

    # From the softmax predictions, save the top 10 predicted words for each data point
    print('[INFO] Sorting classification results...')
    sorted_frame = np.argsort(softmax_frame,axis=1).iloc[:,-num_words_per_course:]

    print('[INFO] Predicting top k inferred keywords for each course...')
    for i in range(num_words_per_course):
        new_col = vocab_frame.iloc[sorted_frame.iloc[:,i],0] # get the ith top vocab word for each entry
        df_with_keywords['predicted_word_' + str(num_words_per_course-i)] = new_col.values
        
    return df_with_keywords

def calculate_metric(df_with_keywords, metric):
    """
    metrics: {r: recall, p: precision}
    """
    def clean_descrip_title(row):
        punc_remover = str.maketrans('', '', string.punctuation)
        lowered = row['descrip_title'].lower()
        lowered_removed_punc = lowered.translate(punc_remover)
        cleaned_set = set(lowered_removed_punc.split())
        return cleaned_set

    def recall_keywords(row):
        return row['description_title_set'].intersection(row['course_keywords_set'])
    
    prediction_df = df_with_keywords.copy()
    only_predicted_keywords_df = prediction_df[prediction_df.columns.difference(['course_name', 'course_title', 'course_description', 'tf_bias', 'course_alternative_names'])]
    num_keywords_predicted = only_predicted_keywords_df.shape[1]
    prediction_df['course_keywords'] = only_predicted_keywords_df.iloc[:,:].apply(lambda x: ', '.join(x), axis=1)
    prediction_df = prediction_df[['course_name', 'course_title', 'course_description', 'course_keywords', 'course_alternative_names']]
    prediction_df['course_keywords'] = prediction_df['course_keywords'].apply(lambda keywords: ', '.join(sorted(set([word.strip() for word in keywords.split(',')]))))
    prediction_df['course_keywords_set'] = prediction_df['course_keywords'].apply(lambda keywords: (set([word.strip() for word in keywords.split(',')])))
    prediction_df['descrip_title'] = prediction_df['course_title'] + ' ' + prediction_df['course_description']
    prediction_df['description_title_set'] = prediction_df.apply(clean_descrip_title, axis = 1)
    prediction_df['shared_words'] = prediction_df.apply(recall_keywords, axis = 1)
    
    if metric == 'r':
        print('[INFO] Calculating Recall...')
        assert num_keywords_predicted == max_descript_len, 'Number of keywords predicted should equal longest description length'
        prediction_df['recall'] = prediction_df['shared_words'].apply(lambda words: len(list(words)) / max_descript_len)
        average_recall = np.mean(prediction_df['recall'])
        return average_recall
    if metric == 'p':
        print('[INFO] Calculating Precision...')
        assert num_keywords_predicted == num_top_words, 'Number of keywords predicted should equal number of predicted words per course'
        prediction_df['precision'] = prediction_df['shared_words'].apply(lambda words: len(list(words)) / num_top_words)
        average_precision = np.mean(prediction_df['precision'])
        return average_precision
    if metric == 'c':
        print('[INFO] Calculating Cosine Similarity Between Keyword Distributions...')
        predicted_keyword_list = only_predicted_keywords_df.values.tolist()
        predicted_keyword_list = list(chain.from_iterable(predicted_keyword_list))
        keyword_counter = Counter(predicted_keyword_list)
        
        num_possible_keywords = df_with_keywords.shape[0] * num_top_words
        num_predicted_keywords = len(keyword_counter.keys())
        assert sum(keyword_counter.values()) == split_Y_valid.shape[0] * num_top_words,\
        'Total number of predicted keywords should equal number of courses * number of predicted keywords per course.'
        unif_keyword_vector = np.repeat(num_possible_keywords / num_predicted_keywords, num_predicted_keywords)
        predicted_keyword_vector = np.array(list(keyword_counter.values()))
        assert unif_keyword_vector.shape == predicted_keyword_vector.shape,\
        'Uniform keyword frequency vector should have same dimension as predicted keywords frequency vector.'
    
        cos_sim = cosine_similarity(predicted_keyword_vector, unif_keyword_vector)
        return cos_sim

def cosine_similarity(x, y):
    return 1 - cosine(x,y)


In [None]:
def get_vocab(dataframe, column, max_df=0.0028, use_idf=True):
    print("[INFO] Getting vocab...")

    dataframe[column] = dataframe[column].fillna('')
    
    # max_df_param = 0.0028  # 1.0 # 0.0036544883

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(1,1), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    unigrams = vectorizer.get_feature_names()
    print('[INFO] Number of unigrams: %d' % (len(unigrams)))
    
    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(2,2), max_features=max(1, int(len(unigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    bigrams = vectorizer.get_feature_names()
    print('[INFO] Number of bigrams: %d' % (len(bigrams)))

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(3,3), max_features=max(1, int(len(bigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    trigrams = vectorizer.get_feature_names()
    print('[INFO] Number of trigrams: %d' % (len(trigrams)))

    vocab = np.concatenate((unigrams, bigrams, trigrams))
    vocab_list = list(vocab)
    removed_numbers_list = [word for word in vocab_list if not any(char.isdigit() for char in word)]
    vocab = np.array(removed_numbers_list)
#     pd.DataFrame(vocab).to_csv(outputfile+'_vocab.tsv', sep = '\t', encoding='utf-8', index = False)
    return vocab

In [5]:
def to_bag_of_words(dataframe, column, vocab, tf_bias=.5, use_idf=True):
    """Input: raw dataframe, text column, and vocabulary.
    Returns a sparse matrix of the bag of words representation of the column."""
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab, use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column].values.astype('U'))
    if tf_bias == -999:
        return X
    return (X.multiply(1/X.count_nonzero())).power(-tf_bias)

In [6]:
def logistic_regression(X, Y, num_epochs=1):
    print('[INFO] Performing logistic regression...')

    inputs = Input(shape=(X.shape[1],))
#     print('input shape: ', X.shape[1])  # 300 = number of cols in the feature matrix?
#     print('vocab size: ', vocabsize) # 2400 = len(get_vocab(raw_frame, textcolumn)) = num words parsed from description corpus
#     x = Dense(30, activation='sigmoid')(inputs)
#     predictions = Dense(vocabsize, activation='softmax')(x)
    predictions = Dense(vocabsize, activation='softmax')(inputs)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    model.fit(X, Y, epochs=num_epochs)
    weights = model.layers[1].get_weights()[0]
    biases = model.layers[1].get_weights()[1]
    weights_frame = pd.DataFrame(weights)
    biases_frame = pd.DataFrame(biases)
    return(weights_frame, biases)

In [7]:
vec_frame = pd.read_csv(vectorfile, sep = '\t') # Vector space representation of each user, all numeric
info_frame = pd.read_csv(infofile, sep = '\t') # Course information

nonempty_indices = np.where(info_frame[textcolumn].notnull())[0]
filtered_vec_df = vec_frame.iloc[nonempty_indices,:].reset_index(drop = True)
filtered_descript_df = info_frame.iloc[nonempty_indices,:].reset_index(drop = True)
max_descript_len = max(filtered_descript_df.course_description.str.split().str.len())
num_top_words = 10

hyperparams_cols = ['use_idf', 'max_df','tf-bias', 'num_epochs', 'recall', 'precision', 'distribution_diff']
grid_search_df = pd.DataFrame(columns=hyperparams_cols)

In [11]:
np.arange(0.002, .005, .001)

param_grid = {'use_idf': [True],
              'max_df': np.arange(0.002, .005, .001), # np.arange(0, .0055, .0005)
              'tf_bias': np.arange(.5, 1.5, .5), 
              'num_epochs': [5]} 

grid = ParameterGrid(param_grid)

for params in grid:
    print("[HYPERPARAMS] use_idf: %r, max_df: %f, tf_bias: %f, num_epochs: %d" % 
          (params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs']))

[HYPERPARAMS] use_idf: True, max_df: 0.002000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.002000, tf_bias: 1.000000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.003000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.003000, tf_bias: 1.000000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.004000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.004000, tf_bias: 1.000000, num_epochs: 5


In [None]:
param_grid = {'use_idf': [True],
              'max_df': np.arange(0.002, .005, .001), # np.arange(0, .0055, .0005)
              'tf_bias': np.arange(.5, 1.5, .5), 
              'num_epochs': [5]} 

grid = ParameterGrid(param_grid)

recall_validation_scores = []
precision_validation_scores = []
distribution_validation_scores = []

for params in grid:
    print("****** [INFO] Evaluating cross-validated model with use_idf: %r, max_df: %f, tf_bias: %f, num_epochs: %d ******" % 
          (params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs']))

    fold_num = 1
    kf = KFold(n_splits=5, random_state=42) # DO NOT FIX RANDOM STATE WHEN RUNNING THE ACTUAL EXPERIMENT - NVM, should be fixed for reproducibility
    for train_idx, valid_idx in kf.split(filtered_vec_df):
        print('\t [INFO] Fold %d' % (fold_num))
        # X = vectors, Y = descriptions
        split_X_train, split_X_valid = filtered_vec_df.iloc[train_idx], filtered_vec_df.iloc[valid_idx]
        split_Y_train, split_Y_valid = filtered_descript_df.iloc[train_idx], filtered_descript_df.iloc[valid_idx]

        vocab = get_vocab(split_Y_train, textcolumn, max_df=params['max_df'], use_idf=params['use_idf']) 
        vocab_frame = pd.DataFrame(vocab)
        vocabsize = len(vocab)

        # Convert the textcolumn of the raw dataframe into bag of words representation
        split_Y_train_BOW = to_bag_of_words(split_Y_train, textcolumn, vocab, tf_bias=params['tf_bias'], use_idf=params['use_idf'])
        split_Y_train_BOW = split_Y_train_BOW.toarray()

        (weights_frame, biases) = logistic_regression(split_X_train.iloc[:,1:], split_Y_train_BOW, num_epochs=params['num_epochs'])

        print('[INFO] Predicting on validation set for recall...')
        df_with_keywords = predict(split_X_valid, split_Y_valid, weights_frame, biases, max_descript_len)
        fold_i_average_recall = calculate_metric(df_with_keywords, 'r')
        recall_validation_scores.append(fold_i_average_recall)
        print('[INFO] Fold %d recall: %f.' % (fold_num, fold_i_average_recall))

        print('[INFO] Predicting on validation set for precision...')
        df_with_keywords = predict(split_X_valid, split_Y_valid, weights_frame, biases, num_top_words)
        fold_i_average_precision = calculate_metric(df_with_keywords, 'p')
        precision_validation_scores.append(fold_i_average_precision)
        print('[INFO] Fold %d precision: %f.' % (fold_num, fold_i_average_precision))

        fold_i_distribution_diff = calculate_metric(df_with_keywords, 'c')
        distribution_validation_scores.append(fold_i_distribution_diff)
        print('[INFO] Fold %d cosine similarity: %f.' % (fold_num, fold_i_distribution_diff))

        fold_num += 1

    recall_i = np.mean(recall_validation_scores)
    precision_i = np.mean(precision_validation_scores)
    distribution_diff_i = np.mean(distribution_validation_scores)

    model_i_params = [params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs'], 
                      recall_i, precision_i, distribution_diff_i]
    model_i_params = pd.DataFrame([model_i_params], columns=hyperparams_cols)
    grid_search_df = grid_search_df.append(model_i_params, sort = False)
    print(grid_search_df)
    # print('recall scores:', recall_validation_scores)
    # print('precision scores:', precision_validation_scores)
    # print('distribution scores:', distribution_validation_scores)

[HYPERPARAMS] use_idf: True, max_df: 0.002000, tf_bias: 0.500000, num_epochs: 5
[CV] Running cross validation...
[INFO] ****** Fold 1 ******
[INFO] Getting vocab...
[INFO] number unigrams: 11379
[INFO] number bigrams: 1137
[INFO] number trigrams: 1137
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 1 recall: 0.004462.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 1 precision: 0.020528.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Fold 1 cosine similarity: 0.464311.
[INFO] ****** Fold 2 ******
[INFO] Getting vocab...
[INFO] number unigrams: 11214
[INFO] number bigrams: 1121

Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 2 recall: 0.004611.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 2 precision: 0.019932.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Fold 2 cosine similarity: 0.432072.
[INFO] ****** Fold 3 ******
[INFO] Getting vocab...
[INFO] number unigrams: 11104
[INFO] number bigrams: 1110
[INFO] number trigrams: 1110
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold

[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 3 recall: 0.006023.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 3 precision: 0.029153.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Fold 3 cosine similarity: 0.385184.
[INFO] ****** Fold 4 ******
[INFO] Getting vocab...
[INFO] number unigrams: 11564
[INFO] number bigrams: 1156
[INFO] number trigrams: 1156
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 4 recall: 0.005630.
[INFO] Pr

[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 4 precision: 0.023729.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Fold 4 cosine similarity: 0.357743.
[INFO] ****** Fold 5 ******
[INFO] Getting vocab...
[INFO] number unigrams: 11666
[INFO] number bigrams: 1166
[INFO] number trigrams: 1166
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 5 recall: 0.004075.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
