---

## Grid Search

In [10]:
import time
import os
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from itertools import chain
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from keras.layers import Input, Dense
from keras.models import Model
from keras import backend as K
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
pd.options.mode.chained_assignment = None 

TRAINING_DIR = os.getcwd()
DATA_DIR = './data'
vectorfile = os.path.join(DATA_DIR, 'course_vecs.tsv')
infofile = os.path.join(DATA_DIR, 'course_info.tsv')
textcolumn = 'course_description'

In [11]:
def predict(course_vecs, course_descipts, trained_weights, trained_biases, num_words_per_course):
    """
    lalalal
    
    """
    df_with_keywords = course_descipts.copy()
    softmax_frame = course_vecs.iloc[:,1:].dot(trained_weights.values) + trained_biases # make predictions

    # From the softmax predictions, save the top 10 predicted words for each data point
    print('[INFO] Sorting classification results...')
    sorted_frame = np.argsort(softmax_frame,axis=1).iloc[:,-num_words_per_course:]

    print('[INFO] Predicting top k inferred keywords for each course...')
    for i in range(num_words_per_course):
        new_col = vocab_frame.iloc[sorted_frame.iloc[:,i],0] # get the ith top vocab word for each entry
        df_with_keywords['predicted_word_' + str(num_words_per_course-i)] = new_col.values
        
    return df_with_keywords

def calculate_metric(df_with_keywords, metric):
    """
    metrics: {r: recall, p: precision}
    """
    def clean_descrip_title(row):
        punc_remover = str.maketrans('', '', string.punctuation)
        lowered = row['descrip_title'].lower()
        lowered_removed_punc = lowered.translate(punc_remover)
        cleaned_set = set(lowered_removed_punc.split())
        return cleaned_set

    def recall_keywords(row):
        return row['description_title_set'].intersection(row['course_keywords_set'])
    
    prediction_df = df_with_keywords.copy()
    only_predicted_keywords_df = prediction_df[prediction_df.columns.difference(['course_name', 'course_title', 'course_description', 'course_subject', 'course_alternative_names'])]
    num_keywords_predicted_per_course = only_predicted_keywords_df.shape[1]
    prediction_df['course_keywords'] = only_predicted_keywords_df.iloc[:,:].apply(lambda x: ', '.join(x), axis=1)
    prediction_df = prediction_df[['course_name', 'course_title', 'course_description', 'course_keywords', 'course_alternative_names']]
    prediction_df['course_keywords'] = prediction_df['course_keywords'].apply(lambda keywords: ', '.join(sorted(set([word.strip() for word in keywords.split(',')]))))
    prediction_df['course_keywords_set'] = prediction_df['course_keywords'].apply(lambda keywords: (set([word.strip() for word in keywords.split(',')])))
    prediction_df['descrip_title'] = prediction_df['course_title'] + ' ' + prediction_df['course_description']
    prediction_df['description_title_set'] = prediction_df.apply(clean_descrip_title, axis = 1)
    prediction_df['shared_words'] = prediction_df.apply(recall_keywords, axis = 1)
    
    if metric == 'r':
        print('[INFO] Calculating Recall...')
        assert num_keywords_predicted_per_course == max_descript_len, 'Number of keywords predicted should equal longest description length'
        prediction_df['recall'] = prediction_df['shared_words'].apply(lambda words: len(list(words)) / max_descript_len)
        average_recall = np.mean(prediction_df['recall'])
        return average_recall
    if metric == 'p':
        print('[INFO] Calculating Precision...')
        assert num_keywords_predicted_per_course == num_top_words, 'Number of keywords predicted should equal number of predicted words per course'
        prediction_df['precision'] = prediction_df['shared_words'].apply(lambda words: len(list(words)) / num_top_words)
        average_precision = np.mean(prediction_df['precision'])
        return average_precision
    if metric == 'c':
        print('[INFO] Calculating Cosine Similarity Between Keyword Distributions...')
        predicted_keyword_list = only_predicted_keywords_df.values.tolist()
        predicted_keyword_list = list(chain.from_iterable(predicted_keyword_list))
        keyword_counter = Counter(predicted_keyword_list)
        print('[INFO] Most common keywords by count: ', keyword_counter.most_common(10))
        
        num_possible_keywords = df_with_keywords.shape[0] * num_top_words
        num_predicted_keywords = len(keyword_counter.keys())
        assert sum(keyword_counter.values()) == split_Y_valid.shape[0] * num_top_words,\
        'Total number of predicted keywords should equal number of courses * number of predicted keywords per course.'
        unif_keyword_vector = np.repeat(num_possible_keywords / num_predicted_keywords, num_predicted_keywords)
        predicted_keyword_vector = np.array(list(keyword_counter.values()))
        assert unif_keyword_vector.shape == predicted_keyword_vector.shape,\
        'Uniform keyword frequency vector should have same dimension as predicted keywords frequency vector.'
    
        cos_sim = cosine(predicted_keyword_vector, unif_keyword_vector)
        return cos_sim
    if metric == 'df':
        print('[INFO] Calculating Document Frequency of Predicted Keywords across Course Subjects...')
        document_df_cols = df_with_keywords.columns.difference(['course_title', 'course_description', 'course_name', 'course_alternative_names'])
        document_df = df_with_keywords.loc[:,document_df_cols]
        document_df.set_index('course_subject', inplace=True)
        
        document_dict = defaultdict(list)
        terms = set()
        for index, row in document_df.iterrows():
            document_dict[index].extend(row.values)
            terms.update(row.values)

        doc_freq_dict = defaultdict()
        num_docs = len(document_dict.keys())
        for term in terms:
            doc_freq_i = 0
            for key in document_dict.keys():
                if term in document_dict.get(key):
                    doc_freq_i += 1
            doc_freq_dict[term] = doc_freq_i / (num_docs)
            
        print('[INFO] Most common keywords by document frequencies: ', Counter(doc_freq_dict).most_common(10)) 
        average_document_frequency_score = np.mean(list(doc_freq_dict.values()))
        return average_document_frequency_score
        
def cosine_similarity(x, y):
    return 1 - cosine(x,y)


In [12]:
def get_vocab(dataframe, column, max_df=0.0028, use_idf=True):
    print("[INFO] Getting vocab...")

    dataframe[column] = dataframe[column].fillna('')
    
    # max_df_param = 0.0028  # 1.0 # 0.0036544883

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(1,1), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    unigrams = vectorizer.get_feature_names()
    print('[INFO] Number of unigrams: %d' % (len(unigrams)))
    
    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(2,2), max_features=max(1, int(len(unigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    bigrams = vectorizer.get_feature_names()
    print('[INFO] Number of bigrams: %d' % (len(bigrams)))

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(3,3), max_features=max(1, int(len(bigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    trigrams = vectorizer.get_feature_names()
    print('[INFO] Number of trigrams: %d' % (len(trigrams)))

    vocab = np.concatenate((unigrams, bigrams, trigrams))
    vocab_list = list(vocab)
    removed_numbers_list = [word for word in vocab_list if not any(char.isdigit() for char in word)]
    vocab = np.array(removed_numbers_list)
#     pd.DataFrame(vocab).to_csv(outputfile+'_vocab.tsv', sep = '\t', encoding='utf-8', index = False)
    return vocab

In [13]:
def to_bag_of_words(dataframe, column, vocab, tf_bias=.5, use_idf=True):
    """Input: raw dataframe, text column, and vocabulary.
    Returns a sparse matrix of the bag of words representation of the column."""
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab, use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column].values.astype('U'))
    if tf_bias == -999:
        return X
    return (X.multiply(1/X.count_nonzero())).power(-tf_bias)

In [14]:
def logistic_regression(X, Y, num_epochs=1):
    print('[INFO] Performing logistic regression...')

    inputs = Input(shape=(X.shape[1],))
#     print('input shape: ', X.shape[1])  # 300 = number of cols in the feature matrix?
#     print('vocab size: ', vocabsize) # 2400 = len(get_vocab(raw_frame, textcolumn)) = num words parsed from description corpus
#     x = Dense(30, activation='sigmoid')(inputs)
#     predictions = Dense(vocabsize, activation='softmax')(x)
    predictions = Dense(vocabsize, activation='softmax')(inputs)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    model.fit(X, Y, epochs=num_epochs)
    weights = model.layers[1].get_weights()[0]
    biases = model.layers[1].get_weights()[1]
    weights_frame = pd.DataFrame(weights)
    biases_frame = pd.DataFrame(biases)
    return(weights_frame, biases)

In [15]:
vec_frame = pd.read_csv(vectorfile, sep = '\t') # Vector space representation of each user, all numeric
info_frame = pd.read_csv(infofile, sep = '\t') # Course information

nonempty_indices = np.where(info_frame[textcolumn].notnull())[0]
filtered_vec_df = vec_frame.iloc[nonempty_indices,:].reset_index(drop = True)
filtered_descript_df = info_frame.iloc[nonempty_indices,:].reset_index(drop = True)
max_descript_len = max(filtered_descript_df.course_description.str.split().str.len())
num_top_words = 10

hyperparams_cols = ['use_idf', 'max_df','tf-bias', 'num_epochs', 'recall@max_len', 'precision@10', 'distribution_diff', 
                       'document_frequency']
grid_search_df = pd.DataFrame(columns=hyperparams_cols)

In [16]:
np.arange(0.002, .005, .001)

param_grid = {'use_idf': [True],
              'max_df': np.arange(0.002, .005, .001), # np.arange(0, .0055, .0005)
              'tf_bias': np.arange(.5, 1.5, .5), # np.arange(.5, 2, .5)
              'num_epochs': [5]} 

grid = ParameterGrid(param_grid)

for params in grid:
    print("[HYPERPARAMS] use_idf: %r, max_df: %f, tf_bias: %f, num_epochs: %d" % 
          (params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs']))

[HYPERPARAMS] use_idf: True, max_df: 0.002000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.002000, tf_bias: 1.000000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.003000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.003000, tf_bias: 1.000000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.004000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.004000, tf_bias: 1.000000, num_epochs: 5


In [17]:
# param_grid = {'use_idf': [True],
#               'max_df': np.arange(0.002, .005, .001), # np.arange(0, .0055, .0005)
#               'tf_bias': np.arange(.5, 1.5, .5), 
#               'num_epochs': [5]} 

# grid = ParameterGrid(param_grid)

recall_validation_scores = []
precision_validation_scores = []
distribution_validation_scores = []
document_frequency_validation_scores = []

for params in grid:
    print("***[INFO] Evaluating cross-validated model with hyperparams use_idf: %r, max_df: %f, tf_bias: %f, num_epochs: %d***" % 
          (params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs']))

    fold_num = 1
    kf = KFold(n_splits=5, random_state=42) # DO NOT FIX RANDOM STATE WHEN RUNNING THE ACTUAL EXPERIMENT - NVM, should be fixed for reproducibility
    for train_idx, valid_idx in kf.split(filtered_vec_df):
        print('======== [INFO] Fold %d' % (fold_num))
        # X = vectors, Y = descriptions
        split_X_train, split_X_valid = filtered_vec_df.iloc[train_idx], filtered_vec_df.iloc[valid_idx]
        split_Y_train, split_Y_valid = filtered_descript_df.iloc[train_idx], filtered_descript_df.iloc[valid_idx]

        vocab = get_vocab(split_Y_train, textcolumn, max_df=params['max_df'], use_idf=params['use_idf']) 
        vocab_frame = pd.DataFrame(vocab)
        vocabsize = len(vocab)

        # Convert the textcolumn of the raw dataframe into bag of words representation
        split_Y_train_BOW = to_bag_of_words(split_Y_train, textcolumn, vocab, tf_bias=params['tf_bias'], use_idf=params['use_idf'])
        split_Y_train_BOW = split_Y_train_BOW.toarray()

        (weights_frame, biases) = logistic_regression(split_X_train.iloc[:,1:], split_Y_train_BOW, num_epochs=params['num_epochs'])

        print('[INFO] Predicting on validation set for recall...')
        df_with_keywords = predict(split_X_valid, split_Y_valid, weights_frame, biases, max_descript_len)
        fold_i_average_recall = calculate_metric(df_with_keywords, 'r')
        recall_validation_scores.append(fold_i_average_recall)
        print('[INFO] Fold %d recall: %f.' % (fold_num, fold_i_average_recall))

        print('[INFO] Predicting on validation set for precision...')
        df_with_keywords = predict(split_X_valid, split_Y_valid, weights_frame, biases, num_top_words)
        fold_i_average_precision = calculate_metric(df_with_keywords, 'p')
        precision_validation_scores.append(fold_i_average_precision)
        print('[INFO] Fold %d precision: %f.' % (fold_num, fold_i_average_precision))

        fold_i_distribution_diff = calculate_metric(df_with_keywords, 'c')
        distribution_validation_scores.append(fold_i_distribution_diff)
        print('[INFO] Fold %d cosine similarity: %f.' % (fold_num, fold_i_distribution_diff))
        
        fold_i_document_frequency = calculate_metric(df_with_keywords, 'df')
        document_frequency_validation_scores.append(fold_i_document_frequency)
        print('[INFO] Fold %d document frequency: %f.' % (fold_num, fold_i_document_frequency))

        fold_num += 1

    recall_i = np.mean(recall_validation_scores)
    precision_i = np.mean(precision_validation_scores)
    distribution_diff_i = np.mean(distribution_validation_scores)
    document_frequency_i = np.mean(document_frequency_validation_scores)

    model_i_params = [params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs'], 
                      recall_i, precision_i, distribution_diff_i, document_frequency_i]
    model_i_params = pd.DataFrame([model_i_params], columns=hyperparams_cols)
    grid_search_df = grid_search_df.append(model_i_params, sort = False)
    print(grid_search_df)
    # print('recall scores:', recall_validation_scores)
    # print('precision scores:', precision_validation_scores)
    # print('distribution scores:', distribution_validation_scores)
    
grid_search_df.to_csv('./scores/score_file.csv')

***[INFO] Evaluating cross-validated model with hyperparams use_idf: True, max_df: 0.002000, tf_bias: 0.500000, num_epochs: 5***
[INFO] Getting vocab...
[INFO] Number of unigrams: 11382
[INFO] Number of bigrams: 1138
[INFO] Number of trigrams: 113
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 1 recall: 0.005869.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 1 precision: 0.035434.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('capitalist', 196), ('san', 183), ('francisco', 166), ('tables', 137), ('desire', 124), ('called', 104), ('resident

[INFO] Number of bigrams: 1109
[INFO] Number of trigrams: 110
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 5 recall: 0.004881.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 5 precision: 0.027525.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('colonization', 114), ('monuments', 103), ('iran', 91), ('extreme', 87), ('presume', 80), ('ce', 80), ('performances', 76), ('capitalist', 67), ('daily', 67), ('trading', 66)]
[INFO] Fold 5 cosine similarity: 0.416467.
[INFO] Calculating Document Frequency of Predicted Keywords across Course Subjects

[INFO] Number of unigrams: 10995
[INFO] Number of bigrams: 1099
[INFO] Number of trigrams: 109
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 4 recall: 0.005499.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 4 precision: 0.030305.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('just', 130), ('mexican', 112), ('republic', 111), ('birth', 105), ('constitute', 104), ('marx', 91), ('racism', 89), ('notion', 89), ('storytelling', 88), ('words', 84)]
[INFO] Fold 4 cosine similarity: 0.409575.
[INFO] Calculating Document Frequency of Predicted Key

[INFO] Number of unigrams: 11669
[INFO] Number of bigrams: 1166
[INFO] Number of trigrams: 116
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 3 recall: 0.008001.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 3 precision: 0.044136.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('continuing', 132), ('attempt', 113), ('pay', 104), ('russia', 104), ('semi', 100), ('twentieth', 96), ('created', 93), ('diaspora', 91), ('conquest', 91), ('moral', 83)]
[INFO] Fold 3 cosine similarity: 0.399379.
[INFO] Calculating Document Frequency of Predicted Key

[INFO] Number of unigrams: 11785
[INFO] Number of bigrams: 1178
[INFO] Number of trigrams: 117
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 2 recall: 0.007887.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 2 precision: 0.039363.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('pollution', 158), ('democratic', 136), ('meanings', 135), ('called', 130), ('boundaries', 125), ('liberal', 123), ('conventions', 119), ('right', 116), ('cold', 112), ('constitute', 108)]
[INFO] Fold 2 cosine similarity: 0.433894.
[INFO] Calculating Document Frequenc

[INFO] Number of unigrams: 12254
[INFO] Number of bigrams: 1225
[INFO] Number of trigrams: 122
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 1 recall: 0.010193.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 1 precision: 0.052913.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('reproduction', 166), ('citizenship', 125), ('built', 113), ('sense', 112), ('expository', 108), ('big', 101), ('country', 100), ('achieve', 95), ('details', 91), ('notions', 88)]
[INFO] Fold 1 cosine similarity: 0.425099.
[INFO] Calculating Document Frequency of Pred

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 5 recall: 0.008469.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 5 precision: 0.045831.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('looking', 146), ('old', 105), ('grading', 95), ('forum', 89), ('doctoral', 79), ('democratic', 74), ('attempts', 73), ('causal', 71), ('nervous', 70), ('seeks', 67)]
[INFO] Fold 5 cosine similarity: 0.423999.
[INFO] Calculating Document Frequency of Predicted Keywords across Course Subjects...
[INFO] Most common keywords by document frequencies:  [('looking', 0.2981366459627329), ('speak', 0.2360248447204969),

[INFO] Number of unigrams: 11904
[INFO] Number of bigrams: 1190
[INFO] Number of trigrams: 119
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 4 recall: 0.010177.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 4 precision: 0.052339.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[INFO] Most common keywords by count:  [('genre', 237), ('death', 147), ('identities', 141), ('histories', 129), ('peoples', 120), ('old', 118), ('lens', 105), ('actors', 102), ('indigenous', 89), ('terrestrial', 87)]
[INFO] Fold 4 cosine similarity: 0.448404.
[INFO] Calculating Document Frequency of Predict

In [19]:
200 * .025

5.0

In [18]:
grid_search_df

Unnamed: 0,use_idf,max_df,tf-bias,num_epochs,recall@max_len,precision@10,distribution_diff,document_frequency
0,True,0.002,0.5,5,0.005584,0.032018,0.408953,0.025828
0,True,0.002,1.0,5,0.005569,0.031211,0.422874,0.026112
0,True,0.003,0.5,5,0.006319,0.034756,0.418333,0.026013
0,True,0.003,1.0,5,0.006646,0.035878,0.419269,0.026147
0,True,0.004,0.5,5,0.007339,0.039324,0.419226,0.026097
0,True,0.004,1.0,5,0.007752,0.041184,0.421222,0.026158
