In [133]:
import time
import os
import pandas as pd
import numpy as np
from collections import Counter
from itertools import chain
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from keras.layers import Input, Dense
from keras.models import Model
from keras import backend as K
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
pd.options.mode.chained_assignment = None 

TRAINING_DIR = os.getcwd()
vectorfile = os.path.join(TRAINING_DIR, 'course_vecs.tsv')
infofile = os.path.join(TRAINING_DIR, 'course_info.tsv')
textcolumn = 'course_description'

In [134]:
TRAINING_DIR = os.getcwd()
vectorfile = os.path.join(TRAINING_DIR, 'course_vecs.tsv')
infofile = os.path.join(TRAINING_DIR, 'course_info.tsv')
textcolumn = 'course_description'
num_top_words = 10
use_idf = True
tf_bias = .5
num_epochs = 5
max_df = 0.0028

---

In [135]:
def get_vocab(dataframe, column):
    print("[INFO] Getting vocab...")

    dataframe[column] = dataframe[column].fillna('')
    
    # max_df_param = 0.0028  # 1.0 # 0.0036544883

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(1,1), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    unigrams = vectorizer.get_feature_names()
    print('[UNIGRAMS] Number of unigrams: %d' % (len(unigrams)))
    
    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(2,2), max_features=max(1, int(len(unigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    bigrams = vectorizer.get_feature_names()
    print('[BIGRAMS] Number of bigrams: %d' % (len(bigrams)))

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(3,3), max_features=max(1, int(len(bigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    trigrams = vectorizer.get_feature_names()
    print('[TRIGRAMS] Number of trigrams: %d' % (len(trigrams)))

    vocab = np.concatenate((unigrams, bigrams, trigrams))
    vocab_list = list(vocab)
    removed_numbers_list = [word for word in vocab_list if not any(char.isdigit() for char in word)]
    vocab = np.array(removed_numbers_list)
#     pd.DataFrame(vocab).to_csv(outputfile+'_vocab.tsv', sep = '\t', encoding='utf-8', index = False)
    return vocab

In [136]:
def to_bag_of_words(dataframe, column, vocab):
    """Input: raw dataframe, text column, and vocabulary.
    Returns a sparse matrix of the bag of words representation of the column."""
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab, use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column].values.astype('U'))
    if tf_bias == -999:
        return X
    return (X.multiply(1/X.count_nonzero())).power(-tf_bias)

In [137]:
def logistic_regression(X, Y):
    print('[INFO] Performing logistic regression...')

    inputs = Input(shape=(X.shape[1],))
#     print('input shape: ', X.shape[1])  # 300 = number of cols in the feature matrix?
#     print('vocab size: ', vocabsize) # 2400 = len(get_vocab(raw_frame, textcolumn)) = num words parsed from description corpus
#     x = Dense(30, activation='sigmoid')(inputs)
#     predictions = Dense(vocabsize, activation='softmax')(x)
    predictions = Dense(vocabsize, activation='softmax')(inputs)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    model.fit(X, Y, epochs=num_epochs)
    weights = model.layers[1].get_weights()[0]
    biases = model.layers[1].get_weights()[1]
    weights_frame = pd.DataFrame(weights)
    biases_frame = pd.DataFrame(biases)
    return(weights_frame, biases)

In [138]:
def clean_descrip_title(row):
    punc_remover = str.maketrans('', '', string.punctuation)
    lowered = row['descrip_title'].lower()
    lowered_removed_punc = lowered.translate(punc_remover)
    cleaned_set = set(lowered_removed_punc.split())
    return cleaned_set

def recall_keywords(row):
    return row['description_title_set'].intersection(row['keywords_set'])

In [139]:
def test(x=1):
    print(x)
test(5)

5


---

## Dummy Experiment with Only 1 split

In [155]:
from sklearn.model_selection import train_test_split

vec_frame = pd.read_csv(vectorfile, sep = '\t') # Vector space representation of each user, all numeric
info_frame = pd.read_csv(infofile, sep = '\t') # Course information
info_frame.head()

Unnamed: 0,course_name,course_title,course_description,course_subject,course_alternative_names
0,Xart 98,Directed Group Study,This is a student-initiated course to be offer...,FPF-Art Practice,FPF-Art Practice 98 XART98
1,Xanthro 2ac,Introduction to Archaeology,Prehistory and cultural growth. Introduction t...,FPF-Anthropology,FPF-Anthropology 2AC XANTHRO2AC
2,Xstat 2,Introduction to Statistics,Population and variables. Standard measures of...,FPF-Statistics,FPF-Statistics 2 XSTAT2
3,Xmath 1b,Calculus,Continuation of 1A. Techniques of integration;...,FPF-Mathematics,FPF-Mathematics 1B XMATH1B
4,Xphilos 3,The Nature of Mind,Introduction to the philosophy of mind. Topics...,FPF-Philosophy,FPF-Philosophy 3 XPHILOS3


In [156]:
# info_frame['abbr_cid'] = info_frame.course_name.str.replace(' ', '_').str.upper()
# api_df = pd.read_csv('/home/matthew/Models-AskOski/shared/course_info.tsv', sep = '\t', ).drop(['Unnamed: 0', 'idx', 'updated_date'],axis=1)
# api_df.head(5) 
# any(api_df.course_subject.isnull())
# temp = pd.merge(info_frame, api_df, on='abbr_cid')[['course_name', 'course_title', 'course_description', 'course_alternative_names', 'course_subject']]


In [157]:
nonempty_indices = np.where(info_frame[textcolumn].notnull())[0]
filtered_vec_df = vec_frame.iloc[nonempty_indices,:].reset_index(drop = True)
filtered_descript_df = info_frame.iloc[nonempty_indices,:].reset_index(drop = True)
max_descript_len = max(filtered_descript_df.course_description.str.split().str.len())

In [158]:
X_train, X_test, Y_train, Y_test = train_test_split(filtered_vec_df, filtered_descript_df, test_size=0.2, random_state=42)

print(X_train.shape[0], X_test.shape[0])

5901 1476


In [160]:
vocab = get_vocab(Y_train, textcolumn) # get_vocab(raw_frame, textcolumn) 
vocab_frame = pd.DataFrame(vocab)
    
vocabsize = len(vocab)

# Convert the textcolumn of the raw dataframe into bag of words representation
Y_train_BOW = to_bag_of_words(Y_train, textcolumn, vocab)
Y_train_BOW = Y_train_BOW.toarray()
Y_train_BOW

[INFO] Getting vocab...
[UNIGRAMS] Number of unigrams: 11580
[BIGRAMS] Number of bigrams: 1158
[TRIGRAMS] Number of trigrams: 115


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [161]:
(weights_frame, biases) = logistic_regression(X_train.iloc[:,1:], Y_train_BOW)

[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [162]:
softmax_frame = X_test.iloc[:,1:].dot(weights_frame.values) + biases
print('[INFO] Sorting classification results...')
sorted_frame = np.argsort(softmax_frame,axis=1).iloc[:,-num_top_words:]

predicted_keyword_list = []
for i in range(num_top_words):
    new_col = vocab_frame.iloc[sorted_frame.iloc[:,i],0] # get the ith top vocab word for each entry
    predicted_keyword_list.extend(new_col.values)
    Y_test['predicted_word_' + str(num_top_words-i)] = new_col.values


[INFO] Sorting classification results...


In [196]:
Y_test.sample(5)

Unnamed: 0,course_name,course_title,course_description,course_subject,course_alternative_names,predicted_word_10,predicted_word_9,predicted_word_8,predicted_word_7,predicted_word_6,predicted_word_5,predicted_word_4,predicted_word_3,predicted_word_2,predicted_word_1
2288,Theater 52ac,Dance in American Cultures,Dance as a meaning-making expressive form. De...,Theater Dance & Perf Stds,Theater Dance & Perf Stds 52AC THEATER52AC,explorations,theatre,recording,formations,allowing,performing,performances,supplies,vocal,productions
6602,Slavic 138,Topics in Russian and Soviet Film,This course will examine the Russian contribut...,Slavic Languages & Lit,Slavic Languages & Lit 138 SLAVIC138,folklore,looking,aspect,soviet,beginner,avant,viewing,garde,russia,slavic
4483,Art 301,The Teaching of Art: Practice,Utilizing aspects of pedagogical and andragogi...,Art Practice,Art Practice 301 ART301,notion,communicating,centered,print,audience,avant,garde,performances,recording,talks
5321,Env sci 100,Introduction to the Methods of Environmental S...,Introduction to basic methods used in environm...,Environmental Sciences,Environmental Sciences 100 ENV SCI100,remote,fate,greatest,habitat,trip,insects,agricultural,wildlife,geologic,espm
5447,Nuc eng 221,Corrosion in Nuclear Power Systems,Structural metals in nuclear power plants; pro...,Nuclear Engineering,Nuclear Engineering 221 NUC ENG221,calculations,radioactive,fast,neutron,beam,fission,reactors,fuel,fusion,reactor


In [207]:
doc_freq_df_cols = Y_test.columns.difference(['course_title', 'course_description', 'course_name', 'course_alternative_names'])
doc_df = Y_test.loc[:,doc_freq_df_cols]
test = doc_df.loc[doc_df.course_subject.str.contains('History')]
doc_df.set_index('course_subject', inplace=True)
doc_df.head()

Unnamed: 0_level_0,predicted_word_1,predicted_word_10,predicted_word_2,predicted_word_3,predicted_word_4,predicted_word_5,predicted_word_6,predicted_word_7,predicted_word_8,predicted_word_9
course_subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Engineering,sponsoring,income,professionally,matlab,homework,layout,dependent,advisor,quickly,allocation
History,powers,rule,edu,russia,agricultural,rome,trace,evolved,looking,cold
Public Health,concern,survival,fertility,demographic,residents,demography,credential,influenced,multivariate,generalized
Evening & Weekend MBA,arbitrage,dependent,company,managed,workers,resulting,enabling,drug,practiced,diagnostic
History,did,socialism,powers,cold,rural,trace,formations,modernism,struggles,peace


In [208]:
from collections import defaultdict

In [209]:
test.set_index('course_subject', inplace=True)
test.head()

Unnamed: 0_level_0,predicted_word_1,predicted_word_10,predicted_word_2,predicted_word_3,predicted_word_4,predicted_word_5,predicted_word_6,predicted_word_7,predicted_word_8,predicted_word_9
course_subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
History,powers,rule,edu,russia,agricultural,rome,trace,evolved,looking,cold
History,did,socialism,powers,cold,rural,trace,formations,modernism,struggles,peace
History,iran,performing,france,did,crisis,west,evolved,supplies,enlightenment,republic
History of Art,economists,allocation,square,investigating,correlation,income,outcome,solubility,fair,optometric
History of Art,doe,france,listings,formations,audience,spain,desirable,ca,ideological,modernism


In [255]:
document_dict = defaultdict(list)
terms = set()
for index,row in doc_df.iterrows():
    document_dict[index].extend(row.values)
    terms.update(row.values)
# doc_freq_dict
len(terms)

1354

In [254]:
num_docs = len(document_dict.keys())
doc_freq_i = 0
for key in document_dict.keys():
    if 'performing' in document_dict.get(key):
        doc_freq_i += 1
#         print(doc_freq_i)
doc_freq_i / num_docs

0.6666666666666666

In [258]:
doc_freq_dict = defaultdict()
num_docs = len(document_dict.keys())
for term in terms:
    doc_freq_i = 0
#     print(term)
    for key in document_dict.keys():
        if term in document_dict.get(key):
            doc_freq_i += 1
    doc_freq_dict[term] = doc_freq_i / (num_docs + 1)


In [271]:
Counter(doc_freq_dict).most_common(10)
# sorted(doc_freq_dict, key=doc_freq_dict.get, reverse=True)[:10]

[('powers', 0.2822085889570552),
 ('formations', 0.27607361963190186),
 ('performing', 0.25766871165644173),
 ('agricultural', 0.24539877300613497),
 ('trace', 0.22085889570552147),
 ('informed', 0.2147239263803681),
 ('crisis', 0.20245398773006135),
 ('did', 0.17791411042944785),
 ('modernism', 0.17791411042944785),
 ('recording', 0.17791411042944785)]

In [195]:
from collections import Counter

keyword_counter = Counter(predicted_keyword_list)

keyword_counter.most_common(15)

[('trace', 150),
 ('powers', 148),
 ('formations', 133),
 ('agricultural', 111),
 ('performing', 108),
 ('struggles', 92),
 ('looking', 91),
 ('income', 90),
 ('authority', 86),
 ('crisis', 84),
 ('did', 80),
 ('chronological', 72),
 ('evolved', 71),
 ('modernism', 69),
 ('republic', 68)]

In [201]:
len(keyword_counter)

1354

In [18]:
dpt_file = infofile = os.path.join(TRAINING_DIR, 'academic_departments.tsv')
dpt_df = pd.read_csv(dpt_file, sep='\t')
dpt_df.head()

Unnamed: 0,ACADEMIC_DEPARTMENT_NAME,ACADEMIC_DIVISION_NAME,MAJOR_NAME,COLLEGE_NAME
0,African American Studies,L&S Social Sciences Division,Afr Amer Stds-Humanities,Clg of Letters & Science
1,African American Studies,L&S Social Sciences Division,Afr Amer Stds-Social Sci,Clg of Letters & Science
2,African American Studies,L&S Social Sciences Division,African American Studies,Clg of Letters & Science
3,Ag & Env Chem Grad Grp,Clg of Natural Resources,Ag & Environmental Chem,Clg of Natural Resources
4,Ag & Resource Econ & Pol,Clg of Natural Resources,Ag & Resource Economics,Clg of Natural Resources


In [20]:
dpt_df.loc[dpt_df.ACADEMIC_DEPARTMENT_NAME.str.contains('engineering', case=False)]

Unnamed: 0,ACADEMIC_DEPARTMENT_NAME,ACADEMIC_DIVISION_NAME,MAJOR_NAME,COLLEGE_NAME
15,Bioengineering,Clg of Engineering,Bioengineering,Clg of Engineering
16,Bioengineering,Clg of Engineering,Translational Medicine,Clg of Engineering
17,Bioengineering-UCSF Grad Grp,Clg of Engineering,Bioengineering (UCSF),Clg of Engineering
18,Bioengineering-UCSF Grad Grp,Clg of Engineering,Translational Medicine (UCSF),Clg of Engineering
70,Engineering Joint Programs,Clg of Engineering,BioE/MSE Joint Major,Clg of Engineering
71,Engineering Joint Programs,Clg of Engineering,EECS/MSE Joint Major,Clg of Engineering
72,Engineering Joint Programs,Clg of Engineering,EECS/NE Joint Major,Clg of Engineering
73,Engineering Joint Programs,Clg of Engineering,ME/NE Joint Major,Clg of Engineering
74,Engineering Joint Programs,Clg of Engineering,MSE/ME Joint Major,Clg of Engineering
75,Engineering Joint Programs,Clg of Engineering,MSE/NE Joint Major,Clg of Engineering


In [26]:
num_possible_keywords = Y_test.shape[0] * num_top_words
num_predicted_keywords = len(keyword_counter.keys())

In [27]:
assert sum(keyword_counter.values()) == Y_test.shape[0] * num_top_words,\
'Total number of predicted keywords should equal number of courses * number of predicted keywords per course.'

In [28]:
unif_keyword_vector = np.repeat(num_possible_keywords / num_predicted_keywords, num_predicted_keywords)
unif_keyword_vector

array([12.97012302, 12.97012302, 12.97012302, ..., 12.97012302,
       12.97012302, 12.97012302])

In [29]:
predicted_keyword_vector = np.array(list(keyword_counter.values()))
predicted_keyword_vector

array([41,  1,  1, ...,  2,  5, 48])

In [30]:
assert unif_keyword_vector.shape == predicted_keyword_vector.shape

In [31]:
from scipy.spatial.distance import cosine

def cosine_similarity(x, y):
    return 1 - cosine(x,y)

cosine_similarity(predicted_keyword_vector, unif_keyword_vector)

0.3620480643963293

In [None]:
cosine_similarity([1,-1], [1,1])

---

## Grid Search

In [28]:
import time
import os
import pandas as pd
import numpy as np
from collections import Counter
from itertools import chain
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import ParameterGrid
from keras.layers import Input, Dense
from keras.models import Model
from keras import backend as K
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
pd.options.mode.chained_assignment = None 

TRAINING_DIR = os.getcwd()
vectorfile = os.path.join(TRAINING_DIR, 'course_vecs.tsv')
infofile = os.path.join(TRAINING_DIR, 'course_info.tsv')
textcolumn = 'course_description'

In [29]:
def predict(course_vecs, course_descipts, trained_weights, trained_biases, num_words_per_course):
    """
    lalalal
    
    """
    df_with_keywords = course_descipts.copy()
    softmax_frame = course_vecs.iloc[:,1:].dot(trained_weights.values) + trained_biases # make predictions

    # From the softmax predictions, save the top 10 predicted words for each data point
    print('[INFO] Sorting classification results...')
    sorted_frame = np.argsort(softmax_frame,axis=1).iloc[:,-num_words_per_course:]

    print('[INFO] Predicting top k inferred keywords for each course...')
    for i in range(num_words_per_course):
        new_col = vocab_frame.iloc[sorted_frame.iloc[:,i],0] # get the ith top vocab word for each entry
        df_with_keywords['predicted_word_' + str(num_words_per_course-i)] = new_col.values
        
    return df_with_keywords

def calculate_metric(df_with_keywords, metric):
    """
    metrics: {r: recall, p: precision}
    """
    def clean_descrip_title(row):
        punc_remover = str.maketrans('', '', string.punctuation)
        lowered = row['descrip_title'].lower()
        lowered_removed_punc = lowered.translate(punc_remover)
        cleaned_set = set(lowered_removed_punc.split())
        return cleaned_set

    def recall_keywords(row):
        return row['description_title_set'].intersection(row['course_keywords_set'])
    
    prediction_df = df_with_keywords.copy()
    only_predicted_keywords_df = prediction_df[prediction_df.columns.difference(['course_name', 'course_title', 'course_description', 'tf_bias', 'course_alternative_names'])]
    num_keywords_predicted = only_predicted_keywords_df.shape[1]
    prediction_df['course_keywords'] = only_predicted_keywords_df.iloc[:,:].apply(lambda x: ', '.join(x), axis=1)
    prediction_df = prediction_df[['course_name', 'course_title', 'course_description', 'course_keywords', 'course_alternative_names']]
    prediction_df['course_keywords'] = prediction_df['course_keywords'].apply(lambda keywords: ', '.join(sorted(set([word.strip() for word in keywords.split(',')]))))
    prediction_df['course_keywords_set'] = prediction_df['course_keywords'].apply(lambda keywords: (set([word.strip() for word in keywords.split(',')])))
    prediction_df['descrip_title'] = prediction_df['course_title'] + ' ' + prediction_df['course_description']
    prediction_df['description_title_set'] = prediction_df.apply(clean_descrip_title, axis = 1)
    prediction_df['shared_words'] = prediction_df.apply(recall_keywords, axis = 1)
    
    if metric == 'r':
        print('[INFO] Calculating Recall...')
        assert num_keywords_predicted == max_descript_len, 'Number of keywords predicted should equal longest description length'
        prediction_df['recall'] = prediction_df['shared_words'].apply(lambda words: len(list(words)) / max_descript_len)
        average_recall = np.mean(prediction_df['recall'])
        return average_recall
    if metric == 'p':
        print('[INFO] Calculating Precision...')
        assert num_keywords_predicted == num_top_words, 'Number of keywords predicted should equal number of predicted words per course'
        prediction_df['precision'] = prediction_df['shared_words'].apply(lambda words: len(list(words)) / num_top_words)
        average_precision = np.mean(prediction_df['precision'])
        return average_precision
    if metric == 'c':
        print('[INFO] Calculating Cosine Similarity Between Keyword Distributions...')
        predicted_keyword_list = only_predicted_keywords_df.values.tolist()
        predicted_keyword_list = list(chain.from_iterable(predicted_keyword_list))
        keyword_counter = Counter(predicted_keyword_list)
        print('[DEBUG] most common keywords: ', keyword_counter.most_common(10))
        
        num_possible_keywords = df_with_keywords.shape[0] * num_top_words
        num_predicted_keywords = len(keyword_counter.keys())
        assert sum(keyword_counter.values()) == split_Y_valid.shape[0] * num_top_words,\
        'Total number of predicted keywords should equal number of courses * number of predicted keywords per course.'
        unif_keyword_vector = np.repeat(num_possible_keywords / num_predicted_keywords, num_predicted_keywords)
        predicted_keyword_vector = np.array(list(keyword_counter.values()))
        assert unif_keyword_vector.shape == predicted_keyword_vector.shape,\
        'Uniform keyword frequency vector should have same dimension as predicted keywords frequency vector.'
    
        cos_sim = cosine_similarity(predicted_keyword_vector, unif_keyword_vector)
        return cos_sim

def cosine_similarity(x, y):
    return 1 - cosine(x,y)


In [30]:
def get_vocab(dataframe, column, max_df=0.0028, use_idf=True):
    print("[INFO] Getting vocab...")

    dataframe[column] = dataframe[column].fillna('')
    
    # max_df_param = 0.0028  # 1.0 # 0.0036544883

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(1,1), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    unigrams = vectorizer.get_feature_names()
    print('[INFO] Number of unigrams: %d' % (len(unigrams)))
    
    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(2,2), max_features=max(1, int(len(unigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    bigrams = vectorizer.get_feature_names()
    print('[INFO] Number of bigrams: %d' % (len(bigrams)))

    vectorizer = TfidfVectorizer(max_df = max_df, stop_words='english', ngram_range=(3,3), max_features=max(1, int(len(bigrams)/10)), use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column])
    trigrams = vectorizer.get_feature_names()
    print('[INFO] Number of trigrams: %d' % (len(trigrams)))

    vocab = np.concatenate((unigrams, bigrams, trigrams))
    vocab_list = list(vocab)
    removed_numbers_list = [word for word in vocab_list if not any(char.isdigit() for char in word)]
    vocab = np.array(removed_numbers_list)
#     pd.DataFrame(vocab).to_csv(outputfile+'_vocab.tsv', sep = '\t', encoding='utf-8', index = False)
    return vocab

In [31]:
def to_bag_of_words(dataframe, column, vocab, tf_bias=.5, use_idf=True):
    """Input: raw dataframe, text column, and vocabulary.
    Returns a sparse matrix of the bag of words representation of the column."""
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab, use_idf=use_idf)
    X = vectorizer.fit_transform(dataframe[column].values.astype('U'))
    if tf_bias == -999:
        return X
    return (X.multiply(1/X.count_nonzero())).power(-tf_bias)

In [32]:
def logistic_regression(X, Y, num_epochs=1):
    print('[INFO] Performing logistic regression...')

    inputs = Input(shape=(X.shape[1],))
#     print('input shape: ', X.shape[1])  # 300 = number of cols in the feature matrix?
#     print('vocab size: ', vocabsize) # 2400 = len(get_vocab(raw_frame, textcolumn)) = num words parsed from description corpus
#     x = Dense(30, activation='sigmoid')(inputs)
#     predictions = Dense(vocabsize, activation='softmax')(x)
    predictions = Dense(vocabsize, activation='softmax')(inputs)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    model.fit(X, Y, epochs=num_epochs)
    weights = model.layers[1].get_weights()[0]
    biases = model.layers[1].get_weights()[1]
    weights_frame = pd.DataFrame(weights)
    biases_frame = pd.DataFrame(biases)
    return(weights_frame, biases)

In [33]:
vec_frame = pd.read_csv(vectorfile, sep = '\t') # Vector space representation of each user, all numeric
info_frame = pd.read_csv(infofile, sep = '\t') # Course information

nonempty_indices = np.where(info_frame[textcolumn].notnull())[0]
filtered_vec_df = vec_frame.iloc[nonempty_indices,:].reset_index(drop = True)
filtered_descript_df = info_frame.iloc[nonempty_indices,:].reset_index(drop = True)
max_descript_len = max(filtered_descript_df.course_description.str.split().str.len())
num_top_words = 10

hyperparams_cols = ['use_idf', 'max_df','tf-bias', 'num_epochs', 'recall', 'precision', 'distribution_diff']
grid_search_df = pd.DataFrame(columns=hyperparams_cols)

In [34]:
np.arange(0.002, .005, .001)

param_grid = {'use_idf': [True],
              'max_df': np.arange(0.002, .005, .001), # np.arange(0, .0055, .0005)
              'tf_bias': np.arange(.5, 1.5, .5), 
              'num_epochs': [5]} 

grid = ParameterGrid(param_grid)

for params in grid:
    print("[HYPERPARAMS] use_idf: %r, max_df: %f, tf_bias: %f, num_epochs: %d" % 
          (params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs']))

[HYPERPARAMS] use_idf: True, max_df: 0.002000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.002000, tf_bias: 1.000000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.003000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.003000, tf_bias: 1.000000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.004000, tf_bias: 0.500000, num_epochs: 5
[HYPERPARAMS] use_idf: True, max_df: 0.004000, tf_bias: 1.000000, num_epochs: 5


In [35]:
param_grid = {'use_idf': [True],
              'max_df': np.arange(0.002, .005, .001), # np.arange(0, .0055, .0005)
              'tf_bias': np.arange(.5, 1.5, .5), 
              'num_epochs': [5]} 

grid = ParameterGrid(param_grid)

recall_validation_scores = []
precision_validation_scores = []
distribution_validation_scores = []

for params in grid:
    print("***[INFO] Evaluating cross-validated model with hyperparams use_idf: %r, max_df: %f, tf_bias: %f, num_epochs: %d***" % 
          (params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs']))

    fold_num = 1
    kf = KFold(n_splits=5, random_state=42) # DO NOT FIX RANDOM STATE WHEN RUNNING THE ACTUAL EXPERIMENT - NVM, should be fixed for reproducibility
    for train_idx, valid_idx in kf.split(filtered_vec_df):
        print('======== [INFO] Fold %d' % (fold_num))
        # X = vectors, Y = descriptions
        split_X_train, split_X_valid = filtered_vec_df.iloc[train_idx], filtered_vec_df.iloc[valid_idx]
        split_Y_train, split_Y_valid = filtered_descript_df.iloc[train_idx], filtered_descript_df.iloc[valid_idx]

        vocab = get_vocab(split_Y_train, textcolumn, max_df=params['max_df'], use_idf=params['use_idf']) 
        vocab_frame = pd.DataFrame(vocab)
        vocabsize = len(vocab)

        # Convert the textcolumn of the raw dataframe into bag of words representation
        split_Y_train_BOW = to_bag_of_words(split_Y_train, textcolumn, vocab, tf_bias=params['tf_bias'], use_idf=params['use_idf'])
        split_Y_train_BOW = split_Y_train_BOW.toarray()

        (weights_frame, biases) = logistic_regression(split_X_train.iloc[:,1:], split_Y_train_BOW, num_epochs=params['num_epochs'])

        print('[INFO] Predicting on validation set for recall...')
        df_with_keywords = predict(split_X_valid, split_Y_valid, weights_frame, biases, max_descript_len)
        fold_i_average_recall = calculate_metric(df_with_keywords, 'r')
        recall_validation_scores.append(fold_i_average_recall)
        print('[INFO] Fold %d recall: %f.' % (fold_num, fold_i_average_recall))

        print('[INFO] Predicting on validation set for precision...')
        df_with_keywords = predict(split_X_valid, split_Y_valid, weights_frame, biases, num_top_words)
        fold_i_average_precision = calculate_metric(df_with_keywords, 'p')
        precision_validation_scores.append(fold_i_average_precision)
        print('[INFO] Fold %d precision: %f.' % (fold_num, fold_i_average_precision))

        fold_i_distribution_diff = calculate_metric(df_with_keywords, 'c')
        distribution_validation_scores.append(fold_i_distribution_diff)
        print('[INFO] Fold %d cosine similarity: %f.' % (fold_num, fold_i_distribution_diff))

        fold_num += 1

    recall_i = np.mean(recall_validation_scores)
    precision_i = np.mean(precision_validation_scores)
    distribution_diff_i = np.mean(distribution_validation_scores)

    model_i_params = [params['use_idf'], params['max_df'], params['tf_bias'], params['num_epochs'], 
                      recall_i, precision_i, distribution_diff_i]
    model_i_params = pd.DataFrame([model_i_params], columns=hyperparams_cols)
    grid_search_df = grid_search_df.append(model_i_params, sort = False)
    print(grid_search_df)
    # print('recall scores:', recall_validation_scores)
    # print('precision scores:', precision_validation_scores)
    # print('distribution scores:', distribution_validation_scores)

***[INFO] Evaluating cross-validated model with hyperparams use_idf: True, max_df: 0.002000, tf_bias: 0.500000, num_epochs: 5***
	 [INFO] Fold 1
[INFO] Getting vocab...
[INFO] Number of unigrams: 11379
[INFO] Number of bigrams: 1137
[INFO] Number of trigrams: 113
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 1 recall: 0.004567.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 1 precision: 0.020054.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[DEBUG] most common keywords:  [('optometric', 222), ('investor', 207), ('masters', 206), ('designated', 199), ('arbitrage', 197), ('board',

[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 1 precision: 0.031030.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[DEBUG] most common keywords:  [('violence', 291), ('professionally', 214), ('negotiation', 198), ('managed', 197), ('candidates', 197), ('debt', 197), ('nonprofit', 197), ('managerial', 197), ('leaders', 174), ('worlds', 158)]
[INFO] Fold 1 cosine similarity: 0.450000.
	 [INFO] Fold 2
[INFO] Getting vocab...
[INFO] Number of unigrams: 11781
[INFO] Number of bigrams: 1178
[INFO] Number of trigrams: 117
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 2 recall: 0.006574.
[INFO] Predicting on validation set for precision..

[INFO] Number of unigrams: 11781
[INFO] Number of bigrams: 1178
[INFO] Number of trigrams: 117
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 2 recall: 0.006416.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 2 precision: 0.028136.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[DEBUG] most common keywords:  [('debt', 258), ('company', 258), ('professionally', 258), ('fixed', 257), ('patient', 257), ('ventures', 257), ('derivative', 257), ('ocular', 257), ('craft', 223), ('islam', 207)]
[INFO] Fold 2 cosine similarity: 0.437606.
	 [INFO] Fold 3
[INFO] Getting vocab...
[INFO] Number

Epoch 4/5
Epoch 5/5
[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 2 recall: 0.008746.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 2 precision: 0.038847.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[DEBUG] most common keywords:  [('debt', 258), ('professionally', 258), ('gsis', 257), ('asset', 257), ('managers', 257), ('ocular', 257), ('patients', 257), ('patient', 257), ('entrepreneurial', 257), ('company', 245)]
[INFO] Fold 2 cosine similarity: 0.425166.
	 [INFO] Fold 3
[INFO] Getting vocab...
[INFO] Number of unigrams: 12015
[INFO] Number of bigrams: 1201
[INFO] Number of trigrams: 120
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 

[INFO] Predicting on validation set for recall...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Recall...
[INFO] Fold 2 recall: 0.008394.
[INFO] Predicting on validation set for precision...
[INFO] Sorting classification results...
[INFO] Predicting top k inferred keywords for each course...
[INFO] Calculating Precision...
[INFO] Fold 2 precision: 0.035322.
[INFO] Calculating Cosine Similarity Between Keyword Distributions...
[DEBUG] most common keywords:  [('gsis', 257), ('debt', 257), ('professionally', 257), ('entrepreneurial', 257), ('ocular', 256), ('managed', 238), ('company', 232), ('patients', 201), ('identities', 181), ('seen', 180)]
[INFO] Fold 2 cosine similarity: 0.434680.
	 [INFO] Fold 3
[INFO] Getting vocab...
[INFO] Number of unigrams: 12015
[INFO] Number of bigrams: 1201
[INFO] Number of trigrams: 120
[INFO] Performing logistic regression...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[INFO

### Remarks:

1. Rename columns to recall@max_len, precision@10 
1. avoid human inspection as metric for now
1. See what aspects of unsupervised methods notebook you can incorporate

### To implement:

1. Distribution difference
    - Calculate cosine distance instead of cosine similarity
    - Calculate the distance from the vector of original frequencies instead of the uniform distribution
    - Calculate the distance from the all 1s vector

1. Calculate document frequency and redundancy metric

1. Master metric = some combination of metrics + regularization (esp for max_df) 



In [12]:
grid_search_df

Unnamed: 0,use_idf,max_df,tf-bias,num_epochs,recall,precision,distribution_diff
0,True,0.002,0.5,5,0.004051,0.018519,0.373661
0,True,0.002,1.0,5,0.004004,0.017651,0.366876
0,True,0.003,0.5,5,0.004589,0.020309,0.368689
0,True,0.003,1.0,5,0.004849,0.021268,0.368771
0,True,0.004,0.5,5,0.005379,0.023874,0.368446
0,True,0.004,1.0,5,0.005716,0.02518,0.368866


In [10]:
grid_search_df.to_csv('grid_search_1.csv')

## optimization

- do not append to dataframes, start w/ lists and convert to dataframe