In [11]:
from keywords_research_helper import *
import time
import pandas as pd
import numpy as np
import nltk
import re
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.layers import Input, Dense
from keras.models import Model
from keras import backend as K
import tensorflow as tf
# Create a TensorFlow 1.x session with the GPU option `allow_growth` enabled
# (per the TF documentation this allocates GPU memory on demand instead of
# reserving it all up front) and register it as the session Keras will use.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)

In [12]:
# --- Notebook configuration ---------------------------------------------------
# Directory holding the input TSV files. The default is the original author's
# location; set the ASKOSKI_TRAINING_DIR environment variable to run the
# notebook on another machine without editing this cell.
TRAINING_DIR = os.environ.get('ASKOSKI_TRAINING_DIR') or '/home/matthew/Models-AskOski/ICS/'
vectorfile = os.path.join(TRAINING_DIR, 'course_vecs_temp.tsv')  # numeric vector-space representation (model inputs)
rawfile = os.path.join(TRAINING_DIR, 'course_info_temp.tsv')  # course information, including the text column
textcolumn = 'course_description'  # column of rawfile the vocabulary and bag-of-words targets are built from
use_idf = True  # forwarded to every TfidfVectorizer as use_idf
tf_bias = .5  # exponent used by to_bag_of_words to rescale values; -999 skips the rescaling
num_epochs = 5  # epochs passed to model.fit in logistic_regression
num_top_words = 10  # number of keywords predicted per test row
max_df = 0.0028  # max_df passed to the TfidfVectorizers in get_vocab

---

In [None]:
def _ngram_terms(documents, n, max_features=None):
    """Return the list of n-gram terms (exactly `n` tokens each) that a
    TfidfVectorizer configured with the notebook's `max_df` / `use_idf` keeps
    for `documents`, optionally capped at `max_features` terms."""
    vectorizer = TfidfVectorizer(max_df=max_df, stop_words='english', ngram_range=(n, n),
                                 max_features=max_features, use_idf=use_idf)
    vectorizer.fit(documents)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


def get_vocab(dataframe, column):
    """Build the keyword vocabulary from a text column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the text. It is not modified.
    column : str
        Name of the text column; missing values are treated as empty strings.

    Returns
    -------
    numpy.ndarray
        Unigrams, followed by bigrams and trigrams (each of the latter two
        capped at a tenth of the unigram count, minimum 1), with every term
        that contains a digit removed.

    Reads the notebook-level settings `max_df` and `use_idf`.
    """
    print("[INFO] Getting vocab...")

    # fillna returns a new Series, so the caller's dataframe is left untouched.
    documents = dataframe[column].fillna('')

    unigrams = _ngram_terms(documents, 1)
    print('[UNIGRAMS] number unigrams: %d' % (len(unigrams)))

    # Bigrams and trigrams are limited relative to the number of unigrams.
    ngram_cap = max(1, int(len(unigrams)/10))

    bigrams = _ngram_terms(documents, 2, max_features=ngram_cap)
    print('[BIGRAMS] number bigrams: %d' % (len(bigrams)))

    trigrams = _ngram_terms(documents, 3, max_features=ngram_cap)
    print('[TRIGRAMS] number trigrams: %d' % (len(trigrams)))

    # Drop every term that contains a digit.
    candidates = unigrams + bigrams + trigrams
    return np.array([term for term in candidates if not any(char.isdigit() for char in term)])

In [None]:
def to_bag_of_words(dataframe, column, vocab):
    """Input: raw dataframe, text column, and vocabulary.
    Returns a sparse matrix of the bag of words representation of the column.

    The vectorizer's n-gram range is sized to the longest term in `vocab`, so
    multi-word terms (bigrams, trigrams) are actually counted. Unless
    `tf_bias` is -999, the stored values are divided by the matrix's number of
    non-zero entries and raised to the power `-tf_bias`.

    Reads the notebook-level settings `use_idf` and `tf_bias`.
    """
    # With the default ngram_range=(1, 1) the analyzer only emits unigrams, so
    # any multi-word vocabulary term would get an all-zero column.
    if vocab is None:
        longest_term = 1
    else:
        longest_term = max(1, max((len(str(term).split()) for term in vocab), default=1))
    vectorizer = TfidfVectorizer(stop_words='english', vocabulary=vocab, use_idf=use_idf,
                                 ngram_range=(1, longest_term))
    X = vectorizer.fit_transform(dataframe[column].values.astype('U'))
    if tf_bias == -999:
        return X
    nonzero_count = X.count_nonzero()
    if nonzero_count == 0:
        # No stored values to rescale, and 1/0 would raise ZeroDivisionError.
        return X
    return (X.multiply(1/nonzero_count)).power(-tf_bias)

In [None]:
def logistic_regression(X, Y):
    """Fit a single softmax Dense layer mapping the rows of X to the rows of Y.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix, one row per sample; `X.shape[1]` sets the input size.
    Y : 2-D array-like
        Target matrix with one row per sample; `Y.shape[1]` sets the number of
        output units.

    Returns
    -------
    (weights_frame, biases)
        The trained Dense layer's weight matrix as a DataFrame and its bias
        vector as returned by Keras.

    Trains for the notebook-level `num_epochs`.
    """
    print('[INFO] Performing logistic regression...')

    # Size the output layer from the targets themselves instead of relying on
    # a notebook global being in sync with Y.
    num_outputs = Y.shape[1]

    inputs = Input(shape=(X.shape[1],))
    predictions = Dense(num_outputs, activation='softmax')(inputs)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    model.fit(X, Y, epochs=num_epochs)
    # layers[0] is the Input layer; layers[1] is the Dense layer.
    weights, biases = model.layers[1].get_weights()
    weights_frame = pd.DataFrame(weights)
    return(weights_frame, biases)

In [None]:
def clean_descrip_title(row):
    """Return the set of distinct whitespace-separated words in
    row['descrip_title'] after lower-casing and deleting ASCII punctuation."""
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalized = row['descrip_title'].lower().translate(strip_punctuation)
    return {word for word in normalized.split()}

def recall_keywords(row):
    """Return the elements common to row['description_title_set'] and
    row['keywords_set']."""
    title_words = row['description_title_set']
    keywords = row['keywords_set']
    return title_words.intersection(keywords)

---

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Load the two tab-separated inputs: the numeric vector-space representation
# and the course information.
vec_frame = pd.read_csv(vectorfile, sep='\t')
raw_frame = pd.read_csv(rawfile, sep='\t')

# Positions of the rows whose text column is present; the same positions are
# taken from both frames.
nonempty_indices = np.flatnonzero(raw_frame[textcolumn].notnull())
filtered_vec_frame = vec_frame.iloc[nonempty_indices]
filtered_raw_frame = raw_frame.iloc[nonempty_indices]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable. The vector frame and the raw frame are split together.
X_train, X_test, Y_train, Y_test = train_test_split(
    filtered_vec_frame,
    filtered_raw_frame,
    test_size=0.2,
    random_state=42,
)

print(len(X_train), len(X_test))

In [None]:
# Build the vocabulary from the training rows only.
vocab = get_vocab(Y_train, textcolumn)
vocabsize = len(vocab)
vocab_frame = pd.DataFrame(vocab)

# Bag-of-words representation of the training text column, as a dense array.
Y_train_BOW = to_bag_of_words(Y_train, textcolumn, vocab).toarray()
Y_train_BOW

In [None]:
# Train on every column of X_train except the first (presumably an identifier
# column -- TODO confirm) against the bag-of-words targets.
(weights_frame, biases) = logistic_regression(X_train.iloc[:,1:], Y_train_BOW)

In [None]:
# Score every vocabulary term for each test row with the learned weights and
# biases. No softmax is applied here; ranking within a row is unaffected
# because softmax is monotonic.
softmax_frame = X_test.iloc[:,1:].dot(weights_frame.values) + biases
print('[INFO] Sorting classification results...')
# Sort on the underlying ndarray so the result does not depend on NumPy
# wrapping its output back into a DataFrame. Each row keeps the positions of
# its highest-scoring terms in ascending score order.
top_positions = np.argsort(softmax_frame.values, axis=1)[:, -num_top_words:]
sorted_frame = pd.DataFrame(top_positions,
                            index=softmax_frame.index,
                            columns=softmax_frame.columns[-num_top_words:])

# Add the prediction columns to an independent copy so the assignment never
# writes into a slice of the frame the split was taken from.
Y_test = Y_test.copy()

# Normally num_top_words; smaller only if the vocabulary has fewer terms.
num_predicted = top_positions.shape[1]
predicted_keyword_list = []
for i in range(num_predicted):
    # Column i holds each row's (num_predicted - i)-th best term.
    new_col = vocab_frame.iloc[top_positions[:, i], 0]
    predicted_keyword_list.extend(new_col.values)
    Y_test['predicted_word_' + str(num_predicted-i)] = new_col.values


In [None]:
# Preview the first three test rows, including the predicted_word_* columns.
Y_test.head(3)

In [None]:
from collections import Counter

# Tally how many times each keyword was predicted across all test rows.
keyword_counter = Counter()
keyword_counter.update(predicted_keyword_list)

# The ten most frequently predicted keywords with their counts.
keyword_counter.most_common(10)

In [None]:
# Total number of keyword predictions made: test rows times keywords per row.
num_possible_keywords = len(Y_test) * num_top_words
# Number of distinct keywords among those predictions.
num_predicted_keywords = len(keyword_counter)

In [None]:
# Sanity check: every test row contributed exactly num_top_words predictions.
assert sum(keyword_counter.values()) == len(Y_test) * num_top_words, (
    'Total number of predicted keywords should equal number of courses * number of predicted keywords per course.'
)

In [None]:
# Reference vector: the count each distinct keyword would have if all
# predictions were spread evenly over the predicted keywords.
unif_keyword_vector = np.full(num_predicted_keywords, num_possible_keywords / num_predicted_keywords)
unif_keyword_vector

In [None]:
# Observed count of each distinct predicted keyword, in the counter's order.
predicted_keyword_vector = np.array([count for count in keyword_counter.values()])
predicted_keyword_vector

In [None]:
# The uniform reference and the observed counts must have the same shape
# before they are compared.
assert unif_keyword_vector.shape == predicted_keyword_vector.shape

In [None]:
from scipy.spatial.distance import cosine

def cosine_similarity(x, y):
    """Return the cosine similarity of x and y, computed as one minus
    SciPy's cosine distance."""
    distance = cosine(x, y)
    return 1 - distance

# Cosine similarity between the observed keyword counts and the uniform
# reference vector.
cosine_similarity(predicted_keyword_vector, unif_keyword_vector)

In [None]:
# Sanity check of the helper: these two vectors have a zero dot product, so
# the similarity should come out as 0.
cosine_similarity([1,-1], [1,1])