In [1]:
# preprocessing imports
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# functions we implemented
from custom_functions import init_embeddings_map, get_embed_and_pad_func

In [3]:
# Width of each word vector; it also selects which GloVe file is loaded.
emb_size = 50
# The file name encodes the vector width, e.g. "glove.6B.50d.txt".
# Presumably the result is a word -> vector lookup; see custom_functions.
embedding_map = init_embeddings_map("glove.6B.{}d.txt".format(emb_size))

In [4]:
# Load the cleaned, grouped, not-yet-embedded review table from disk.
raw_data = pd.read_csv(filepath_or_buffer="data/unembedded_grouped_cleaned_data.csv")

In [5]:
# Hold out 10% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore every downstream result, reproducible across
# kernel restarts; without it each run trains and evaluates on different rows.
train, test = train_test_split(raw_data, test_size=0.1, shuffle=True, random_state=42)

In [6]:
# Number of whitespace-delimited tokens in every user / movie review text,
# measured over the full data set (train and test rows together).
user_seq_sizes = raw_data.loc[:, "userReviews"].map(lambda review: len(review.split()))
item_seq_sizes = raw_data.loc[:, "movieReviews"].map(lambda review: len(review.split()))

In [7]:
# Percentiles of the token-count distributions used as fixed sequence lengths.
u_ptile, i_ptile = 50, 20

# np.percentile returns a float; truncate it to a whole number of tokens.
u_seq_len = int(np.percentile(user_seq_sizes, q=u_ptile))
i_seq_len = int(np.percentile(item_seq_sizes, q=i_ptile))

In [8]:
# Build the per-row transform. The third argument is an all-zero vector with
# the same width as a word embedding (presumably the padding value).
# NOTE(review): the item length is passed before the user length -- confirm
# this matches the parameter order of get_embed_and_pad_func.
embedding_fn = get_embed_and_pad_func(
    i_seq_len,
    u_seq_len,
    np.zeros(emb_size),
    embedding_map,
)

# Apply the transform row by row to both splits.
train_embedded = train.apply(embedding_fn, axis="columns")
test_embedded = test.apply(embedding_fn, axis="columns")

# DeepCoNN Recommendation Model

In [51]:
# modeling imports
import tensorflow as tf
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.layers import Input, Dense
from keras.layers.merge import Add, Dot, Concatenate

In [68]:
class DeepCoNN():
    """Two-tower convolutional rating model built with the Keras functional API.

    One tower consumes the embedded user review sequence and the other the
    embedded movie review sequence. Each tower is
    Conv1D -> MaxPooling1D -> Flatten -> Dense. The two tower outputs are
    concatenated and fed to a single linear neuron; ``create_deepconn_dp``
    adds the dot product of the tower outputs to that neuron to form the
    final prediction.
    """

    def __init__(self,
                 embedding_size,
                 hidden_size,
                 u_seq_len,
                 m_seq_len,
                 filters=2,
                 kernel_size=10,
                 strides=6):
        """Build both towers and the linear head; the model is compiled later.

        Args:
            embedding_size: Length of each word vector in the input sequences.
            hidden_size: Number of units in each tower's final Dense layer.
            u_seq_len: Number of time steps in the user review input.
            m_seq_len: Number of time steps in the movie review input.
            filters: Number of Conv1D filters per tower.
            kernel_size: Conv1D window length.
            strides: Accepted for backward compatibility but NOT forwarded
                to Conv1D, so the convolution uses the Keras default stride.
                NOTE(review): decide whether to wire this through or drop
                it; honoring it would change every layer's output shape.
        """
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.filters = filters
        self.kernel_size = kernel_size
        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)
        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)
        # Linear head over the concatenated tower representations.
        self.joined = Concatenate()([self.towerU, self.towerM])
        self.outNeuron = Dense(1)(self.joined)

    def create_deepconn_tower(self, max_seq_len):
        """Create one convolutional tower.

        Args:
            max_seq_len: Number of time steps the tower's input accepts.

        Returns:
            Tuple ``(input_layer, tower)``: the Keras Input tensor and the
            output tensor of the tower's final Dense layer.
        """
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        tower = Conv1D(filters=self.filters,
                       kernel_size=self.kernel_size,
                       activation="tanh")(input_layer)
        tower = MaxPooling1D()(tower)
        tower = Flatten()(tower)
        tower = Dense(self.hidden_size, activation="relu")(tower)
        return input_layer, tower

    def create_deepconn_dp(self):
        """Assemble and compile ``self.model``.

        The prediction is the linear head's output plus the dot product of
        the two tower outputs. The model is compiled with the Adam optimizer
        and mean squared error loss.
        """
        dotproduct = Dot(axes=1)([self.towerU, self.towerM])
        output = Add()([self.outNeuron, dotproduct])
        self.model = Model(inputs=[self.inputU, self.inputM], outputs=[output])
        self.model.compile(optimizer='Adam', loss='mse')

    def train(self, train_data, batch_size, epochs=3500):
        """Compile the model and fit it on ``train_data``.

        Args:
            train_data: Table indexable with ``.loc`` that has the columns
                "userReviews", "movieReviews" and "overall". Each review
                cell is presumably an already embedded and padded sequence
                matching the tower input shapes -- TODO confirm.
            batch_size: Mini-batch size passed to ``Model.fit``.
            epochs: Number of epochs passed to ``Model.fit``.

        Side effects:
            Sets ``self.model``, ``self.train_inputs``,
            ``self.train_outputs`` and ``self.history``, and attaches a
            TensorBoard callback logging under "tf_logs/<timestamp>".
        """
        # One log directory per run, named after the current time (seconds).
        tensorboard = TensorBoard(log_dir="tf_logs/{}".format(pd.Timestamp(int(time()), unit="s")))
        self.create_deepconn_dp()
        # summary() prints the table itself and returns None, so wrapping it
        # in print() would emit a stray "None" after the table.
        self.model.summary()

        # Stack the per-row sequences into one array per model input.
        user_reviews = np.array(list(train_data.loc[:, "userReviews"]))
        movie_reviews = np.array(list(train_data.loc[:, "movieReviews"]))

        self.train_inputs = [user_reviews, movie_reviews]
        self.train_outputs = train_data.loc[:, "overall"]

        # 5% of the training rows are held out for validation during fitting.
        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      callbacks=[tensorboard],
                                      validation_split=0.05,
                                      batch_size=batch_size,
                                      epochs=epochs)
        
        

In [None]:
# Units in the final Dense layer of each tower.
hidden_size = 64
# Mini-batch size used while fitting.
batch_size = 32

# The movie tower is sized with the item sequence length.
deepconn = DeepCoNN(
    embedding_size=emb_size,
    hidden_size=hidden_size,
    u_seq_len=u_seq_len,
    m_seq_len=i_seq_len,
)
deepconn.train(train_data=train_embedded, batch_size=batch_size, epochs=50)

# Saving the fitted model is currently disabled:
# deepconn.model.save("cnn.h5")

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 312, 50)      0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           (None, 985, 50)      0                                            
__________________________________________________________________________________________________
conv1d_21 (Conv1D)              (None, 303, 2)       1002        input_21[0][0]                   
__________________________________________________________________________________________________
conv1d_22 (Conv1D)              (None, 976, 2)       1002        input_22[0][0]                   
__________________________________________________________________________________________________
max_poolin

In [40]:

# Stack the per-row embedded sequences of the held-out split into one array
# per model input (user tower first, movie tower second).
user_reviews = np.array(test_embedded.loc[:, "userReviews"].tolist())
movie_reviews = np.array(test_embedded.loc[:, "movieReviews"].tolist())
test_inputs = [user_reviews, movie_reviews]

# Ground-truth ratings as a column vector so they line up with predictions.
true_rating = np.array(test_embedded.loc[:, "overall"].tolist()).reshape(-1, 1)

predictions = deepconn.model.predict(test_inputs)

# Absolute error of every test prediction.
error = np.abs(predictions - true_rating)

In [41]:
# Positions (in the flattened error array) of the worst and best predictions.
# Despite their names, these hold indices, not error values.
largest_error = error.argmax()
smallest_error = error.argmin()

In [42]:
# Report the worst prediction, then display the user review text behind it.
print(
    "user gave rating:",
    test.iloc[largest_error]["overall"],
    "while we predicted",
    predictions[largest_error],
)
test.iloc[largest_error]["userReviews"]

user gave rating: 1.0 while we predicted [ 4.75031042]


"this is a really fun show i like it because it has nothing to do with day to day life i just think it's really a good show that keeps getting better it is great that i can watch past seasons on amazon as i often see things i missed before it is one of my favorite shows on tv i can't wait for it to start again on mtv june 23 it is great that i can watch past seasons on amazon as their are a lot of things see that i missed last time i watched one thing i recently saw was the first time scott's eyes glow red is in season 1 episode 7 and it only happens for a second when his alpha who he doesn't know is yet calls him out in the school gym to kill his friends what's the latin translation for what the demon said roughly alli i octe es octe is eight and there are eight of them so it's saying something about the eight but what more please lots of potential with many unanswered questions for future episodes the cast was incredible no b movie actors here"

In [43]:
# Display the movie review text for the same worst-predicted test row.
test.iloc[largest_error]["movieReviews"]

"the first season was fun but i've been losing interest in this highly scripted 34 reality 34 series this may still be highly interesting to 13 year old kids but as a guy a bit older than si the writing just seems aimed at more of a three stooges audience i watched this show right up until phil's interview with gq how can anyone anyone claim that african americans were 34 happy happy happy 34 under the jim crow laws how can anyone anyone be so hateful about people who identify as lgbt i will never watch again feel free to hate on this review just because some people agree with him doesn't make it right very funny and interesting my husband really enjoys watching the videos and keeps watching as long as there are more of them always makes me laugh not many comedies on tv can claim that fame for me love the lesson sum up at the end jase is the sexy star can't get enough of him however the others are all very good and funny i like the prayer before their meals my whole family young and ol