In [13]:
# preprocessing imports
import os.path
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence
import re

<img src="picture/image-20221030160556248.png" alt="image-20221030160556248" style="zoom:50%;" />

<img src="picture/image-20221030160442980.png" alt="image-20221030160442980" style="zoom:50%;" />

In [22]:
def init_embeddings_map(fname):
    """Load word vectors from ``data/glove.6B/<fname>`` into a dictionary.

    Each non-empty line of the file is split on whitespace: the first field
    is used as the key and the remaining fields are converted to a
    ``float32`` NumPy array.

    Args:
        fname: File name inside the ``data/glove.6B`` directory (the path is
            resolved relative to the current working directory).

    Returns:
        dict mapping each token (str) to its vector (``np.ndarray`` of
        dtype ``float32``). If a token occurs more than once, the last
        occurrence wins.
    """
    embeddings = {}
    with open(os.path.join("data", "glove.6B", fname), encoding='UTF-8') as glove:
        # Stream the file line by line instead of materialising every split
        # line in an intermediate list first.
        for line in glove:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line (e.g. a trailing newline):
                # there is no token to index, so skip it rather than fail.
                continue
            embeddings[fields[0]] = np.asarray(fields[1:], dtype="float32")
    return embeddings

In [23]:
# Dimensionality of the word vectors; it also selects which GloVe file is read.
emb_size = 50
glove_fname = f"glove.6B.{emb_size}d.txt"
embedding_map = init_embeddings_map(glove_fname)

In [97]:
# Show the first five (token, vector) pairs as a quick check of the loaded map.
first_entries = list(embedding_map.items())[:5]
print(first_entries)

[('the', array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)), (',', array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
       -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
       -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
       -0.41634 , -0.1

In [25]:
# Review data with one row per (reviewer, item) rating.
raw_data_path = "../data/unembedded_grouped_cleaned_data.csv"
raw_data = pd.read_csv(raw_data_path)

In [26]:
# Plain-text preview of the first five rows.
raw_preview = raw_data.head(5)
print(raw_preview)

   Unnamed: 0      reviewerID        asin  overall  \
0           0  A11N155CW1UV02  B000H00VBQ      2.0   
1           1  A3BC8O2KCL29V2  B000H00VBQ      5.0   
2           2   A60D5HQFOTSOM  B000H00VBQ      1.0   
3           3  A1RJPIGRSNX4PW  B000H00VBQ      4.0   
4           4  A16XRPF40679KG  B000H00VBQ      5.0   

                                         userReviews  \
0  ['i', 'really', 'like', 'the', 'characters', '...   
1  ['this', 'is', 'one', 'good', 'show', 'it', 'i...   
2  ['i', 'watched', 'this', 'a', 'couple', 'of', ...   
3  ['the', 'acting', 'was', 'excellent', 'the', '...   
4  ['as', 'many', 'people', 'said', 'this', 'show...   

                                        movieReviews  
0  ['this', 'show', 'always', 'is', 'excellent', ...  
1  ['i', 'had', 'big', 'expectations', 'because',...  
2  ['i', 'had', 'big', 'expectations', 'because',...  
3  ['i', 'had', 'big', 'expectations', 'because',...  
4  ['i', 'had', 'big', 'expectations', 'because',...  


In [27]:
# Train/test split for our model is unique: we hold out a set of users AND
# the movies they reviewed, so the network never sees either during training.
test_size = 0.005  # fraction (not percent) of unique users held out for testing
split_seed = 42    # fixed seed so the split is the same on every run

# Sample test_size of the unique users, without replacement.
unique_users = raw_data.loc[:, "reviewerID"].unique()
users_size = len(unique_users)
split_rng = np.random.default_rng(split_seed)
test_idx = split_rng.choice(users_size,
                            size=int(users_size * test_size),
                            replace=False)

# Held-out users.
test_users = unique_users[test_idx]

# Everyone else is a training user.
train_users = np.delete(unique_users, test_idx)

test = raw_data[raw_data["reviewerID"].isin(test_users)]
train = raw_data[raw_data["reviewerID"].isin(train_users)]

unique_test_movies = test["asin"].unique()

# Drop the movies that also appear in our test set. In order to be a true
# train/test split, we are forced to discard some data entirely.
# A boolean mask is used instead of `where(...)` so that column dtypes are not
# upcast by intermediate NaNs; `dropna()` is kept so rows with missing values
# are still excluded, exactly as before.
train = train[~train["asin"].isin(unique_test_movies)].dropna()

In [83]:
def filter_jud(n):
    """Return True when `n` is a real token, False for split debris.

    The fragments treated as debris are the empty string, a single space,
    and the two square brackets.
    """
    return n not in ('', ' ', r']', r'[')

In [84]:
def _token_count(cell):
    """Number of tokens left after splitting the cell's text and filtering debris."""
    return len(list(filter(filter_jud, re.split(r',|\'', str(cell)))))


# Token counts per row, used to choose the padded sequence lengths.
user_seq_sizes = raw_data.loc[:, "userReviews"].apply(_token_count)
item_seq_sizes = raw_data.loc[:, "movieReviews"].apply(_token_count)

In [85]:
# Percentiles of the token-count distributions used as fixed sequence lengths.
u_ptile, i_ptile = 40, 15
u_seq_len, i_seq_len = (
    int(np.percentile(sizes, ptile))
    for sizes, ptile in ((user_seq_sizes, u_ptile), (item_seq_sizes, i_ptile))
)

In [86]:
# Show the user-side sequence length chosen from the percentile above.
print(u_seq_len)

252


In [95]:
def get_embed_and_pad_func(i_seq_len, u_seq_len, pad_value, embedding_map):
    """Build a row-wise function that embeds and pads both review columns.

    Args:
        i_seq_len: Fixed number of vectors produced for ``movieReviews``.
        u_seq_len: Fixed number of vectors produced for ``userReviews``.
        pad_value: Vector used both for tokens missing from `embedding_map`
            and for padding sequences shorter than the target length.
        embedding_map: Mapping from token to embedding vector.

    Returns:
        A function ``embed(row)`` that returns a copy of `row` in which
        ``userReviews`` and ``movieReviews`` are replaced by lists of exactly
        `u_seq_len` / `i_seq_len` vectors. The row passed in is left
        untouched.
    """
    def embed_text(text, seq_len):
        # The cell holds the string form of a token list; split it on commas
        # and quotes, drop the debris, and truncate to the target length.
        tokens = list(filter(filter_jud, re.split(r',|\'', str(text))))[:seq_len]
        vectors = [embedding_map.get(token, pad_value) for token in tokens]
        # Right-pad so every sequence has exactly seq_len entries.
        return vectors + [pad_value] * (seq_len - len(tokens))

    def embed(row):
        # Work on a copy: mutating the object handed over by DataFrame.apply
        # is not supported by pandas and would also alter the caller's data.
        embedded = row.copy()
        embedded["userReviews"] = embed_text(row["userReviews"], u_seq_len)
        embedded["movieReviews"] = embed_text(row["movieReviews"], i_seq_len)
        return embedded
    return embed

In [96]:
# Zero vector used for unknown tokens and for padding.
zero_vector = np.zeros(emb_size)
embedding_fn = get_embed_and_pad_func(i_seq_len, u_seq_len, zero_vector, embedding_map)

# Embed both splits row by row.
train_embedded = train.apply(embedding_fn, axis=1)
test_embedded = test.apply(embedding_fn, axis=1)

The code below has been tested.

In [109]:
# Inspect the embedded user reviews of the training split.
print(train_embedded["userReviews"].head(10))
# Use positional access: the row labelled 0 is not guaranteed to survive the
# train/test split, in which case `.loc[0, ...]` would raise a KeyError.
print(train_embedded["userReviews"].iloc[0])

0    [[0.11891, 0.15255, -0.082073, -0.74144, 0.759...
1    [[0.53074, 0.40117, -0.40785, 0.15444, 0.47782...
2    [[0.11891, 0.15255, -0.082073, -0.74144, 0.759...
3    [[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -...
4    [[0.20782, 0.12713, -0.30188, -0.23125, 0.3017...
5    [[1.3031, 1.2902, 0.10244, -1.2569, 0.86197, -...
6    [[0.57049, -0.0077854, -0.70766, -0.31785, 0.8...
7    [[0.11891, 0.15255, -0.082073, -0.74144, 0.759...
8    [[0.19253, 0.10006, 0.063798, -0.087664, 0.522...
9    [[-0.38796, 0.20422, 0.31733, -0.41985, 0.4740...
Name: userReviews, dtype: object
[array([ 1.1891e-01,  1.5255e-01, -8.2073e-02, -7.4144e-01,  7.5917e-01,
       -4.8328e-01, -3.1009e-01,  5.1476e-01, -9.8708e-01,  6.1757e-04,
       -1.5043e-01,  8.3770e-01, -1.0797e+00, -5.1460e-01,  1.3188e+00,
        6.2007e-01,  1.3779e-01,  4.7108e-01, -7.2874e-02, -7.2675e-01,
       -7.4116e-01,  7.5263e-01,  8.8180e-01,  2.9561e-01,  1.3548e+00,
       -2.5701e+00, -1.3523e+00,  4.5880e-01,  1.0068e+0

In [105]:
# Tokenise the first training row by hand to check the embedding step.
# Positional access is used because the row labelled 0 may have been removed
# by the train/test split, which would make `.loc[0, ...]` raise a KeyError.
first_user_review = train["userReviews"].iloc[0]
sentence = list(filter(filter_jud, re.split(r',|\'', str(first_user_review))))[0:5]
print(sentence)
pad_value = np.array([0.0] * emb_size)


['i', 'really', 'like', 'the', 'characters']


In [106]:
# Look up each token's vector, falling back to the pad vector for unknown tokens.
reviews = [embedding_map.get(word, pad_value) for word in sentence]
print(reviews)

[array([ 1.1891e-01,  1.5255e-01, -8.2073e-02, -7.4144e-01,  7.5917e-01,
       -4.8328e-01, -3.1009e-01,  5.1476e-01, -9.8708e-01,  6.1757e-04,
       -1.5043e-01,  8.3770e-01, -1.0797e+00, -5.1460e-01,  1.3188e+00,
        6.2007e-01,  1.3779e-01,  4.7108e-01, -7.2874e-02, -7.2675e-01,
       -7.4116e-01,  7.5263e-01,  8.8180e-01,  2.9561e-01,  1.3548e+00,
       -2.5701e+00, -1.3523e+00,  4.5880e-01,  1.0068e+00, -1.1856e+00,
        3.4737e+00,  7.7898e-01, -7.2929e-01,  2.5102e-01, -2.6156e-01,
       -3.4684e-01,  5.5841e-01,  7.5098e-01,  4.9830e-01, -2.6823e-01,
       -2.7443e-03, -1.8298e-02, -2.8096e-01,  5.5318e-01,  3.7706e-02,
        1.8555e-01, -1.5025e-01, -5.7512e-01, -2.6671e-01,  9.2121e-01],
      dtype=float32), array([ 1.6675e-03, -1.6376e-01, -9.2648e-02, -3.3466e-01,  7.3972e-01,
       -2.3523e-01, -3.4941e-01,  1.9102e-01, -4.2223e-01,  5.8440e-01,
       -2.7604e-01,  4.6605e-01, -9.7154e-01,  3.5971e-02,  8.9279e-01,
        5.0195e-01,  8.9409e-01,  3.5050

# DeepCoNN Recommendation Model

In [99]:
# modeling imports
import tensorflow as tf
from keras.models import Model
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Conv1D, MaxPooling1D, Flatten
from keras.layers import Input, Dense
# Import the merge layers from the public `keras.layers` namespace; the
# `keras.layers.merge` submodule is an internal path that is not available
# in newer Keras releases.
from keras.layers import Add, Dot, Concatenate

<img src="picture/CNN.png" alt="CNN" style="zoom:100%;" />

<img src="picture/image-20221030155105653.png" alt="image-20221030155105653" style="zoom:30%;" />

<img src="picture/pooling.png" alt="pooling" style="zoom:60%;" />

<img src="picture/image-20221027112255535.png" alt="image-20221027112255535" style="zoom:30%;" />

In [100]:
class DeepCoNN():
    """Two-tower convolutional rating model built with the Keras functional API.

    One tower processes the embedded user reviews and the other the embedded
    movie reviews. Each tower is Conv1D -> MaxPooling1D -> Flatten -> Dense.
    The prediction is the sum of a single linear neuron applied to the
    concatenated tower outputs and the dot product of the two tower outputs.
    """

    def __init__(self,
                 embedding_size,
                 hidden_size,
                 u_seq_len,
                 m_seq_len,
                 filters=2,
                 kernel_size=10, # kernel size 10*50
                 strides=6):
        """Create both towers and the linear output neuron.

        Args:
            embedding_size: Length of each word vector (last input axis).
            hidden_size: Units of the dense layer that ends each tower.
            u_seq_len: Number of word vectors per user-review input.
            m_seq_len: Number of word vectors per movie-review input.
            filters: Number of Conv1D filters in each tower.
            kernel_size: Conv1D window length (in words).
            strides: Accepted for interface compatibility only.
                NOTE(review): this value is never forwarded to Conv1D, so the
                convolution runs with the layer's default stride. Wiring it
                through would change the model's shapes, so it is left as is.
        """
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.filters = filters
        self.kernel_size = kernel_size
        self.inputU, self.towerU = self.create_deepconn_tower(u_seq_len)
        self.inputM, self.towerM = self.create_deepconn_tower(m_seq_len)
        self.joined = Concatenate()([self.towerU, self.towerM])
        self.outNeuron = Dense(1)(self.joined)

    def create_deepconn_tower(self, max_seq_len):
        """Build one review tower.

        Args:
            max_seq_len: Number of word vectors in each input sequence.

        Returns:
            Tuple ``(input_layer, tower)``: the Keras input tensor of shape
            ``(max_seq_len, embedding_size)`` per sample and the tower's
            final dense output tensor with ``hidden_size`` units.
        """
        input_layer = Input(shape=(max_seq_len, self.embedding_size))
        tower = Conv1D(filters=self.filters,
                       kernel_size=self.kernel_size,
                       activation="tanh")(input_layer)
        tower = MaxPooling1D()(tower)
        tower = Flatten()(tower)
        tower = Dense(self.hidden_size, activation="relu")(tower)
        return input_layer, tower

    def create_deepconn_dp(self):
        """Assemble and compile ``self.model``.

        The output is the linear neuron plus the dot product of the two
        tower outputs; the model is compiled with the Adam optimizer and
        mean-squared-error loss.
        """
        dotproduct = Dot(axes=1)([self.towerU, self.towerM])
        output = Add()([self.outNeuron, dotproduct])
        self.model = Model(inputs=[self.inputU, self.inputM], outputs=[output])
        self.model.compile(optimizer='Adam', loss='mse')

    def train(self, train_data, batch_size, epochs=3500):
        """Compile the model and fit it on `train_data`.

        Args:
            train_data: DataFrame with columns ``userReviews`` and
                ``movieReviews`` (each cell a sequence of word vectors) and
                ``overall`` (the target rating).
            batch_size: Mini-batch size passed to ``fit``.
            epochs: Number of training epochs.

        Side effects:
            Sets ``self.model``, ``self.train_inputs``,
            ``self.train_outputs`` and ``self.history``. 5% of the data is
            used as validation split, and a TensorBoard callback is attached.
        """
        tensorboard = TensorBoard()
        self.create_deepconn_dp()
        # summary() prints the table itself and returns None, so it must not
        # be wrapped in print() (that would emit a stray "None").
        self.model.summary()

        # Stack the per-row sequences into dense arrays for Keras.
        user_reviews = np.array(list(train_data.loc[:, "userReviews"]))
        movie_reviews = np.array(list(train_data.loc[:, "movieReviews"]))

        self.train_inputs = [user_reviews, movie_reviews]
        self.train_outputs = train_data.loc[:, "overall"]

        self.history = self.model.fit(self.train_inputs,
                                      self.train_outputs,
                                      callbacks=[tensorboard],
                                      validation_split=0.05,
                                      batch_size=batch_size,
                                      epochs=epochs)
        
        

In [114]:
# Training hyper-parameters.
hidden_size = 256
batch_size = 32

# Build the model, train it for a few epochs and persist it to disk.
deepconn = DeepCoNN(emb_size, hidden_size, u_seq_len, i_seq_len)
deepconn.train(train_embedded, batch_size, epochs=5)
deepconn.model.save("cnn.h5")

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 252, 50)]    0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 758, 50)]    0           []                               
                                                                                                  
 conv1d_6 (Conv1D)              (None, 243, 2)       1002        ['input_7[0][0]']                
                                                                                                  
 conv1d_7 (Conv1D)              (None, 749, 2)       1002        ['input_8[0][0]']                
                                                                                            

In [115]:
# Stack the embedded test sequences into dense arrays, in the same
# [user, movie] order the model's inputs were declared.
user_reviews = np.array(test_embedded["userReviews"].tolist())
movie_reviews = np.array(test_embedded["movieReviews"].tolist())
test_inputs = [user_reviews, movie_reviews]

# dat = pd.DataFrame(test_inputs)
# dat.to_csv("data/test_data.csv")

In [116]:
# Ground-truth ratings as a column vector so they line up with the
# model's predictions.
true_rating = np.array(test_embedded["overall"].tolist()).reshape((-1, 1))
predictions = deepconn.model.predict(test_inputs)

# Mean squared error over the held-out rows.
error = (predictions - true_rating) ** 2
print("MSE:", np.mean(error))


MSE: 1.3447838835504047


In [104]:
# Side-by-side view: predicted rating (left column) vs. true rating (right column).
output = np.hstack((predictions, true_rating))
print(output)

[[4.79049349 3.        ]
 [4.95386696 5.        ]
 [4.17135191 1.        ]
 [4.06208944 5.        ]
 [4.38356161 5.        ]
 [3.64310002 5.        ]
 [3.56123805 3.        ]
 [4.40784454 5.        ]
 [3.92939067 5.        ]
 [4.95962954 4.        ]
 [3.8214159  4.        ]
 [3.94046354 4.        ]
 [3.76577306 4.        ]
 [4.08056021 5.        ]
 [4.57791376 5.        ]
 [4.3059926  5.        ]
 [5.10263157 5.        ]
 [4.00244188 5.        ]
 [4.02105284 5.        ]
 [4.1712327  1.        ]
 [4.33652973 5.        ]
 [4.23451519 4.        ]
 [5.09906769 5.        ]
 [4.70152044 5.        ]
 [3.74713302 4.        ]
 [3.98995304 2.        ]
 [4.70068693 5.        ]
 [3.77823186 5.        ]
 [3.73325586 5.        ]
 [4.26664591 4.        ]
 [3.84562612 5.        ]
 [4.39169407 5.        ]
 [4.40557051 5.        ]
 [4.47122812 5.        ]
 [4.38040686 5.        ]
 [3.56665754 4.        ]
 [4.27680492 5.        ]
 [4.59419394 5.        ]
 [4.09782457 4.        ]
 [4.66657877 5.        ]
