In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import gzip
import simplejson
import json
import time

  return f(*args, **kwds)


In [35]:
class DataImporter(object):
    def __init__(self, path_to_data):
        # get the raw data
        raw_data = [json.loads(i) for i in gzip.open(path_to_data, "rt")]
        raw_data = raw_data[:20]
        self.data = pd.DataFrame(raw_data)
        self.num_reviews = len(self.data)
        
        # get the counts of users and products
        self.reviewers = self.data["reviewerID"].unique()
        self.num_reviewers = len(self.reviewers)
        self.products = self.data["asin"].unique()
        self.num_products = len(self.products)
        
        # create int-to-name dictionaries for each
        self.reviewer_to_num = {reviewer.lower(): idx for idx, reviewer in enumerate(self.reviewers)}
        self.num_to_reviewer = {self.reviewer_to_num[reviewer]: reviewer for reviewer in self.reviewer_to_num}
        self.product_to_num = {product.lower(): idx for idx, product in enumerate(self.products)}
        self.num_to_product = {self.product_to_num[product]: product for product in self.product_to_num}
        
    def create_train_test_split(self, frac_test):
        # get test data indices
        test_indices = np.random.choice(self.num_reviews,
                                         size=int(self.num_reviews * frac_test),
                                         replace=False)
        
        # split raw data into train/test
        raw_test = self.data.iloc[test_indices, :]
        raw_train = self.data.drop(test_indices).dropna()
        
        # get dimensions of the matrices
        dim = (self.num_reviewers, self.num_products)
        self.test_matrix = self.populate_user_product_review_matrix(dim, raw_test)
        self.train_matrix = self.populate_user_product_review_matrix(dim, raw_train)
        print(self.train_matrix)
        
        self.test = self.permute_matrix(self.test_matrix)
        self.train = self.permute_matrix(self.train_matrix)
        
    def permute_matrix(self, mat):
        perm = []
        for u_idx, row in enumerate(mat):
            for prod_idx, score in enumerate(row):
                if score > 0.0:
                    print(row)
                    print(score)
                    user_cpy = row.copy()
                    score = user_cpy[prod_idx]
                    user_cpy[prod_idx] = 0
                    prod_cpy = mat[:, prod_idx]
                    prod_cpy[u_idx] = 0
                    perm.append({"user": user_cpy,
                                 "reviewerID": self.num_to_reviewer[u_idx],
                                 "product": prod_cpy,
                                 "score": score,
                                 "asin": self.num_to_product[prod_idx]
                                 })
        return perm
    
    def populate_user_product_review_matrix(self, dimensions, dataset):
        assert isinstance(dataset, pd.DataFrame)
        ret_matrix = np.zeros(dimensions)
        for idx, row in dataset.iterrows():
            ret_matrix[self.reviewer_to_num[row["reviewerID"].lower()], self.product_to_num[row["asin"].lower()]] = row["overall"]
        return ret_matrix

In [36]:
data_importer = DataImporter("data/reviews_Amazon_Instant_Video_5.json.gz")
data_importer.create_train_test_split(0.1)


[[ 2.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 4.  0.  0.  0.]
 [ 5.  0.  0.  0.]
 [ 5.  0.  0.  0.]
 [ 0.  3.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  5.  0.  5.]
 [ 0.  3.  0.  0.]
 [ 0.  4.  0.  0.]
 [ 0.  0.  4.  0.]
 [ 0.  0.  3.  0.]
 [ 0.  0.  3.  5.]
 [ 0.  0.  5.  0.]
 [ 0.  0.  2.  0.]
 [ 0.  0.  0.  3.]
 [ 0.  0.  0.  3.]]
[ 5.  0.  0.  0.]
5.0
[ 0.  3.  0.  0.]
3.0
[ 2.  0.  0.  0.]
2.0
[ 1.  0.  0.  0.]
1.0
[ 4.  0.  0.  0.]
4.0
[ 5.  0.  0.  0.]
5.0
[ 5.  0.  0.  0.]
5.0
[ 0.  3.  0.  0.]
3.0
[ 0.  5.  0.  5.]
5.0
[ 0.  0.  0.  5.]
5.0
[ 0.  3.  0.  0.]
3.0
[ 0.  4.  0.  0.]
4.0
[ 0.  0.  4.  0.]
4.0
[ 0.  0.  3.  0.]
3.0
[ 0.  0.  3.  5.]
3.0
[ 0.  0.  0.  5.]
5.0
[ 0.  0.  5.  0.]
5.0
[ 0.  0.  2.  0.]
2.0
[ 0.  0.  0.  3.]
3.0
[ 0.  0.  0.  3.]
3.0


In [13]:
class wide_and_deep(object):
    
    def __init__(self, train, test):
        
        self.train = np.copy(train)
        self.test = np.copy(test)
        
        np.random.shuffle(self.train)
        np.random.shuffle(self.test)
        
        self.num_examples = self.train.shape[0] + self.test.shape[0]
        self.perc_test = self.test.shape[0] / self.train.shape[0]
        
        self.train = self.add_all_features_columns(self.train)
        self.test = self.add_all_features_columns(self.test)
        
    def add_all_features_columns(self, dataset):
        dataset = np.array(list(map(self.add_num_ratings, dataset)) )
        dataset = np.array(list(map(self.add_top_ratings, dataset)) )
        dataset = np.array(list(map(self.add_bottom_ratings, dataset)) )
        dataset = np.array(list(map(self.add_average_ratings, dataset)) )
        dataset = np.array(list(map(self.add_percent_rating, dataset)) )
        
        dataset = pd.DataFrame(list(dataset))
        return dataset
        
    def add_num_ratings(self, data):
        data['num_user_ratings'] = np.count_nonzero(data['user'])
        data['num_movie_ratings'] = np.count_nonzero(data['product'])
        return data
        
    def add_top_ratings(self, data):
        data['top_user_rating'] = np.amax(data['user'])
        data['top_movie_rating'] = np.amax(data['product'])
        return data
    
    def add_bottom_ratings(self, data):
        data['bottom_user_rating'] = np.amin(data['user'])
        data['bottom_movie_rating'] = np.amin(data['product'])
        return data
    
    def add_average_ratings(self, data):
        data['average_user_rating'] = np.average(data['user'])
        data['average_movie_rating'] = np.average(data['product'])
        return data
    
    def add_percent_rating(self, data):
        data['percent_one_star'] = len(np.where(data['user'] == 1.0)[0])/len(data['user'])
        data['percent_two_star'] = len(np.where(data['user'] == 2.0)[0])/len(data['user'])
        data['percent_three_star'] = len(np.where(data['user'] == 3.0)[0])/len(data['user'])
        data['percent_four_star'] = len(np.where(data['user'] == 4.0)[0])/len(data['user'])
        data['percent_five_star'] = len(np.where(data['user'] == 5.0)[0])/len(data['user'])
        return data
        
    def total_ratings(self, data):
        count = 0.0
        for i in data:
            if i['score'] != 0.0:
                count += 1
        print(count)
        
    def build_model(self, deep_layers, deep_layer_activation, wide_output_dim, wide_activation):
        
        N, D = self.train.shape
        
        with tf.name_scope("input"):
            X = tf.placeholder(tf.float32, [None, D], name="x_input")
        
        with tf.name_scope("input_normalization"):
            X = tf.layers.batch_normalization(X)
        
        T = lambda x: tf.matmul(tf.reshape(x, [x.shape[0], 1]), tf.reshape(x, [1, x.shape[0]]))
        cross_prod_mat = tf.map_fn(T, X)
        cross_prod = tf.map_fn(lambda x: tf.reshape(x, [-1]), cross_prod_mat)
        
        # wide part
        wide = tf.layers.dense(cross_prod, wide_output_dim, activation=wide_activation)
        
        deep = X
        
        # deep part
        for i in deep_layers:
            deep = tf.layers.dense(deep, i, activation=deep_layer_activation)
            
        # combine the output of wide and deep
        wide_and_deep = tf.concat([wide, deep], 1)
        logits = tf.layers.dense(wide_and_deep, 5, name="logits")
        
    def add_optimizer(self, learning_rate):
        
        default_graph = tf.get_default_graph()
        logits = default_graph.get_tensor_by_name("logits/BiasAdd:0")
        
        with tf.name_scope("optimizer_input"):
            global_step = tf.placeholder(tf.int64, [], name="global_step")
            y = tf.placeholder(dtype=tf.int64, shape=[None], name="y_input")
        
        with tf.name_scope("cross_entropy"):
                xentropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(y, 4), logits=logits)
                with tf.name_scope("loss"):
                    loss = tf.reduce_mean(xentropy, name="loss")
        
        with tf.name_scope("softmax"):
            # Softmax layer
            softmax = tf.nn.softmax(logits)
            # Predict train
            y_hat = tf.argmax(softmax, axis=1, name="y_hat")
            
        with tf.name_scope('train'):
            learning_rate = tf.train.exponential_decay(learning_rate, global_step, 1000, 0.96, staircase=True)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
            training_op = optimizer.minimize(xentropy, name="training_op")
            
        with tf.name_scope("accuracy"):
            # Train Evaluation
            train_correct = tf.equal(y, y_hat)
            tf.reduce_mean(tf.cast(train_correct, tf.float64), name="acc")
            
    def train_model(self, sess, n_epochs, batch_size):
        
        # New***
        init = tf.global_variables_initializer() 

        # get default graph
        default_graph = tf.get_default_graph()
        
        # Get individual graph tensors
        X = default_graph.get_tensor_by_name("input/x_input:0")
        Y = default_graph.get_tensor_by_name("optimizer_input/y_input:0")
        global_step = default_graph.get_tensor_by_name("optimizer_input/global_step:0")
        
        # Get specific operations in graph
        training_op = default_graph.get_operation_by_name("train/training_op")
        accuracy = default_graph.get_tensor_by_name("accuracy/acc:0")
        loss = default_graph.get_tensor_by_name("cross_entropy/loss/loss:0")
        
        # Create logdir
        now = time.strftime("%Y%m%d%H%M%S")
        root_logdir = "tf_logs"
        logdir = "{root_logdir}/run"
        
        # Create Tensorboard file_writers
        summary_writer = tf.summary.FileWriter(logdir, default_graph)
        
        # Create Tensorboard Summaries 
        acc_train_summary = tf.summary.scalar("train_accuracy", accuracy)
        acc_test_summary = tf.summary.scalar("test_accuracy", accuracy)
        loss_summary = tf.summary.scalar('train_loss', loss)
        
        # Create Tensorboard histograms
        conv_histograms = []
        for i in tf.global_variables():
            if i.name[:6] == "conv2d":
                # need to switch colon with underscore because colons aren't allowed in summary names
                conv_histograms.append(tf.summary.histogram(i.name.replace(":", "_"), i))
        
        # Wrapper tells Tensorboard to assume everything is within same session
        sess.run(init)

        step = 0
        for epoch in range(1, n_epochs + 1):
            n_iters = int((self.num_examples * (1 - self.perc_test)) / batch_size)
            print_interval = 100
            best_test = 0.

            print(f"\nStarting epoch {epoch}, running for {n_iters} iterations")
            print(f"Printing evaluation metrics every {print_interval} iterations\n")

            for iteration in range(1, n_iters):
                step += 1
                indices = np.random.choice(int(self.num_examples * (1 - self.perc_test)), batch_size)
                X_batch = self.train[indices]
                y_batch = self.train[indices]
                sess.run(training_op, feed_dict={X: X_batch,
                                                 Y: y_batch,
                                                 global_step: step})

                if iteration % print_interval == 0:

                    acc_train, train_summary = sess.run([accuracy, tf.summary.merge([loss_summary, acc_train_summary])],
                                                        {X: X_batch, Y: y_batch})

                    acc_test, test_summary = sess.run([accuracy, acc_test_summary], {X: self.test_x, Y: self.test_y})

                    # Add batch train loss & accuracy to Tensorboard
                    merged_histograms = tf.summary.merge([i.eval(session=sess) for i in conv_histograms])
                    summary_writer.add_summary(tf.summary.merge([train_summary, test_summary, merged_histograms]).eval(session=sess), step)
                    summary_writer.flush()

                    # Print out batch evaluation metrics
                    print("Iteration: {} train acc: {:.4f} test acc: {:.4f}".format(iteration, 
                                                                                    acc_train, 
                                                                                    acc_test))
        # Ensure file_writers closed
        summary_writer.close() 

In [14]:
tf.reset_default_graph()
wad = wide_and_deep(data_importer.train, data_importer.test)

In [15]:
wad.build_model([128, 128], tf.nn.relu, 256, tf.nn.relu)

In [16]:
wad.add_optimizer(0.001)

In [24]:
sess = tf.Session()
wad.train_model(sess, 2, 1)


Starting epoch 1, running for 1777 iterations
Printing evaluation metrics every 100 iterations



KeyError: '[1697] not in index'

In [19]:
wad.train.where(wad.train["average_movie_rating"] > 0.0).dropna()

Unnamed: 0,asin,average_movie_rating,average_user_rating,bottom_movie_rating,bottom_user_rating,num_movie_ratings,num_user_ratings,percent_five_star,percent_four_star,percent_one_star,percent_three_star,percent_two_star,product,reviewerID,score,top_movie_rating,top_user_rating,user
