In [1]:
# Import libraries

import tensorflow as tf

from tensorflow import keras

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm

from pyspark.sql import SparkSession
# Obtain the Spark session used by the cells below; getOrCreate() hands back
# an already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName('Recommendation_system')
    .getOrCreate()
)

from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType, DateType

In [2]:
# First look at the raw ratings file: read it with a single ':' as the
# delimiter and no schema, so Spark assigns its default _cN column names.
df = (
    spark.read
    .option("sep", ":")
    .csv("data/mldataset/ratings.dat")
)
df.show(n=100, truncate=True)

+---+----+----+----+---+----+---------+
|_c0| _c1| _c2| _c3|_c4| _c5|      _c6|
+---+----+----+----+---+----+---------+
|  1|null|1193|null|  5|null|978300760|
|  1|null| 661|null|  3|null|978302109|
|  1|null| 914|null|  3|null|978301968|
|  1|null|3408|null|  4|null|978300275|
|  1|null|2355|null|  5|null|978824291|
|  1|null|1197|null|  3|null|978302268|
|  1|null|1287|null|  5|null|978302039|
|  1|null|2804|null|  5|null|978300719|
|  1|null| 594|null|  4|null|978302268|
|  1|null| 919|null|  4|null|978301368|
|  1|null| 595|null|  5|null|978824268|
|  1|null| 938|null|  4|null|978301752|
|  1|null|2398|null|  4|null|978302281|
|  1|null|2918|null|  4|null|978302124|
|  1|null|1035|null|  5|null|978301753|
|  1|null|2791|null|  4|null|978302188|
|  1|null|2687|null|  3|null|978824268|
|  1|null|2018|null|  4|null|978301777|
|  1|null|3105|null|  5|null|978301713|
|  1|null|2797|null|  4|null|978302039|
|  1|null|2321|null|  3|null|978302205|
|  1|null| 720|null|  3|null|978300760|


In [3]:
# Re-read the ratings with an explicit schema so the columns get real names
# and integer types. Splitting on a single ':' leaves an empty field between
# each pair of real values (the nulls in the preview), so the schema carries
# three throwaway string columns, Null1..Null3, to absorb them.
# NOTE(review): Spark 3.0+ reportedly accepts a multi-character sep ("::"),
# which would make the placeholder columns unnecessary - confirm the version.

headerd_schema = (
    StructType()
    .add("UserId", IntegerType())
    .add("Null1", StringType())
    .add("MovieId", IntegerType())
    .add("Null2", StringType())
    .add("Rating", IntegerType())
    .add("Null3", StringType())
    .add("TimeStamp", IntegerType())
)

df = spark.read.csv(
    "data/mldataset/ratings.dat",
    schema=headerd_schema,
    sep=":",
    header=False,
)
df.show()

+------+-----+-------+-----+------+-----+---------+
|UserId|Null1|MovieId|Null2|Rating|Null3|TimeStamp|
+------+-----+-------+-----+------+-----+---------+
|     1| null|   1193| null|     5| null|978300760|
|     1| null|    661| null|     3| null|978302109|
|     1| null|    914| null|     3| null|978301968|
|     1| null|   3408| null|     4| null|978300275|
|     1| null|   2355| null|     5| null|978824291|
|     1| null|   1197| null|     3| null|978302268|
|     1| null|   1287| null|     5| null|978302039|
|     1| null|   2804| null|     5| null|978300719|
|     1| null|    594| null|     4| null|978302268|
|     1| null|    919| null|     4| null|978301368|
|     1| null|    595| null|     5| null|978824268|
|     1| null|    938| null|     4| null|978301752|
|     1| null|   2398| null|     4| null|978302281|
|     1| null|   2918| null|     4| null|978302124|
|     1| null|   1035| null|     5| null|978301753|
|     1| null|   2791| null|     4| null|978302188|
|     1| nul

In [4]:
# Discard the Null1/Null2/Null3 placeholder columns (all null in the preview
# above), leaving UserId, MovieId, Rating and TimeStamp.
df = df.drop(
    *('Null1', 'Null2', 'Null3')
)

In [5]:
# Preview the ratings in ascending TimeStamp order (earliest rating first).
df.sort('TimeStamp').show()

+------+-------+------+---------+
|UserId|MovieId|Rating|TimeStamp|
+------+-------+------+---------+
|  6040|    858|     4|956703932|
|  6040|    593|     5|956703954|
|  6040|   2384|     4|956703954|
|  6040|   1961|     4|956703977|
|  6040|   2019|     5|956703977|
|  6040|   3111|     5|956704056|
|  6040|    573|     4|956704056|
|  6040|   3505|     4|956704056|
|  6040|    213|     5|956704056|
|  6040|   1419|     3|956704056|
|  6040|   1734|     2|956704081|
|  6040|   2503|     5|956704191|
|  6040|    919|     5|956704191|
|  6040|    912|     5|956704191|
|  6040|    527|     5|956704219|
|  6040|    649|     5|956704257|
|  6040|    318|     4|956704257|
|  6040|   1252|     5|956704257|
|  6040|   3289|     5|956704305|
|  6040|    759|     5|956704448|
+------+-------+------+---------+
only showing top 20 rows



In [7]:
# Load the same ratings file into pandas. Fields are separated by "::", a
# multi-character separator, so the Python parsing engine is requested
# explicitly. The file has no header row; the column names are handed to the
# reader through `names` instead of being assigned after the fact.
# Note: this rebinds `df`, replacing the Spark DataFrame built in the earlier
# cells with a pandas DataFrame.
df = pd.read_csv(
    "data/mldataset/ratings.dat",
    sep="::",
    header=None,
    names=['UserId', 'MovieId', 'Rating', 'TimeStamp'],
    engine='python',
)

In [9]:
# Number of distinct user ids and movie ids present in the ratings table.
# dropna=False keeps the result identical to len(Series.unique()).
# NOTE(review): if these counts are meant to size an Embedding layer, confirm
# the ids are contiguous and 0-based; otherwise max id + 1 would be needed.
n_users = df['UserId'].nunique(dropna=False)
n_movies = df['MovieId'].nunique(dropna=False)

# NOTE(review): n_features is not referenced by any cell visible here -
# confirm where it is meant to be used.
n_features = 50

In [None]:
# Build the network as one ordered stack: an embedding over integer ids,
# light dropout, a 150-unit ReLU layer, heavier dropout, and a 50-unit
# softmax output.
# NOTE(review): the sizes 1000 / 64 / 10 / 50 are hard-coded and do not use
# n_users, n_movies or n_features - confirm they match the real inputs.
# NOTE(review): a softmax output trained with 'mse' loss and scored with
# 'accuracy' is an unusual combination - confirm what the target is.
# NOTE(review): `input_length` is reportedly deprecated in Keras 3 - confirm
# against the installed version.
model = keras.models.Sequential([
    keras.layers.Embedding(1000, 64, input_length=10),
    keras.layers.Dropout(0.05),
    keras.layers.Dense(units=150, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=50, activation='softmax'),
])

model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['accuracy'],
)
