In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model

# Step 1
### Importing the data

In [2]:
# Ratings table: tab-separated, one (userId, itemId, rating, timestamp) row per line.
data0 = pd.read_table(
    "./archive/ml-100k/u.data",
    sep="\t",
    names=["userId", "itemId", "rating", "timestamp"],
)
data0.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# User table: pipe-separated demographic attributes, one user per line.
users0 = pd.read_table(
    "./archive/ml-100k/u.user",
    sep="|",
    names=["userId", "age", "gender", "occupation", "zip"],
)
users0.head()

Unnamed: 0,userId,age,gender,occupation,zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# Movie table: pipe-separated; after the five descriptive columns, every
# remaining column is a 0/1 genre flag.
# NOTE(review): "imbd" looks like a typo for "imdb". The column is dropped by
# that exact name in the cleaning step, so rename both places together.
# NOTE(review): if this read ever fails with UnicodeDecodeError the file is
# presumably not UTF-8 -- confirm the dataset's encoding and pass `encoding=`.
movies0 = pd.read_table(
    "./archive/ml-100k/u.item",
    sep="|",
    names=[
        "movieId", "title", "movieRelease", "videoRelease", "imbd",
        "unknown", "action", "adventure", "animation", "children",
        "comedy", "crime", "documentary", "drama", "fantasy",
        "noir", "horror", "musical", "mystery", "romance",
        "scifi", "thriller", "war", "western",
    ],
)
movies0.head()

Unnamed: 0,movieId,title,movieRelease,videoRelease,imbd,unknown,action,adventure,animation,children,...,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Step 2
### Cleaning the data

In [5]:
def convertDates(series):
    """Encode "DD-Mon-YYYY" date strings as integers of the form YYYYMMDD.

    Parameters
    ----------
    series : iterable
        Date strings such as "01-Jan-1995". Any entry that is not a string
        (float NaN, None, numpy scalars, ...) is treated as a missing date.

    Returns
    -------
    list of int
        One value per input entry: day + 100 * month + 10000 * year for a
        date string, or -1 for a missing date.

    Raises
    ------
    KeyError
        If the month part is not an English three-letter abbreviation.
    ValueError
        If the day or year part is not an integer.
    IndexError
        If a string has fewer than three "-"-separated parts.
    """
    monthDict = {"Jan" : 1, "Feb" : 2, "Mar" : 3, "Apr" : 4, "May" : 5, "Jun" : 6, "Jul" : 7, "Aug" : 8, "Sep" : 9, "Oct" : 10, "Nov" : 11, "Dec" : 12}
    dates = []
    for val in series:
        # Only strings can be parsed. Testing for "is a str" instead of
        # "is not a float" also covers None and non-float missing markers,
        # which would otherwise crash on .split().
        if not isinstance(val, str):
            dates.append(-1)
            continue
        parts = val.split("-")
        dates.append(int(parts[0]) + 100 * monthDict[parts[1]] + 10000 * int(parts[2]))
    return dates

def convertZips(series):
    """Convert zip codes to integers, marking unconvertible entries with -1.

    Parameters
    ----------
    series : iterable
        Zip codes, typically strings. Non-numeric codes (e.g. "V3N4P") and
        missing values are allowed.

    Returns
    -------
    list of int
        int(value) for each entry that converts, -1 otherwise. Leading zeros
        are not preserved, e.g. "02139" becomes 2139.
    """
    zips = []
    for val in series:
        try:
            zips.append(int(val))
        # Catch only conversion failures: a bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit and record them as bad zip codes.
        except (ValueError, TypeError, OverflowError):
            zips.append(-1)
    return zips

In [6]:
# Drop the timestamp and binarise the target: ratings 3-5 -> 1, ratings 1-2 -> 0.
# A new frame is derived from data0 rather than calling
# `data.rating.replace(..., inplace=True)`: that chained in-place call acts on
# an intermediate Series and does not update `data` under pandas Copy-on-Write.
# The threshold comparison also maps any value outside 1-5 (or NaN) to 0/1,
# whereas the replace() mapping left such values untouched.
data = data0.drop(columns="timestamp").assign(
    rating=lambda df: (df["rating"] >= 3).astype(int)
)
data.head()

Unnamed: 0,userId,itemId,rating
0,196,242,1
1,186,302,1
2,22,377,0
3,244,51,0
4,166,346,0


In [7]:
# Encode the user table numerically. The frame is built from users0 in one
# non-mutating step; the chained `users.col.replace(..., inplace=True)` form
# does not update the frame under pandas Copy-on-Write.
#   zip        -> int, -1 when the code is not numeric (see convertZips)
#   gender     -> -1 for "M", 1 for "F" (any other value becomes NaN)
#   occupation -> integer code in order of first appearance, which is the
#                 same numbering as indexing into occupation.unique()
occupationCodes, occupationNames = pd.factorize(users0["occupation"])
users = users0.assign(
    zip=convertZips(users0["zip"]),
    gender=users0["gender"].map({"M": -1, "F": 1}),
    occupation=occupationCodes,
)

users.head()

Unnamed: 0,userId,age,gender,occupation,zip
0,1,24,-1,0,85711
1,2,53,1,1,94043
2,3,23,-1,2,32067
3,4,24,-1,0,43537
4,5,33,1,1,15213


In [8]:
# Keep only the model inputs: remove the video release date, URL and title
# columns, then encode the release date as a YYYYMMDD integer (-1 if missing).
movies = movies0.drop(columns=["videoRelease", "imbd", "title"])
movies = movies.assign(movieRelease=convertDates(movies["movieRelease"]))

movies.head()

Unnamed: 0,movieId,movieRelease,unknown,action,adventure,animation,children,comedy,crime,documentary,...,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,19950101,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,19950101,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,19950101,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,19950101,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,19950101,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# Build one row per rating: user features, then movie features, then the target.
xUsers = users.to_numpy()[:, 1:]    # every column except userId
xMovies = movies.to_numpy()[:, 1:]  # every column except movieId

userIds = data.userId.values
movieIds = data.itemId.values
ratings = data.rating.values

# The gather below finds a user/movie by position (id - 1). That is only
# correct when each table lists ids 1..N in row order, so check it up front
# instead of silently pairing ratings with the wrong features.
assert (users.userId.values == np.arange(1, len(users) + 1)).all(), "userId must be 1..N in row order"
assert (movies.movieId.values == np.arange(1, len(movies) + 1)).all(), "movieId must be 1..N in row order"

rows = len(data)
cols = xUsers.shape[1] + xMovies.shape[1] + 1

# Vectorised replacement for the per-row Python loop: index both feature
# tables with the full id arrays and stack them next to the target column.
X = np.hstack((xUsers[userIds - 1], xMovies[movieIds - 1], ratings.reshape(-1, 1))).astype(float)
assert X.shape == (rows, cols)

# Column names follow the same "skip the id column" slicing as the arrays,
# so they cannot drift out of step with the feature tables.
featureNames = list(users.columns[1:]) + list(movies.columns[1:]) + ["rating"]
DATA = pd.DataFrame(X, columns=featureNames)
# Discard rows whose zip code or release date could not be parsed (stored as -1).
DATA = DATA[(DATA["zip"] != -1) & (DATA["movieRelease"] != -1)]

X = DATA.to_numpy()
DATA.head()

Unnamed: 0,age,gender,occupation,zip,movieRelease,unknown,action,adventure,animation,children,...,noir,horror,musical,mystery,romance,scifi,thriller,war,western,rating
0,49.0,-1.0,2.0,55105.0,19970124.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,39.0,1.0,3.0,0.0,19970101.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,25.0,-1.0,2.0,40206.0,19940101.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28.0,-1.0,0.0,80525.0,19940101.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,47.0,-1.0,7.0,55113.0,19970101.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 3
### Splitting the Data

In [10]:
# Features are every column but the last; the target is the last column.
points = X[:, : -1]
targets = X[:, -1 :]  # the slice keeps the target two-dimensional, shape (n, 1)

# Seed the split so it is identical on every run. The training cell may reuse
# a model saved by an earlier run; with an unseeded split that model could
# have been fitted on rows that now land in xTest, inflating the test score.
# NOTE(review): a "myModel" saved before this seed was introduced should be
# deleted and retrained once.
xTrain, xTest, yTrain, yTest = train_test_split(points, targets, test_size=0.2, random_state=42)

# Step 4
### Training the Model

In [11]:

# Reuse a previously saved model when one can be loaded, otherwise train a new one.
# NOTE(review): a reloaded model was fitted on whatever data and split existed
# when it was saved -- delete "myModel" after changing either.
try:
    model = load_model("myModel")
# Narrowed from a bare `except:` so KeyboardInterrupt and genuine bugs are no
# longer mistaken for "no saved model".
# NOTE(review): presumably a missing or unreadable model surfaces as OSError
# or ValueError depending on the Keras version -- confirm for the pinned one.
except (OSError, ValueError):
    model = Sequential()
    # Input width is taken from the training data rather than hard-coded.
    model.add(Dense(30, input_dim=xTrain.shape[1], activation="relu"))
    model.add(Dense(50, activation="relu"))
    model.add(Dense(50, activation="relu"))
    # Single sigmoid unit paired with binary cross-entropy for the 0/1 target.
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(xTrain, yTrain, epochs=30, batch_size=10)

In [12]:
# Persist the model under "myModel" -- the same path the training cell tries
# to load from, so a later run can reuse it instead of retraining.
model.save("myModel")

INFO:tensorflow:Assets written to: myModel/assets


In [13]:
# Score the held-out split: threshold the model outputs at 0.5 to get class
# labels, then measure the fraction that match the true targets.
yPredict = model.predict(xTest)
predictedLabels = yPredict > 0.5

# NOTE(review): this notebook does not show a baseline; confirm the score
# beats always predicting the majority class, i.e.
# max(yTest.mean(), 1 - yTest.mean()).
accuracy_score(yTest, predictedLabels)

0.8282518768193657