### In this demo we will train an XGBoost model and use it as a cascade model

In [None]:
import xgboost as xgb
import pandas as pd
import memsql
import numpy as np

### First of all, read reviews data from a file

In [None]:
# Load the reviews file: tab-separated, with column names supplied explicitly.
movie_reviews = pd.read_csv(
    "../movie.reviews",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
)
# The timestamp column is discarded right away; only the ids and the rating are kept.
movie_reviews = movie_reviews.drop(columns=["timestamp"])
movie_reviews.head()

### Reading movies data from a file

In [None]:
# Load the pipe-delimited movie file with explicit column names: id, title,
# release dates and URL, followed by one column per genre.
all_movies = pd.read_csv(
    "../movie.data",
    # A literal single-character delimiter: no backslash escape is needed, and
    # pandas does not have to treat the separator as a regular expression.
    sep="|",
    names=[
        "movie_id", "movie_title", "release_date", "video_release_date",
        "IMDb_URL", "unknown", "Action", "Adventure", "Animation",
        "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
        "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
        "Thriller", "War", "Western",
    ],
)
# Drop the descriptive text/date columns; movie_id and the remaining columns
# are what gets joined onto the reviews as movie features.
movie_features = all_movies.drop(
    columns=['IMDb_URL', 'release_date', 'video_release_date', 'movie_title']
)
movie_features.head()

### And also user data

In [None]:
# Load the pipe-delimited user file with explicit column names.
users = pd.read_csv(
    "../user.data",
    # A literal single-character delimiter: no backslash escape is needed, and
    # pandas does not have to treat the separator as a regular expression.
    sep="|",
    names=["user_id", "age", "gender", "occupation", "zip_code"],
)
users.head()

### Here we're going to make one hot encoding of categorical features

In [None]:
# Replace the categorical `occupation` column with one indicator column per
# value, appended to the right of the existing user columns.
users_occupation = users.join(
    pd.get_dummies(users['occupation'], prefix='occupation')
).drop(columns=['occupation'])

# Same treatment for `gender`; `zip_code` is dropped as well, so the result
# holds user_id, age and the indicator columns only.
users_gender = users_occupation.join(
    pd.get_dummies(users_occupation['gender'], prefix='gender')
).drop(columns=['gender', 'zip_code'])

users_gender.head()

### Add average rating as a feature

In [None]:
# Mean rating per movie, as a frame with columns ['movie_id', 'average_rating'].
# Only `rating` is aggregated; `movie_id` is kept as a regular (integer) column
# via as_index=False so later cells can still call set_index('movie_id') on it.
average_rating = (
    movie_reviews.groupby('movie_id', as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'average_rating'})
)

### Let's join movie_features with movie_reviews and add an average rating as a feature

In [None]:
# Attach the per-movie columns (genre flags, then average rating) to every
# review row, looking them up by movie_id.
movie_reviews_features = (
    movie_reviews
    .join(movie_features.set_index('movie_id'), on='movie_id')
    .join(average_rating.set_index('movie_id'), on='movie_id')
)
# Attach the encoded user columns, looked up by user_id.
all_data = movie_reviews_features.join(
    users_gender.set_index('user_id'), on='user_id'
)
# The raw ids are join keys only and are not used as model inputs.
feature_data = all_data.drop(columns=['user_id', 'movie_id'])
feature_data.head()

### Splitting data as 80% train set, 20% as test set.

In [None]:
# Draw 80% of the rows at random for training; random_state fixes the seed so
# the split is reproducible.
train_data = feature_data.sample(frac=0.8, random_state=200)
# Every row whose index label was not sampled above goes to the test set.
test_data = feature_data.loc[~feature_data.index.isin(train_data.index)]

In [None]:
# Report the size of each split.
print("number of rows in train set:", train_data.shape[0])
print("number of rows in  test set:", test_data.shape[0])

### Converting from pandas dataframe to NumPy matrix for the xgboost

In [None]:
# Split each frame into a feature matrix (every column except `rating`) and a
# target vector (`rating`).
# The feature matrices are requested as float explicitly: the frame mixes
# integer, float and one-hot indicator columns, and when the indicators are
# boolean a plain to_numpy() would fall back to an object array.
X_train = train_data.drop(columns=['rating']).to_numpy(dtype=float)
y_train = train_data['rating'].to_numpy()
X_test = test_data.drop(columns=['rating']).to_numpy(dtype=float)
y_test = test_data['rating'].to_numpy()

### Train the model

In [None]:
# Fit a default-configured regressor. The held-out test split doubles as the
# evaluation set that drives early stopping (RMSE, 10 rounds).
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
# deprecated in newer xgboost releases -- confirm against the installed version.
rgr = xgb.XGBRegressor()
rgr.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    early_stopping_rounds=10,
    verbose=0,
);

### Let's inspect the model
We inspect here only the last tree that was built.

In [None]:
# Grab the underlying booster and label its features with the training
# column names (all columns of train_data except the target).
booster = rgr.get_booster()
booster.feature_names = list(train_data.columns.drop('rating'))
# Text dump of the last tree in the booster.
print(booster.get_dump()[-1])

### Connect to memsql (look at README for better understanding how to create MemSQL host)

In [None]:
from memsql.common import database

# Connection settings; the host placeholder must be replaced before running.
memsql_host, memsql_port = "<input_your_host_here>", 3306
memsql_user, memsql_password = "root", ""

memsql_conn = database.connect(
    host=memsql_host,
    port=memsql_port,
    user=memsql_user,
    password=memsql_password,
)

# Make sure the demo database exists and switch the session to it.
memsql_conn.query('CREATE DATABASE IF NOT EXISTS testsm')
memsql_conn.query('USE testsm');

### Deploy model

In [None]:
import lib.memsql_udf as udf_tool
# Hand the trained booster, its feature names and the open connection to the
# project helper that uploads the model to MemSQL.
# NOTE(review): udf_tool.F.SUM presumably selects how the per-tree outputs are
# combined -- confirm in lib/memsql_udf.
udf_tool.upload_xgb_to_memsql(booster, booster.feature_names, memsql_conn, udf_tool.F.SUM)

In [None]:
# List the functions currently defined in the database and show them as a
# table (each result row is converted to a plain dict first).
rows = memsql_conn.query("SHOW FUNCTIONS")
functions_df = pd.DataFrame(list(map(dict, rows)))
functions_df.head()

### LOAD data into the table

In [None]:
import lib.memsql_csv as csv_tool

# Start from a clean table, then load the CSV through the project helper.
memsql_conn.query("DROP TABLE IF EXISTS movie_rating")
# Column list passed to the helper: an "id" column followed by every column
# of all_data, in order.
csv_tool.load_csv_to_table(
    "../movie_ratings.csv",
    "movie_rating",
    ["id", *all_data.columns],
    memsql_conn,
)

## Select user 

In [None]:
# Id of the user to recommend for; it is interpolated into the SQL query and
# used to pick this user's row from users_gender in the cells below.
user_id = 10

### Prepare features

In [None]:
# Columns passed to the eval_tree_0 UDF in the query below, one name per list
# element. NOTE(review): presumably these are the features the first tree
# splits on -- confirm against the generated UDF signature.
pre_apply_features = [
    'Adventure', 'Animation', 'Comedy', 'Documentary', 'Drama', 'Mystery',
    'Romance', 'age', 'average_rating', 'gender_F', 'occupation_artist',
    'occupation_executive', 'occupation_healthcare', 'occupation_homemaker',
    'occupation_librarian', 'occupation_writer',
]
pre_apply_args = ', '.join(pre_apply_features)
# Full feature list for apply_trees; names are backtick-quoted because some
# contain characters such as "'" or "-".
args = ', '.join(f"`{f}`" for f in booster.feature_names)

## Select the best candidate movies by predicted rating

In [None]:
# Score rows in the database: movies the selected user has already rated are
# excluded, eval_tree_0 acts as a pre-filter, and apply_trees ranks what is
# left; only the ten highest-scoring rows are returned.
res = memsql_conn.query(
    f"SELECT DISTINCT movie_id, apply_trees({args}) AS res "
    f"FROM movie_rating "
    f"WHERE movie_id NOT IN (SELECT movie_id FROM movie_rating WHERE user_id = {user_id}) "
    f"AND eval_tree_0({pre_apply_args}) > 1 "
    f"ORDER BY res DESC LIMIT 10"
)
# Build the frame through the `pd` namespace (a bare `DataFrame` is not
# imported in this notebook) and pass the column names up front so an empty
# result still yields a correctly labelled, empty frame.
top_movies = pd.DataFrame(list(res.rows), columns=res.fieldnames)
top_movies.head()

## Build the feature rows for the selected movies and the current user

In [None]:
# Feature rows of the candidate movies, tagged with the selected user's id.
# assign() returns a new frame, so the column is not written into a filtered
# view of movie_features.
top_movies_features = movie_features[
    movie_features['movie_id'].isin(top_movies['movie_id'])
].assign(user_id=user_id)
# Encoded feature row of the selected user.
current_user = users_gender[users_gender['user_id'] == user_id]
# Attach the user's features to every candidate movie row.
top_movies_features = pd.merge(top_movies_features, current_user, how='left', on=['user_id'])
top_movies_features

### Here we get the best possible movies for the current user, based on the model's prediction

In [None]:
# Add each candidate movie's average rating.
top_movies_features_average_rating = top_movies_features.join(
    average_rating.set_index('movie_id'), on='movie_id'
)
# Select the model inputs by name, in the booster's own feature order. The
# join above appends average_rating as the last column, which is not where it
# sits in the training columns, and DMatrix applies `feature_names` by
# position -- so the columns must be reordered before prediction.
top_movies_f = top_movies_features_average_rating[booster.feature_names]
best_movies_predict = booster.predict(
    xgb.DMatrix(top_movies_f, feature_names=booster.feature_names)
)
top_movies_features_average_rating['predict'] = best_movies_predict
# Rank by predicted rating (best first) and attach the titles for display.
top_movies = top_movies_features_average_rating.sort_values(by=['predict'], ascending=False)
top_labels = pd.merge(top_movies, all_movies[['movie_title', 'movie_id']], how='left', on=['movie_id'])
top_labels[['movie_title', 'predict', 'average_rating']]