In [1]:
# %matplotlib notebook

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from xgboost import XGBRegressor

import os
import pickle
import wordcloud as wc
import scipy.sparse as sparse
import xgboost as xgb
import random
import jenkspy
import statsmodels.tools.tools as stattools
from itertools import combinations

import sklearn
from sklearn import tree
from sklearn.model_selection import KFold,train_test_split
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Root folder for all data files, and the "ml-latest" sub-folder holding the
# raw CSVs (both relative to the notebook's working directory).
Dataset_path = "./DataSets/"
ml_path = f"{Dataset_path}ml-latest/"

# Read the ratings table and the movie-metadata table.
ratings, movies = (
    pd.read_csv(f"{ml_path}{csv_name}")
    for csv_name in ("ratings.csv", "movies.csv")
)

In [3]:
# Print a structural summary of the raw ratings frame: number of rows,
# column names and dtypes, and approximate memory usage.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


### Keep only ratings made between 1995 and 2001 (inclusive)

In [50]:
from datetime import datetime

# Derive the calendar year of every rating in one vectorised pass.
# The timestamps are Unix epoch seconds; they are interpreted in UTC so the
# result does not depend on the timezone of the machine running the notebook
# (datetime.fromtimestamp, used previously, converts to *local* time and was
# called once per row through a Python lambda).
ratings['year_rated'] = (
    pd.to_datetime(ratings['timestamp'], unit='s').dt.year.astype('int64')
)

# Keep ratings from 1995 through 2001 (Series.between is inclusive on both
# ends) and renumber the surviving rows from zero.
ratings = ratings[ratings['year_rated'].between(1995, 2001)].reset_index(drop=True)

In [52]:
# Summary of the current `ratings` frame.  nunique()/min()/max() yield the
# same figures as grouping by the column and counting, without materialising
# a full grouped frame for each statistic.
print( f"number of Ratings : { ratings.shape[0] }")
print( f"number of movies : { ratings['movieId'].nunique() }")
print( f"number of users : { ratings['userId'].nunique() }")
print( f"range of rating : ( { ratings['rating'].min() }, {ratings['rating'].max()})  ")

number of Ratings : 7329482
number of movies : 4937
number of users : 103827
range of rating : ( 1.0, 5.0)  


### Keep only ratings at or above each user's 75th-percentile rating

In [54]:
ratingGroupByUserId = ratings.groupby(['userId'])

# Per-user threshold: the 75th percentile of that user's ratings.
# The built-in groupby quantile replaces a Python lambda that was invoked once
# per user; Series.quantile defaults to linear interpolation, the same default
# np.percentile uses.
# (A previous attempt, `ratingGroupByUserId.apply(lambda grp: grp.rating.median())`,
# was recorded as failing with "MemoryError: Unable to allocate 98.2 GiB".)
# The value column keeps the label 0 that the earlier apply()/to_frame() code
# produced, so any code reading `ratingThreshHold[0]` / `tmpp[0]` still works.
ratingThreshHold = (
    ratingGroupByUserId['rating']
    .quantile(0.75)
    .rename(0)
    .to_frame()
    .reset_index()
)

# Attach each user's threshold to every one of their ratings.  The join key is
# spelled out instead of being inferred from whichever column names overlap.
tmpp = pd.merge( ratings, ratingThreshHold, on='userId', how='inner' )

# Keep ratings at or above the user's own threshold, with a fresh 0..n-1 index.
CleanedRatings = tmpp.loc[
    tmpp['rating'] >= tmpp[0],
    ['userId', 'movieId', 'rating', 'timestamp'],
].reset_index(drop=True)

In [56]:
# Summary of `CleanedRatings`.  nunique()/min()/max() yield the same figures
# as grouping by the column and counting, without materialising a full grouped
# frame for each statistic.
print( f"number of Ratings : { CleanedRatings.shape[0] }")
print( f"number of movies : { CleanedRatings['movieId'].nunique() }")
print( f"number of users : { CleanedRatings['userId'].nunique() }")
print( f"range of rating : ( { CleanedRatings['rating'].min() }, {CleanedRatings['rating'].max()})  ")

number of Ratings : 3363219
number of movies : 4839
number of users : 103827
range of rating : ( 1.0, 5.0)  


### Keep only ratings whose user and movie each have more than the minimum number of ratings

In [63]:
# A movie / user survives only if it has strictly more ratings than these.
min_movie_ratings = 20
min_user_ratings = 20

# Ids of movies with enough ratings (ordered by descending rating count,
# as returned by value_counts).
movie_counts = CleanedRatings['movieId'].value_counts()
filter_Movies = movie_counts[movie_counts > min_movie_ratings].index.tolist()

# Ids of users with enough ratings.
user_counts = CleanedRatings['userId'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

# A rating is kept when both its movie and its user passed the filter.
keep_movie = CleanedRatings['movieId'].isin(filter_Movies)
keep_user = CleanedRatings['userId'].isin(filter_users)
df_new = CleanedRatings[keep_movie & keep_user]

print('The original data frame shape:\t{}'.format(CleanedRatings.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(3363219, 4)
The new data frame shape:	(2829348, 4)


In [100]:
# Distinct movie ids in the filtered ratings (order of first appearance) and
# how many there are; the count is the cell's displayed value.
colName = df_new['movieId'].unique()
colCount = len(colName)
colCount

3654

In [103]:
# Distinct user ids in the filtered ratings (order of first appearance) and
# how many there are; the count is the cell's displayed value.
rowName = df_new['userId'].unique()
rowCount = len(rowName)
rowCount

38960

In [110]:
# del df

In [111]:
# Dense user x movie matrix filled with 0.0: one row per user id in rowName,
# one column per movie id in colName.
df = pd.DataFrame(
    data=np.zeros((rowCount, colCount)),
    index=rowName,
    columns=colName,
)

In [112]:
# Check the size of the freshly allocated user x movie matrix (row/column
# counts, dtype and memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38960 entries, 6 to 283224
Columns: 3654 entries, 10 to 1316
dtypes: float64(3654)
memory usage: 1.1 GB


In [113]:
# Display the matrix (pandas truncates the rendering of large frames).
df

Unnamed: 0,10,21,32,47,50,111,141,153,161,163,...,2079,3140,2923,1098,2650,1901,1531,128,873,1316
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
# Mark every (user, movie) pair that occurs in df_new with 1.
# Label -> position lookups are done once for all ratings and the cells are
# written with a single NumPy fancy-index assignment, instead of a Python loop
# that built two row Series via .iloc and did one .loc scalar write per rating.
user_pos = df.index.get_indexer(df_new['userId'])
movie_pos = df.columns.get_indexer(df_new['movieId'])

# get_indexer reports unknown labels as -1, which NumPy would silently treat
# as "last row/column"; fail loudly instead.
assert (user_pos >= 0).all() and (movie_pos >= 0).all(), \
    "df_new contains user/movie ids that are not labels of df"

# Work on an explicit copy of the values (a second matrix-sized array exists
# only until `df` is rebound) so the write does not depend on whether pandas
# hands out a writable view.
membership = df.to_numpy(copy=True)
membership[user_pos, movie_pos] = 1
df = pd.DataFrame(membership, index=df.index, columns=df.columns, copy=False)
df

Unnamed: 0,10,21,32,47,50,111,141,153,161,163,...,2079,3140,2923,1098,2650,1901,1531,128,873,1316
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283168,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Persist the user x movie matrix.  index=False means the row labels (user
# ids) are not written; only the movie-id header and the cell values are.
df.to_csv(f"{Dataset_path}AssociationRules.csv", index=False)

# Drop the in-memory matrix once it is on disk.
del df

In [5]:
# Reload the saved user x movie matrix from disk.
df = pd.read_csv(f"{Dataset_path}AssociationRules.csv")

In [6]:
# Confirm the reloaded matrix has the expected shape, dtype and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38960 entries, 0 to 38959
Columns: 3654 entries, 10 to 1316
dtypes: float64(3654)
memory usage: 1.1 GB


In [47]:
# Replace the matrix's movie-id column labels with movie titles.
# NOTE: this rewrites df.columns in place, so the cell cannot be re-run on the
# already-renamed frame (the int64 cast below would fail on title strings).
columnsdf = pd.DataFrame( df.columns, columns=['movieId'] )

# Cast the labels to int64 so they can be joined against movies.movieId.
columnsdf.movieId = columnsdf.movieId.astype('int64')

# Fixed: the merge previously referenced an undefined name (`cccc`), which
# raises NameError on a fresh kernel; the lookup frame built above is
# `columnsdf`.  The join key is spelled out rather than inferred, and an inner
# merge keeps the left frame's row order, so titles line up with the columns.
# If any id has no match in `movies`, the assignment fails with a length
# mismatch instead of silently mislabelling columns.
df.columns = pd.merge( columnsdf, movies, on='movieId', how='inner' ).title.values

In [49]:
# Display the matrix with its columns now labelled by movie title.
df

Unnamed: 0,GoldenEye (1995),Get Shorty (1995),Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Seven (a.k.a. Se7en) (1995),"Usual Suspects, The (1995)",Taxi Driver (1976),"Birdcage, The (1996)",Batman Forever (1995),Crimson Tide (1995),Desperado (1995),...,Kidnapped (1960),Three Ages (1923),Handle with Care (a.k.a. Citizen's Band) (1977),"Search for One-eye Jimmy, The (1996)","Ghost of Frankenstein, The (1942)",Dear Jesse (1997),Losing Chase (1996),Jupiter's Wife (1994),Shadow of Angels (Schatten der Engel) (1976),Anna (1996)
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38955,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

In [None]:
# Frequent itemsets via Apriori: sets of movies appearing together in at least
# 10% of the user rows.  The 0.0/1.0 float matrix is cast to bool first, which
# is the one-hot input type mlxtend expects and is 8x smaller than float64.
frq_items_apriori = apriori(df.astype(bool), min_support = 0.1, use_colnames = True)

# Order by ascending support; reassigned rather than sorted in place so the
# cell reads as a plain pipeline.
frq_items_apriori = frq_items_apriori.sort_values(by=['support'])
frq_items_apriori

In [None]:
# Frequent itemsets via FP-Growth with the same 10% minimum support.  The
# 0.0/1.0 float matrix is cast to bool first, which is the one-hot input type
# mlxtend expects and is 8x smaller than float64.
frq_items_fpgrowth = fpgrowth(df.astype(bool), min_support = 0.1, use_colnames = True)

# Order by ascending support; reassigned rather than sorted in place so the
# cell reads as a plain pipeline.
frq_items_fpgrowth = frq_items_fpgrowth.sort_values(by=['support'])
frq_items_fpgrowth

In [None]:
# Derive association rules from the Apriori itemsets, using lift as the
# evaluation metric with a minimum threshold of 1.
rules = association_rules(frq_items_apriori, metric ="lift", min_threshold = 1)