# Collaborative Filtering

Collaborative filtering, simply put, uses the "wisdom of the crowd" to recommend items. Item-based collaborative filtering uses the rating patterns of users who liked the same movie as me to recommend other movies (users who liked the movie that I like also liked these other movies). In this notebook, recommendations are generated from any reference movie in the dataset that the user chooses.


In [None]:
!mkdir -p data
!wget -O data/ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o data/ml-100k.zip -d data


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
#plt.style.use('dark_background')

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

%matplotlib inline

from subprocess import check_output
# Show what is in the data directory (output of `ls data`, decoded as UTF-8).
data_listing = check_output(["ls", "data"])
print(data_listing.decode("utf8"))


In [None]:
import copy
import random
import numpy as np

def shuffle(data):
    """Return a randomly reordered deep copy of ``data``.

    The input object is never modified: it is deep-copied first and the copy
    is shuffled in place with ``random.shuffle`` (Python's global RNG).

    Parameters
    ----------
    data : mutable sequence
        The items to shuffle.

    Returns
    -------
    The shuffled deep copy of ``data``.
    """
    shuffled = copy.deepcopy(data)
    random.shuffle(shuffled)
    return shuffled

def data_split(data, ratio=0.2):
    """Randomly split the rows of ``data`` into two disjoint parts.

    Parameters
    ----------
    data : array-like
        Rows to split; the first axis is the sample axis. Not modified.
    ratio : float, default 0.2
        Fraction of the rows, between 0 and 1, that goes into the first
        returned array.

    Returns
    -------
    data_up : numpy.ndarray
        ``round(len(data) * ratio)`` distinct rows, drawn at random.
    data_down : numpy.ndarray
        All remaining rows, in their original order.

    Raises
    ------
    ValueError
        If ``ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {!r}".format(ratio))

    rows = np.asarray(data)
    n_rows = len(rows)
    len_up = int(round(n_rows * ratio, 0))

    # Draw the row indices WITHOUT replacement. Sampling with replacement
    # (e.g. np.random.randint) yields repeated indices, which duplicates rows
    # in the first part and leaves too many rows in the second.
    idx = np.random.permutation(n_rows)[:len_up]

    # Boolean mask of the sampled rows; its complement is the remainder.
    picked = np.zeros(n_rows, dtype=bool)
    picked[idx] = True

    # Fancy/boolean indexing returns new arrays, so the input is untouched.
    data_up = rows[idx]
    data_down = rows[~picked]

    return data_up, data_down
    

In [None]:
# u.user is pipe-separated; the column names are supplied explicitly here.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'data/ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
    parse_dates=True,
)

users.head()

In [None]:
# u.data is tab-separated; the column names are supplied explicitly here.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'data/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

ratings.head()

In [None]:
# u.item is pipe-separated; only its first five columns are read and named.
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv(
    'data/ml-100k/u.item',
    sep='|',
    names=m_cols,
    usecols=range(5),
    encoding='latin-1',
)

movies.head()

In [None]:
# Join the three tables into one frame: movies to ratings share only
# `movie_id`, and the result shares only `user_id` with users, so these are
# the keys pandas would pick by default.
movie_ratings = movies.merge(ratings, on='movie_id')
df = movie_ratings.merge(users, on='user_id')

df.head()

# Data Pre-Processing

In [None]:
# Drop the columns that are not needed for the analysis.
# Columns are removed by name rather than by position, and names that are
# already gone are ignored, so this cell can be re-run without raising or
# silently dropping a different column.
df = df.drop(columns=['video_release_date', 'imdb_url', 'unix_timestamp'], errors='ignore')
ratings = ratings.drop(columns=['unix_timestamp'], errors='ignore')
movies = movies.drop(columns=['video_release_date', 'imdb_url'], errors='ignore')
df.info()

In [None]:
# Preview the first rows of the merged frame.
df.head()

# Movie Ratings

In [None]:
# Per-title rating count ('size') and average rating ('mean').
# The aggregations are given by name: recent pandas versions deprecate passing
# NumPy callables such as np.size / np.mean to groupby.agg, while the string
# forms produce the same ('rating', 'size') and ('rating', 'mean') columns.
movie_stats = df.groupby('title').agg({'rating': ['size', 'mean']})
movie_stats.head()


Setting a threshold of at least 50 ratings for better analysis.

In [None]:
# Boolean mask: titles that received at least 50 ratings.
min_50 = movie_stats['rating']['size'].ge(50)

# Best-rated titles among those, highest mean rating first.
(
    movie_stats[min_50]
    .sort_values(by=[('rating', 'mean')], ascending=False)
    .head()
)


Much better. Mean ratings now look reliable. 

In [None]:
# Histogram of all rating values; labels are set on the returned Axes.
ax = ratings.rating.plot.hist(bins=50)
ax.set_title("Distribution of Users' Ratings")
ax.set_ylabel('Number of Ratings')
ax.set_xlabel('Rating (Out of 5)');



# Raters' Age Distribution

In [None]:
# Histogram of user ages; labels are set on the returned Axes.
ax = users.age.plot.hist(bins=25)
ax.set_title("Distribution of Users' Ages")
ax.set_ylabel('Number of Users')
ax.set_xlabel('Age');


# Pivot Table

In [None]:
# Hold out 10% of the ratings as a test set; the rest is the training set.
# Seed NumPy's global RNG first so the split, and everything derived from it,
# is identical on every Restart & Run All.
np.random.seed(42)

# Select the three columns by name so the array layout does not depend on
# which columns of `ratings` an earlier cell has (or has not) dropped.
test, train = data_split(ratings[['user_id', 'movie_id', 'rating']].values, 0.1)
print(train.shape, test.shape)

In [None]:
# Build the movie x user rating matrix from the training ratings.
# A separate name is used for the training frame so the merged `df` that
# earlier cells rely on is not overwritten.
train_df = pd.DataFrame(train, columns=['user_id', 'movie_id', 'rating'])

ratings_matrix = (
    train_df
    .pivot_table(index=['movie_id'], columns=['user_id'], values='rating')
    # One row per movie in `movies`, in the same order. A movie whose ratings
    # all landed in the test set would otherwise be missing, shifting every
    # later row so that row i no longer corresponds to movies.iloc[i].
    .reindex(movies['movie_id'])
    # Unrated (movie, user) pairs, and movies with no training ratings, are 0.
    .fillna(0)
    .reset_index(drop=True)
)
ratings_matrix.head()

In [None]:
# (number of movie rows, number of user columns) of the pivot table.
ratings_matrix.shape

# Cosine Similarity

In [None]:
# Pairwise cosine distance between the rows of the rating matrix,
# converted to a similarity (1 - distance).
cosine_distance = pairwise_distances(ratings_matrix.values, metric="cosine")
movie_similarity = 1 - cosine_distance

# Zero the diagonal so a movie is never ranked as its own best match
# when the similarities are sorted later.
np.fill_diagonal(movie_similarity, 0)

df_movie_similarity = pd.DataFrame(movie_similarity)
df_movie_similarity.head(5)

In [None]:
# The similarity matrix is square: one row and one column per matrix row above.
df_movie_similarity.shape

# Recommender Engine

In [None]:
# Reference movie the recommendations are based on. To prompt for a title
# instead, replace the assignment below with:
#   user_inp = input('Enter the reference movie title based on which recommendations are to be made: ')
user_inp = "Speed (1994)"

# Index labels of the rows in `movies` whose title matches exactly.
matches = movies.index[movies['title'] == user_inp]
if len(matches) == 0:
    # Fail loudly: continuing would either hit an undefined `inp` or reuse a
    # stale one from a previous run and recommend for the wrong movie.
    raise ValueError(f"Sorry, the movie {user_inp!r} is not in the database!")
inp = matches[0]

# Similarity of every movie to the reference movie, attached to `movies`
# (the assignment aligns on the index).
movies['similarity'] = df_movie_similarity.iloc[inp]
movies.head()


In [None]:
# Ten most similar movies, best match first. No leading row is skipped: the
# reference movie's own similarity was set to 0 when the diagonal was zeroed,
# so the first row is already the best recommendation.
recommendations = movies.sort_values(["similarity"], ascending=False).head(10)
print("Recommended movies based on your choice of ", user_inp, ": \n", recommendations)

*This is my first try at recommender engines. Comments/suggestions are appreciated.*