# Cleaning the data

In [3]:
import pandas as pd
import numpy as np

##remove all the white and blank spaces
ratings_list = [i.strip().split("::")for i in open('G://Numerical Analysis//class 6//ml-1m//ratings.dat','r').readlines()]
movies_list = [i.strip().split("::")for i in open('G://Numerical Analysis//class 6//ml-1m//movies.dat','r').readlines()]
users_list = [i.strip().split("::")for i in open('G://Numerical Analysis//class 6//ml-1m//users.dat','r').readlines()]

In [4]:
ratings = np.array(ratings_list)
users = np.array(users_list)
movies = np.array(movies_list)

In [211]:
ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype = int)
##ratings_df = ratings_df.dropna(subset=['Timestamp'])
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'])
movies_df['MovieID'] = movies_df['MovieID'].apply(pd.to_numeric)## convert the argument to numeric

In [214]:
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# Framing the big sparse matrix

In [217]:
## A matrix
A_df = ratings_df.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
A_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
## Normalizing the matrix A
A = A_df.as_matrix()
user_ratings_mean = np.mean(A, axis = 1)
A_demeaned = A - user_ratings_mean.reshape(-1, 1)
#A_demeaned

# With my ratings matrix properly formatted and normalized, I'm ready to do the singular value decomposition

In [119]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(A_demeaned, k = 50)

In [187]:
sigma = np.diag(sigma)

In [188]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)


In [218]:
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = A_df.columns)
preds_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,...,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,...,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,...,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,...,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,...,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


# Building recommendation

In [196]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations):
    
    # Get and sort the user's predictions
    # UserID starts at 1, not 0
    user_row_number = userID - 1
    # UserID starts at 1
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False)
    ## iloc-integer-location based indexing for selection by position.
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.UserID == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'MovieID', right_on = 'MovieID').
                     sort_values(['Rating'], ascending=False))

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[movies_df['MovieID'].isin(user_full['MovieID'])].
                      merge(pd.DataFrame(sorted_user_predictions).
                      reset_index(), how = 'left',left_on = 'MovieID',right_on = 'MovieID').
                      rename(columns = {user_row_number: 'movie_recommended'}).
                      sort_values('movie_recommended', ascending = False).iloc[:num_recommendations, :-1])
    
    print ('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))


# Testing

In [219]:
movies_already_rated,movies_recommended = recommend_movies(preds_df,11, movies_df, ratings_df,10)

User 11 has already rated 137 movies.
Recommending highest 10 predicted ratings movies not already rated.


TypeError: 'NoneType' object is not iterable

# Results

In [220]:
movies_already_rated.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres
34,11,2804,5,978902902,"Christmas Story, A (1983)",Comedy|Drama
15,11,1197,5,978903297,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
27,11,2580,5,978220030,Go (1999),Crime
26,11,1923,5,978220393,There's Something About Mary (1998),Comedy
89,11,2795,5,978903701,Vacation (1983),Comedy


In [222]:
movies_recommended

Unnamed: 0,MovieID,Title,Genres
33,608,Fargo (1996),Crime|Drama|Thriller
31,593,"Silence of the Lambs, The (1991)",Drama|Thriller
110,2858,American Beauty (1999),Comedy|Drama
12,318,"Shawshank Redemption, The (1994)",Drama
104,2762,"Sixth Sense, The (1999)",Thriller
48,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
53,1265,Groundhog Day (1993),Comedy|Romance
115,2997,Being John Malkovich (1999),Comedy
57,1358,Sling Blade (1996),Drama|Thriller
77,1923,There's Something About Mary (1998),Comedy
