# Downloading Rating Data

## Trudging through the MovieLens dataset

In [1]:
import urllib.request
import os.path
import zipfile

DirName = "ml-1m"
Filename = "ml-1m.zip"
if not os.path.exists(DirName):
    url = "http://files.grouplens.org/datasets/" \
          "movielens/"
    # This step may take a few minutes to download
    # the data.
    urllib.request.urlretrieve(url + Filename, 
                               Filename)
    
    # This step will extract the data for you and
    # place it in the correct directory.
    with zipfile.ZipFile(Filename, 'r') as zip_ref:
        zip_ref.extractall()

In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
users = pd.read_table('ml-1m/users.dat', sep='::', 
        header=None, names=['user_id', 'gender', 
        'age', 'occupation', 'zip'], engine='python')
ratings = pd.read_table('ml-1m/ratings.dat', 
          sep='::', header=None, 
          names=['user_id', 'movie_id', 
          'rating', 'timestamp'], engine='python')
movies = pd.read_table('ml-1m/movies.dat', sep='::', 
         header=None, names=['movie_id', 'title', 
         'genres'], engine='python')
MovieLens = pd.merge(pd.merge(ratings, users), 
                     movies)

In [3]:
print(MovieLens)

         user_id  movie_id  rating  timestamp gender  age  occupation    zip  \
0              1      1193       5  978300760      F    1          10  48067   
1              2      1193       5  978298413      M   56          16  70072   
2             12      1193       4  978220179      M   25          12  32793   
3             15      1193       4  978199279      M   25           7  22903   
4             17      1193       5  978158471      M   50           1  95350   
...          ...       ...     ...        ...    ...  ...         ...    ...   
1000204     5949      2198       5  958846401      M   18          17  47901   
1000205     5675      2703       3  976029116      M   35          14  30030   
1000206     5780      2845       1  958153068      M   18          17  92886   
1000207     5851      3607       5  957756608      F   18          20  55410   
1000208     5938      2909       4  957273353      M   25           1  35401   

                                       

In [4]:
MovieLens.set_index(["title", "rating"]).count(
    level="rating")["user_id"]

rating
1     56174
2    107557
3    261197
4    348971
5    226310
Name: user_id, dtype: int64

In [5]:
IndUsers = MovieLens.set_index(
    ["movie_id", "user_id"]).count(
    level="user_id")["title"]
print("Average movie reviews per user: ", 
      IndUsers.mean())

IndMovies = MovieLens.set_index(
    ["user_id", "title"]).count(
    level="title")["movie_id"]
print("\nNumber of Reviews Per Movie\n")
print(IndMovies)

Average movie reviews per user:  165.5975165562914

Number of Reviews Per Movie

title
$1,000,000 Duck (1971)                         37
'Night Mother (1986)                           70
'Til There Was You (1997)                      52
'burbs, The (1989)                            303
...And Justice for All (1979)                 199
                                             ... 
Zed & Two Noughts, A (1985)                    29
Zero Effect (1998)                            301
Zero Kelvin (Kjærlighetens kjøtere) (1995)      2
Zeus and Roxanne (1997)                        23
eXistenZ (1999)                               410
Name: movie_id, Length: 3706, dtype: int64


In [6]:
SelMovie = MovieLens[
    MovieLens["movie_id"]==260]
print(SelMovie.size, 
      " users gave an average rating of ",
      SelMovie["rating"].mean())

29910  users gave an average rating of  4.453694416583082


## Navigating through anonymous web data

In [7]:
import urllib.request
import os.path

filename = "anonymous-msweb.data"
if not os.path.exists("anonymous-msweb.data"):
    url = "https://github.com/amirkrifa/ms-web-dataset/\
raw/master/anonymous-msweb.data"
    urllib.request.urlretrieve(url, filename)

In [8]:
import codecs
import collections

# Open the file.
file = codecs.open(filename, 'r')

# Setup for attributes.
attribute = collections.namedtuple(
    'page', ['id', 'description', 'url'])
attributes = {}

# Setup for users
current_user_id = None
current_user_ids = []
user_visits = {}

# Setup for vroots
page_visits = {}

# Setup for vroots
page_visits = {}

# Process the data one line at a time and place
# each record in the appropriate storage unit.
for line in file:
    chunks = line.split(',')
    entry_type = chunks[0]
    
    if entry_type == 'A':
        type, id, ignored, description, url = chunks
        attributes[int(id)] = attribute(
            id=int(id), 
            description=description, url=url)
        
    if entry_type == 'C':
        if not current_user_id == None:
            user_visits[current_user_id] = set(
                current_user_ids)
            current_user_ids = []
        current_user_id = int(chunks[2])
        
    if entry_type == 'V':
        page_id = int(chunks[1])
        current_user_ids.append(page_id)
        page_visits.setdefault(page_id, [])
        page_visits[page_id].append(current_user_id)
        
# Display the totals
print('Total Number of Attributes: ', 
      len(attributes.keys()))
print('Total Number of Users: ', 
      len(user_visits.keys()))
print('Total Number of VRoots: ', 
      len(page_visits.keys()))

Total Number of Attributes:  294
Total Number of Users:  32710
Total Number of VRoots:  285


## Encountering the limits of rating data

### Massaging the data

In [9]:
reduced_movie = MovieLens[
    MovieLens["rating"]>= 3.0]
reduced_movie = reduced_movie.drop(
    columns=['movie_id','timestamp', 'genres',
             'gender', 'age', 'occupation', 'zip'])

print(reduced_movie.head())
print()
print("Original Shape: {0}, New Shape: {1}".format(
    MovieLens.shape, reduced_movie.shape))

   user_id  rating                                   title
0        1       5  One Flew Over the Cuckoo's Nest (1975)
1        2       5  One Flew Over the Cuckoo's Nest (1975)
2       12       4  One Flew Over the Cuckoo's Nest (1975)
3       15       4  One Flew Over the Cuckoo's Nest (1975)
4       17       5  One Flew Over the Cuckoo's Nest (1975)

Original Shape: (1000209, 10), New Shape: (836478, 3)


In [10]:
reduced_movie = reduced_movie[
    reduced_movie.groupby('title')['rating'].transform(
        'size') > 1000]

print(reduced_movie.groupby('title')[
    'rating'].count().sort_values().head())
print()
print("New shape: ", reduced_movie.shape)

title
Few Good Men, A (1992)    1003
My Cousin Vinny (1992)    1003
Boogie Nights (1997)      1004
Witness (1985)            1009
Sneakers (1992)           1009
Name: rating, dtype: int64

New shape:  (237212, 3)


In [11]:
ratings = None
movies = None
movie_data = None

### Performing collaborative filtering

In [12]:
user_rating = pd.pivot_table(
    reduced_movie, 
    index='user_id', 
    columns='title', 
    values='rating')

print(user_rating.head())

title    2001: A Space Odyssey (1968)  Abyss, The (1989)  \
user_id                                                    
1                                 NaN                NaN   
2                                 NaN                NaN   
3                                 NaN                NaN   
4                                 NaN                NaN   
5                                 NaN                NaN   

title    African Queen, The (1951)  Airplane! (1980)  Aladdin (1992)  \
user_id                                                                
1                              NaN               4.0             4.0   
2                              NaN               NaN             NaN   
3                              NaN               NaN             NaN   
4                              NaN               NaN             NaN   
5                              NaN               NaN             NaN   

title    Alien (1979)  Aliens (1986)  Amadeus (1984)  American Beauty (199

In [13]:
YF_ratings = user_rating['Young Frankenstein (1974)']
print(YF_ratings.sort_values(ascending=False).head())

user_id
3259    5.0
2695    5.0
2774    5.0
2766    5.0
2757    5.0
Name: Young Frankenstein (1974), dtype: float64


In [14]:
print(user_rating.corrwith(
    YF_ratings).sort_values(
    ascending=False).head())

title
Young Frankenstein (1974)                       1.000000
Blazing Saddles (1974)                          0.412395
Alien (1979)                                    0.297567
Willy Wonka and the Chocolate Factory (1971)    0.272574
M*A*S*H (1970)                                  0.259304
dtype: float64


# Integrating Text and Behaviors

## Viewing the attributes

In [15]:
for k, v in attributes.items():
    print("{:4} {:30.30} {:12}".format(
        v.id, v.description, v.url))

1287 "International AutoRoute"      "/autoroute"

1288 "library"                      "/library"
 
1289 "Master Chef Product Informati "/masterchef"

1297 "Central America"              "/centroam"

1215 "For Developers Only Info"     "/developer"

1279 "Multimedia Golf"              "/msgolf"
  
1239 "Microsoft Consulting"         "/msconsult"

1282 "home"                         "/home"
    
1251 "Reference Support"            "/referencesupport"

1121 "Microsoft Magazine"           "/magazine"

1083 "MS Access Support"            "/msaccesssupport"

1145 "Visual Fox Pro Support"       "/vfoxprosupport"

1276 "Visual Test Support"          "/vtestsupport"

1200 "Benelux Region"               "/benelux"
 
1259 "controls"                     "/controls"

1155 "Sidewalk"                     "/sidewalk"

1092 "Visual FoxPro"                "/vfoxpro"
 
1004 "Microsoft.com Search"         "/search"
  
1057 "MS PowerPoint News"           "/powerpoint"

1140 "Netherlands (Holland)"        "

## Obtaining statistics

In [16]:
nbr_visits = list(map(len, user_visits.values()))
average_visits = sum(nbr_visits) / len(nbr_visits)
one_visit = sum(x == 1 for x in nbr_visits)

print("Number of user visits: ", sum(nbr_visits))
print("Average number of visits: ", average_visits)
print("Users with just one visit: ", one_visit)

Number of user visits:  98653
Average number of visits:  3.0159889941913787
Users with just one visit:  9994


# Leveraging SVD

## Seeing SVD in action

In [17]:
ratings_mtx_df = MovieLens.pivot_table(values='rating', 
        index='user_id', columns='title', fill_value=0)
movie_index = ratings_mtx_df.columns

In [18]:
from sklearn.decomposition import TruncatedSVD
recom = TruncatedSVD(n_components=15, random_state=101)
R = recom.fit_transform(ratings_mtx_df.values.T)

In [19]:
movie = 'Star Wars: Episode V \
- The Empire Strikes Back (1980)'
movie_idx = list(movie_index).index(movie)
print("movie index: %i" %movie_idx)
print(R[movie_idx])

movie index: 3154
[184.72254552 -17.77612872  47.33450866  51.4664494   47.92058216
  17.65033116  14.3574635  -12.82219207  17.51347857   5.46888807
   7.5430805   -0.57117869 -30.74032355   2.4088565  -22.50368497]


In [20]:
import numpy as np
correlation_matrix = np.corrcoef(R)
P = correlation_matrix[movie_idx]
print(list(movie_index[(P > 0.985) & (P < 1.0)]))

['Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode VI - Return of the Jedi (1983)']
