# Downloading Rating Data

## Trudging through the MovieLens dataset

In [1]:
# Obtain the required files.
#
# The archive is fetched only when the extracted data is missing. The check
# looks for a data file rather than just the directory, so a run that was
# interrupted after creating the directory is retried instead of skipped.

DirName <- "ml-1m"
Filename <- "ml-1m.zip"
if (!file.exists(file.path(DirName, "ratings.dat"))) {
    Url <- "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
    Destination <- file.path(DirName, Filename)
    # The directory may already exist from an earlier, incomplete attempt.
    dir.create(DirName, showWarnings = FALSE)
    download.file(url = Url, destfile = Destination, mode = "wb")
    # Extracts into the working directory; the later cells read the files
    # from DirName, so the archive is expected to contain an "ml-1m/" folder.
    unzip(Destination)
    # The archive is no longer needed once its contents are extracted.
    file.remove(Destination)
}

The three .dat files contain values separated by `::`. The `fread` function accepts only a single-character separator, so it can't parse these files directly. To read the files, you:

1. Read the individual lines as strings into R.
2. Substitute a tab `\t` for the `::`.
3. Create the resulting table using `fread`.

In [2]:
# Create three tables from the three files.
library(data.table)

# Read one "::"-delimited file from DirName into a data.table.
#   name    - file name inside DirName, for example "ratings.dat"
#   columns - column names to assign, in file order
# Returns the parsed data.table.
read_dat <- function(name, columns) {
    raw <- readLines(file.path(DirName, name))
    # fread() needs a single-character separator, so "::" becomes a tab.
    raw <- gsub("::", "\t", raw, fixed = TRUE)
    # `text =` marks the input as literal data, so it can never be mistaken
    # for a file name or a shell command. The first line is treated as data,
    # not as a header, and the column names are supplied explicitly.
    fread(text = raw, sep = "\t", header = FALSE, col.names = columns)
}

ratings <- read_dat("ratings.dat",
                    c("user_id", "movie_id", "rating", "timestamp"))
users <- read_dat("users.dat",
                  c("user_id", "gender", "age", "occupation", "zip"))
movies <- read_dat("movies.dat",
                   c("movie_id", "title", "genres"))

Loading required package: data.table



It's time to join the three tables together into a single table called `MovieLens`. This action requires the use of the `left_join()` function of the `dplyr` package. The order of the join is important in performing analysis later.

In [3]:
# Attach dplyr; when it is not installed yet, install it and attach it.
if (!require("dplyr")) {
    install.packages("dplyr")
    library("dplyr")
}

Loading required package: dplyr


Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [4]:
# Start from the ratings and attach the matching user row (by user_id) and
# then the matching movie row (by movie_id) to every rating.
MovieLens <- ratings %>%
    left_join(users, by = "user_id") %>%
    left_join(movies, by = "movie_id")
print(MovieLens)

         user_id movie_id rating timestamp gender age occupation   zip
      1:       1     1193      5 978300760      F   1         10 48067
      2:       1      661      3 978302109      F   1         10 48067
      3:       1      914      3 978301968      F   1         10 48067
      4:       1     3408      4 978300275      F   1         10 48067
      5:       1     2355      5 978824291      F   1         10 48067
     ---                                                              
1000205:    6040     1091      1 956716541      M  25          6 11106
1000206:    6040     1094      5 956704887      M  25          6 11106
1000207:    6040      562      5 956704746      M  25          6 11106
1000208:    6040     1096      4 956715648      M  25          6 11106
1000209:    6040     1097      4 956715569      M  25          6 11106
                                          title                          genres
      1: One Flew Over the Cuckoo's Nest (1975)                     

In [5]:
# Obtain the number of ratings for each rating level.
# Rows are sorted by title and then by rating before they are tallied.
Ordered <- MovieLens[order(title, rating)]
print(Ordered %>% count(rating))

   rating      n
1:      1  56174
2:      2 107557
3:      3 261197
4:      4 348971
5:      5 226310


In [6]:
# Sort by movie and then by user, and report how many reviews each title has.
IndUsers <- MovieLens[order(movie_id, user_id)]
print("Number of Reviews Per Movie")
print(IndUsers %>% count(title))

[1] "Number of Reviews Per Movie"
                                           title   n
   1:                         'burbs, The (1989) 303
   2:                       'Night Mother (1986)  70
   3:                  'Til There Was You (1997)  52
   4:                     $1,000,000 Duck (1971)  37
   5:              ...And Justice for All (1979) 199
  ---                                               
3702:                           Zachariah (1971)   2
3703:                Zed & Two Noughts, A (1985)  29
3704:                         Zero Effect (1998) 301
3705: Zero Kelvin (Kjærlighetens kjøtere) (1995)   2
3706:                    Zeus and Roxanne (1997)  23


In [7]:
# Report how many ratings the movie with movie_id 260 received and their mean.
SelMovie <- MovieLens[movie_id == 260]
# Both results are one-row tables; as.numeric() below extracts the value.
Reviews <- SelMovie %>% count()
Average <- SelMovie %>% summarize(Mean = mean(rating, na.rm = TRUE))
sprintf("%i users gave an average rating of %f.", as.numeric(Reviews), as.numeric(Average))

## Navigating through anonymous web data

In [8]:
# Obtain the required file.
# The download is skipped when a local copy is already present.

Filename <- "anonymous-msweb.data"
if (!file.exists(Filename)) {
    Url <- "https://github.com/amirkrifa/ms-web-dataset/raw/master/anonymous-msweb.data"
    download.file(
        url = Url,
        destfile = Filename,
        mode = "wb"
    )
}

In [9]:
# Attach each helper package in turn, installing it first when it is missing.
for (pkg in c("sets", "hash", "stringr")) {
    if (!require(pkg, character.only = TRUE)) {
        install.packages(pkg)
        library(pkg, character.only = TRUE)
    }
}

Loading required package: sets

"package 'sets' was built under R version 4.0.3"

Attaching package: 'sets'


The following object is masked from 'package:dplyr':

    %>%


The following object is masked from 'package:data.table':

    set


Loading required package: hash

"package 'hash' was built under R version 4.0.3"
hash-2.2.6.1 provided by Decision Patterns



Attaching package: 'hash'


The following object is masked from 'package:data.table':

    copy


Loading required package: stringr


Attaching package: 'stringr'


The following object is masked from 'package:sets':

    %>%




In [10]:
# Open the file, read every line into memory, and close the file again.
thisFile <- file(Filename, open = "r")
lines <- readLines(thisFile)
close(thisFile)

sprintf("Total Number of Entries: %i", length(lines))

# Containers filled by the parsing step that follows.

# Attribute records.
attributes <- NULL

# User records: the user currently being read, the ids collected for that
# user, and the per-user results.
current_user_id <- 0
current_user_ids <- vector()
user_visits <- NULL

# Vroot records.
page_visits <- NULL

In [11]:
# Process the data one line at a time and place
# each record in the appropriate storage unit.
#
# The first comma-separated field gives the record type:
#   A - attribute: field 2 is the id, field 4 the description, field 5 the url
#   C - case:      field 3 is the user id; starts a new user
#   V - visit:     field 2 is the id of a page visited by the current user
#                  (layout taken from the dataset's published description;
#                  TODO confirm against the downloaded file)

# Start from real lists so that `[[<-` always stores a value as one element,
# whatever its length.
if (is.null(attributes)) attributes <- list()
if (is.null(user_visits)) user_visits <- list()

for (i in seq_along(lines)) {
    chunks <- str_split(lines[i], ",")[[1]]
    entry_type <- chunks[1]

    if (entry_type == "A") {
        id <- chunks[2]
        description <- chunks[4]
        url <- chunks[5]
        attributes[[id]] <- tuple(id = id, description = description, url = url)
    } else if (entry_type == "V") {
        # Kept as character so the value can index `attributes` directly.
        current_user_ids <- c(current_user_ids, chunks[2])
    } else if (entry_type == "C") {
        # A new case closes the previous user: store what was collected.
        if (current_user_id != 0) {
            # Key by name; a numeric id would be a positional index and pad
            # the list with empty slots up to that position.
            user_visits[[as.character(current_user_id)]] <- current_user_ids
        }
        current_user_ids <- vector()
        current_user_id <- as.numeric(chunks[3])
    }
}

# No further "C" record follows the last user, so store that user here.
if (current_user_id != 0) {
    user_visits[[as.character(current_user_id)]] <- current_user_ids
}

# Display the totals
sprintf("Total Number of Attributes: %i", length(attributes))
sprintf("Total Number of Users: %i", length(user_visits))

## Encountering the limits of rating data

### Massaging the data

In [12]:
# Keep only ratings of 3 or higher, and only the user, rating, and title
# columns of the joined table.
reduced_movie <- MovieLens[rating >= 3, .(user_id, rating, title)]
print(reduced_movie[seq_len(5)])
sprintf("Original Shape: %i X %i, New Shape: %i X %i",
        nrow(MovieLens), ncol(MovieLens),
        nrow(reduced_movie), ncol(reduced_movie))

   user_id rating                                  title
1:       1      5 One Flew Over the Cuckoo's Nest (1975)
2:       1      3       James and the Giant Peach (1996)
3:       1      3                    My Fair Lady (1964)
4:       1      4                 Erin Brockovich (2000)
5:       1      5                   Bug's Life, A (1998)


In [13]:
# Count the retained ratings per title, keep titles with more than 1000 of
# them, and sort from fewest to most.
counted_reviews <- reduced_movie %>%
    group_by(title) %>%
    count() %>%
    filter(n > 1000) %>%
    arrange(n)
print(counted_reviews)

# A tibble: 161 x 2
# Groups:   title [161]
   title                             n
   <chr>                         <int>
 1 Few Good Men, A (1992)         1003
 2 My Cousin Vinny (1992)         1003
 3 Boogie Nights (1997)           1004
 4 Sneakers (1992)                1009
 5 Witness (1985)                 1009
 6 League of Their Own, A (1992)  1011
 7 Good Morning, Vietnam (1987)   1014
 8 Bull Durham (1988)             1017
 9 Maltese Falcon, The (1941)     1020
10 African Queen, The (1951)      1021
# ... with 151 more rows


### Performing collaborative filtering

In [14]:
# Pivot to one row per user and one column per title; each cell holds that
# user's rating of the title, or NA when there is none.
user_rating <- reduced_movie %>%
    dcast(user_id ~ title, value.var = "rating")
print(user_rating)

      user_id $1,000,000 Duck (1971) 'Night Mother (1986)
   1:       1                     NA                   NA
   2:       2                     NA                   NA
   3:       3                     NA                   NA
   4:       4                     NA                   NA
   5:       5                     NA                   NA
  ---                                                    
6035:    6036                     NA                    3
6036:    6037                     NA                   NA
6037:    6038                     NA                   NA
6038:    6039                     NA                   NA
6039:    6040                     NA                   NA
      'Til There Was You (1997) 'burbs, The (1989)
   1:                        NA                 NA
   2:                        NA                 NA
   3:                        NA                 NA
   4:                        NA                 NA
   5:                        NA                 N

In [15]:
# Pull out the single column of ratings for Young Frankenstein; the result is
# still a one-column table, not a plain vector.
YF_ratings <- user_rating[, c("Young Frankenstein (1974)"), with = FALSE]
print(YF_ratings)

      Young Frankenstein (1974)
   1:                        NA
   2:                        NA
   3:                        NA
   4:                        NA
   5:                        NA
  ---                          
6035:                         4
6036:                        NA
6037:                        NA
6038:                        NA
6039:                         4


In [16]:
# Correlate each film's ratings with the Young Frankenstein ratings, using
# only the users who rated both films of a pair.
# user_rating also carries the user_id key column. It is dropped first so
# that the identifier is not correlated as if it were a film.
correlations <- cor(user_rating[, -c("user_id")], YF_ratings,
                    use = "pairwise.complete.obs")
# Keep the films whose correlation is defined and above 0.8.
valid <- !is.na(correlations) & correlations > 0.8

"la deviazione standard è zero"


In [17]:
# List the films flagged in `valid` together with their correlation values.
hits <- which(valid)
data.frame(film = rownames(correlations)[hits],
           corr = correlations[hits])

film,corr
<chr>,<dbl>
"Acid House, The (1998)",0.8164966
All the Vermeers in New York (1990),0.8703883
Babyfever (1994),1.0
"Ballad of Ramblin' Jack, The (2000)",1.0
"Battle of the Sexes, The (1959)",1.0
"Best Man, The (Il Testimone dello sposo) (1997)",0.8660254
"Big Green, The (1995)",1.0
Boys and Girls (2000),0.8072074
Catwalk (1995),1.0
Children of the Corn III (1994),1.0


# Integrating Text and Behaviors

## Viewing the attributes

In [18]:
# Print one formatted line per stored attribute: id, description, and url.
invisible(lapply(attributes, function(item) {
    print(sprintf("%4s, %30s, %12s", item$id, item$description, item$url))
}))

[1] "1287,      \"International AutoRoute\", \"/autoroute\""
[1] "1288,                      \"library\",   \"/library\""
[1] "1289, \"Master Chef Product Information\", \"/masterchef\""
[1] "1297,              \"Central America\",  \"/centroam\""
[1] "1215,     \"For Developers Only Info\", \"/developer\""
[1] "1279,              \"Multimedia Golf\",    \"/msgolf\""
[1] "1239,         \"Microsoft Consulting\", \"/msconsult\""
[1] "1282,                         \"home\",      \"/home\""
[1] "1251,            \"Reference Support\", \"/referencesupport\""
[1] "1121,           \"Microsoft Magazine\",  \"/magazine\""
[1] "1083,            \"MS Access Support\", \"/msaccesssupport\""
[1] "1145,       \"Visual Fox Pro Support\", \"/vfoxprosupport\""
[1] "1276,          \"Visual Test Support\", \"/vtestsupport\""
[1] "1200,               \"Benelux Region\",   \"/benelux\""
[1] "1259,                     \"controls\",  \"/controls\""
[1] "1155,                     \"Sidewalk\",  \"/sidewalk\""

# Leveraging SVD

## Seeing SVD in action

In [19]:
# Pivot all ratings to one row per user and one column per title, writing 0
# where a user has no rating for a title.
ratings_mtx_df <- MovieLens %>%
    dcast(user_id ~ title, value.var = "rating", fill = 0)
# Every column after the leading user_id column is a film title.
movie_index <- colnames(ratings_mtx_df)[-1]

In [20]:
# Attach the svd package, installing it first when it is missing.
if (!require("svd")) {
    install.packages("svd")
    library("svd")
}

# Decompose the rating matrix (everything except the leading user_id column),
# keeping 15 left and 15 right singular vectors.
R <- svd(as.matrix(ratings_mtx_df[, -1]), nu = 15, nv = 15)

Loading required package: svd

"package 'svd' was built under R version 4.0.3"


In [21]:
# Find the position of the chosen film among the film titles.
movie <- "Star Wars: Episode V - The Empire Strikes Back (1980)"
movie_idx <- match(movie, movie_index)

In [22]:
# R$v has one row per film and one column per retained singular vector.
# Transposing puts the films in columns, so cor() compares film against film.
film_factors <- t(R$v)
correlation_mtx <- cor(film_factors)

In [24]:
# Rank all films by their correlation with the chosen film, highest first,
# and show ranks 2 to 11. Rank 1 is skipped, presumably because it is the
# chosen film itself.
by_similarity <- order(correlation_mtx[, movie_idx], decreasing = TRUE)
movie_index[by_similarity[2:11]]