# MovieLens database cleaning

## Extract needed dates

According to its ReadMe, this database has records from ratings dating from between January 09, 1995 and March 31, 2015. The Netflix database only ranges from October 1998 to December 2005, so that many entries are useless.

The numbers of distinct users goes from 138.493 to 52875.

In [19]:
import pandas as pd
import numpy as np

In [20]:
db = pd.read_csv("data/MovieLens/ratings.csv", header=None, encoding="UTF-8", names=["userId","movieId","rating","timestamp"])
print(db.shape)
a = (db["timestamp"] < 907200000) 
print("There are", sum(a), "reviews made before the 1st October 1998.")
b = (db["timestamp"] > 1135987200) 
print("There are", sum(b), "reviews made after the 31st December 2005.")

db.head()

(1048575, 4)
There are 137440 reviews made before the 1st October 1998.
There are 415433 reviews made after the 31st December 2005.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [21]:
# Remove useless rows
a = (db["timestamp"] > 907200000) 
db = db[a]
print(db.shape)
b = (db["timestamp"] < 1135987200) 
db = db[b]
print(db.shape)

(911135, 4)
(495702, 4)


In [4]:
# Get number of distincts users
print("There are", len(db.userId.unique()), "users left.")

There are 2726 users left.


## Remove not matching movies



In [5]:
# Import movie titles

import csv

nf_movies = [];
with open('data/Netflix/movie_titles.csv', encoding="ISO-8859-1") as nf:
    for col1,col2,*col3 in csv.reader(nf):
        #s = ''.join(col3).lower().replace(" ","")
        #s = s[:(s.find('(') if s.find('(') !=-1 else len(s))]
        nf_movies.append((''.join(col3)+"("+col2+")").lower().replace(" ",""))

ml_movies = [];
with open('data/MovieLens/movies.csv', encoding="UTF-8") as ml:
    for col1,col2,*col3 in csv.reader(ml):
        s = ''.join(col2).lower().replace(" ","")
        loc = s.find(",the(")
        if loc !=-1:
            s = "the" + s[:loc] + s[loc+4:]
        else:
            loc=s.find(",a(")
            if loc != -1:
                s = "a" + s[:loc] + s[loc+2:] 
            else:
                loc=s.find(",an(")
                if loc != -1:
                    s = "an" + s[:loc] + s[loc+3:] 
        ml_movies.append((col1, s))
        
print(len(nf_movies))
print(len(ml_movies))

17770
27278


In [22]:
matches = {}
#revmatches ={}
count = 0
for movie in range(len(nf_movies)):
    #if any(nf_movies[movie] in q for s,q in ml_movies):
    for label, title in ml_movies:
        if nf_movies[movie] == title and nf_movies[movie] not in ["pinocchio(2002)", "lastmanstanding(1996)", "emma(1996)","hamlet(2000)", "hamlet(1990)", "menwithguns(1997)"]:
#            if label in matches.values():
#                print("FUCKED BY:", nf_movies[movie], "and", title, nf_movies[revmatches[label]-1])
#                count +=1
            matches[movie + 1 ] = label
#            revmatches[label] = movie +1
            break
        #matches.setdefault((int(s) for s,q in ml_movies if nf_movies[movie] in q), movie+1)

        
print("I found", len(matches), "matches.")
print(count)

I found 6133 matches.
0


In [23]:
# Remove movies
idx2drop =[]
for ii,jj in enumerate(db['movieId']):
    if str(jj) not in matches.values():
        idx2drop.append(ii)
db.drop(db.index[idx2drop], inplace=True)
# for key in matches:
# db[db.movieId in matches.keys()]

In [24]:
print(db.shape)
print(len(idx2drop))
print(idx2drop[1:20])

(382108, 4)
113594
[2, 3, 5, 8, 9, 10, 12, 14, 16, 19, 27, 30, 31, 32, 33, 35, 43, 46, 49]


In [9]:
z = []
#df = pd.DataFrame(columns=["userId","movieId","rating","timestamp"])
for i in matches:
    #print(i)
    tmp = pd.read_csv("data/Netflix/training_set/mv_" + format(i,'07d')+".txt", header=None,
                         names=["userId","rating","timestamp"],encoding="ISO-8859-1")
    tmp["movieId"] = matches[i]
    z.append(tmp)

In [10]:
df = pd.concat(z,copy=False)

In [11]:
df.to_csv("DF.csv",";")

In [25]:
db.to_csv("DB.csv",";")