# MovieLens database cleaning

## Extract needed dates

According to its ReadMe, this database has records from ratings dating from between January 09, 1995 and March 31, 2015. The Netflix database only ranges from October 1998 to December 2005, so that many entries are useless.

The numbers of distinct users goes from 138.493 to 52875.

In [1]:
import pandas as pd
import numpy as np

In [2]:
db = pd.read_csv("data/MovieLens/ratings.csv", header=None, names=["userId","movieId","rating","timestamp"])
print(db.shape)
a = (db["timestamp"] < 907200000) 
print("There are", sum(a), "reviews made before the 1st October 1998.")
b = (db["timestamp"] > 1135987200) 
print("There are", sum(b), "reviews made after the 31st December 2005.")

db.head()

(1048575, 4)
There are 137440 reviews made before the 1st October 1998.
There are 415433 reviews made after the 31st December 2005.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [3]:
# Remove useless rows
a = (db["timestamp"] > 907200000) 
db = db[a]
print(db.shape)
b = (db["timestamp"] < 1135987200) 
db = db[b]
print(db.shape)

(911135, 4)
(495702, 4)


In [4]:
# Get number of distincts users
print("There are", len(db.userId.unique()), "users left.")

There are 2726 users left.


## Remove not matching movies



In [5]:
# Import movie titles

import csv

nf_movies = [];
with open('data/Netflix/movie_titles.csv', encoding="ISO-8859-1") as nf:
    for col1,col2,*col3 in csv.reader(nf):
        #s = ''.join(col3).lower().replace(" ","")
        #s = s[:(s.find('(') if s.find('(') !=-1 else len(s))]
        nf_movies.append((''.join(col3)+"("+col2+")").lower().replace(" ",""))

ml_movies = [];
with open('data/MovieLens/movies.csv') as ml:
    for col1,col2,*col3 in csv.reader(ml):
        s = ''.join(col2).lower().replace(" ","")
        loc = s.find(",the(")
        if loc !=-1:
            s = "the" + s[:loc] + s[loc+4:]
        else:
            loc=s.find(",a(")
            if loc != -1:
                s = "a" + s[:loc] + s[loc+2:] 
            else:
                loc=s.find(",an(")
                if loc != -1:
                    s = "an" + s[:loc] + s[loc+3:] 
        ml_movies.append((col1, s))
        
print(len(nf_movies))
print(len(ml_movies))

17770
27278


In [6]:
matches = {}
#revmatches ={}
count = 0
for movie in range(len(nf_movies)):
    #if any(nf_movies[movie] in q for s,q in ml_movies):
    for label, title in ml_movies:
        if nf_movies[movie] == title and nf_movies[movie] not in ["pinocchio(2002)", "lastmanstanding(1996)", "emma(1996)","hamlet(2000)", "hamlet(1990)", "menwithguns(1997)"]:
#            if label in matches.values():
#                print("FUCKED BY:", nf_movies[movie], "and", title, nf_movies[revmatches[label]-1])
#                count +=1
            matches[movie + 1 ] = label
#            revmatches[label] = movie +1
            break
        #matches.setdefault((int(s) for s,q in ml_movies if nf_movies[movie] in q), movie+1)

        
print("I found", len(matches), "matches.")
print(count)

I found 6133 matches.
0


In [7]:
# Remove movies
idx2drop =[]
for ii,jj in enumerate(db['movieId']):
    if str(jj) not in matches:
        idx2drop.append(ii)
db.drop(db.index[idx2drop], inplace=True)
# for key in matches:
# db[db.movieId in matches.keys()]

In [8]:
print(db.shape)
print(len(idx2drop))
print(idx2drop[1:20])

(0, 4)
495702
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [9]:
z = []
#df = pd.DataFrame(columns=["userId","movieId","rating","timestamp"])
for i in matches:
    print(i)
    tmp = pd.read_csv("data/Netflix/training_set/mv_" + format(i,'07d')+".txt", header=None,
                         names=["userId","rating","timestamp"])
    tmp["movieId"] = matches[i]
    z.append(tmp)

12
17
18
24
26
30
45
47
50
52
55
57
58
63
67
77
78
79
80
82
83
89
104
108
110
117
118
122
123
125
127
129
143
147
148
152
155
156
161
166
171
173
175
181
186
187
189
191
197
199
200
201
212
222
225
228
235
240
241
243
246
247
249
250
252
253
257
261
262
268
269
271
273
274
275
283
285
286
289
290
294
295
297
299
305
309
311
312
313
319
322
326
327
329
330
331
334
335
336
337
341
344
348
349
351
353
357
358
362
363
371
374
375
378
380
381
384
385
386
390
393
395
397
400
401
406
408
413
416
422
424
425
426
429
433
436
439
440
442
443
445
453
457
459
467
471
475
482
483
486
489
490
491
492
499
505
507
508
511
513
516
517
518
523
524
525
526
527
528
529
530
532
533
535
542
547
548
549
550
551
552
556
557
563
564
566
569
570
571
574
578
585
587
599
607
612
614
615
626
629
630
632
636
638
641
643
645
646
647
652
658
659
660
662
664
665
674
680
681
682
686
689
692
693
701
705
706
708
716
719
720
723
725
728
731
733
734
740
743
746
749
750
754
755
758
760
763
766
767
770
773
785
788
789
794
79

5035
5036
5038
5039
5041
5042
5045
5047
5050
5054
5055
5056
5059
5062
5063
5064
5069
5071
5074
5075
5076
5078
5084
5085
5090
5093
5094
5097
5099
5104
5105
5106
5109
5112
5113
5116
5117
5118
5121
5129
5130
5133
5135
5137
5139
5140
5141
5143
5146
5148
5149
5151
5154
5157
5162
5169
5170
5176
5179
5180
5181
5186
5187
5190
5191
5206
5207
5210
5215
5220
5221
5225
5226
5227
5230
5236
5237
5239
5242
5243
5244
5250
5253
5254
5255
5256
5263
5272
5275
5277
5278
5280
5285
5287
5293
5296
5298
5305
5308
5313
5316
5317
5318
5320
5325
5327
5328
5330
5333
5336
5339
5340
5341
5342
5344
5345
5351
5352
5355
5360
5361
5363
5366
5367
5371
5372
5375
5378
5380
5387
5390
5391
5399
5401
5402
5403
5405
5415
5416
5417
5422
5425
5427
5429
5430
5434
5435
5436
5443
5444
5447
5448
5455
5462
5464
5465
5468
5469
5472
5473
5474
5476
5477
5487
5493
5498
5499
5503
5505
5506
5509
5511
5516
5518
5521
5531
5532
5537
5538
5542
5543
5545
5546
5547
5548
5554
5557
5560
5561
5562
5566
5568
5569
5571
5574
5576
5583
5584
5586
5587


9916
9921
9926
9927
9928
9935
9936
9939
9940
9950
9952
9954
9955
9956
9958
9959
9960
9963
9964
9965
9966
9967
9974
9977
9979
9981
9985
9986
9987
9988
9989
9991
9992
9996
9998
10000
10003
10012
10013
10018
10019
10022
10024
10027
10033
10034
10037
10038
10040
10041
10043
10045
10046
10051
10053
10054
10055
10066
10073
10077
10078
10082
10083
10084
10094
10098
10100
10101
10102
10105
10108
10109
10110
10111
10114
10119
10123
10125
10126
10128
10130
10131
10132
10143
10144
10149
10155
10159
10162
10165
10170
10171
10172
10173
10174
10176
10177
10179
10186
10189
10190
10191
10192
10198
10203
10208
10209
10210
10212
10214
10217
10221
10222
10225
10229
10230
10232
10234
10236
10240
10241
10243
10249
10253
10254
10255
10269
10271
10274
10275
10282
10294
10296
10298
10300
10302
10304
10305
10307
10309
10320
10325
10327
10337
10338
10341
10343
10344
10345
10346
10351
10353
10358
10359
10360
10364
10371
10372
10375
10378
10379
10386
10388
10392
10394
10395
10397
10403
10407
10409
10413
10414
104

13765
13767
13771
13772
13775
13776
13783
13784
13785
13786
13787
13791
13793
13795
13798
13804
13805
13806
13809
13810
13812
13813
13817
13824
13827
13829
13831
13833
13838
13840
13845
13849
13851
13852
13853
13856
13859
13860
13863
13865
13867
13868
13870
13871
13873
13882
13883
13886
13887
13888
13896
13898
13899
13901
13910
13912
13914
13916
13917
13918
13921
13923
13924
13925
13930
13934
13936
13938
13941
13952
13955
13956
13959
13964
13968
13970
13973
13974
13975
13978
13979
13981
13982
13986
13987
13990
13991
13993
13996
13998
14002
14003
14006
14009
14010
14013
14016
14025
14027
14039
14042
14047
14050
14057
14062
14063
14064
14065
14066
14068
14069
14070
14072
14074
14091
14098
14103
14104
14109
14112
14113
14118
14119
14121
14131
14132
14134
14137
14139
14141
14144
14146
14149
14153
14154
14155
14158
14161
14164
14166
14167
14169
14173
14174
14175
14176
14178
14181
14183
14185
14186
14188
14192
14193
14194
14197
14200
14203
14209
14211
14212
14213
14215
14218
14219
14220
1422

17714
17716
17719
17721
17724
17726
17730
17734
17754
17756
17758
17759
17760
17761
17762
17763
17764
17769
17770
