# MovieLens dataset report
## Prepare

In [1]:
from movielens_analysis import Movies, Links, Ratings, Tags

In [2]:
%ls
%ls ml-latest-small

README.txt              movielens_report.ipynb  ratings.py
links.csv               movies.csv              tags.csv
[31mlinks.py[m[m*               movies.py               [31mtags.py[m[m*
movielens_analysis.py   ratings.csv             test
ls: ml-latest-small: No such file or directory


In [2]:
MOVIES_CSV = 'movies.csv'
LINKS_CSV = 'links.csv'
RATINGS_CSV = 'ratings.csv'
TAGS_CSV = 'tags.csv'

## Movies analysis

### Distribution by release year

In [3]:
movies = Movies(MOVIES_CSV)

In [4]:
%timeit movies.dist_by_release()

42.2 ms ± 510 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
dist_by_release = movies.dist_by_release()

tmp = list(dist_by_release.items())
for index in range(len(tmp)):
    print(f'{tmp[index][0]} : {tmp[index][1]}', end='\t')
    index += 1
    if index % 5 == 0:
        print()

print('\n')

2002 : 311	2006 : 295	2001 : 294	2007 : 284	2000 : 283	
2009 : 282	2003 : 279	2004 : 279	2014 : 278	1996 : 276	
2015 : 274	2005 : 273	2008 : 269	1999 : 263	1997 : 260	
1995 : 259	1998 : 258	2011 : 254	2010 : 247	2013 : 239	
1994 : 237	2012 : 233	2016 : 218	1993 : 198	1992 : 167	
1988 : 165	1987 : 153	1990 : 147	1991 : 147	2017 : 147	
1989 : 142	1986 : 139	1985 : 126	1984 : 101	1981 : 92	
1980 : 89	1982 : 87	1983 : 83	1979 : 69	1977 : 63	
1973 : 59	1978 : 59	1965 : 47	1971 : 47	1974 : 45	
1976 : 44	1964 : 43	1967 : 42	1968 : 42	1975 : 42	
1966 : 42	2018 : 41	1962 : 40	1972 : 39	1963 : 39	
1959 : 37	1960 : 37	1955 : 36	1969 : 35	1961 : 34	
1970 : 33	1957 : 33	1958 : 31	1953 : 30	1956 : 30	
1940 : 25	1949 : 25	1954 : 23	1942 : 23	1939 : 23	
1946 : 23	1951 : 22	1950 : 21	1947 : 20	1948 : 20	
1941 : 18	1936 : 18	1945 : 17	1937 : 16	1952 : 16	
1944 : 16	1938 : 15	1931 : 14	1935 : 13	1933 : 12	
1934 : 11	1943 : 10	1932 : 9	1927 : 7	1930 : 5	
1926 : 5	1924 : 5	1929 : 4	1928 : 4	1925 : 4	
1923 

### Distribution of genres

In [6]:
%timeit movies.dist_by_genres()

49.5 ms ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
movies = Movies(MOVIES_CSV)

for key, value in movies.dist_by_genres().items():
    print(f'{key} : {value}')

print()

Drama : 4361
Comedy : 3756
Thriller : 1894
Action : 1828
Romance : 1596
Adventure : 1263
Crime : 1199
Sci-Fi : 980
Horror : 978
Fantasy : 779
Children : 664
Animation : 611
Mystery : 573
Documentary : 440
War : 382
Musical : 334
Western : 167
IMAX : 158
Film-Noir : 87



### 30 most genres films

In [8]:
%timeit movies.most_genres(30)

35.4 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
movies = Movies(MOVIES_CSV)

print(f'{"Film":<70}Ganres number')
for key, value in movies.most_genres(30).items():
    print(f'{key:<70}{value}')

print()

Film                                                                  Ganres number
Rubber (2010)                                                         10
Patlabor: The Movie (Kidô keisatsu patorebâ: The Movie) (1989)        8
Mulan (1998)                                                          7
Who Framed Roger Rabbit? (1988)                                       7
Osmosis Jones (2001)                                                  7
Interstate 60 (2002)                                                  7
Robots (2005)                                                         7
Pulse (2006)                                                          7
Aqua Teen Hunger Force Colon Movie Film for Theaters (2007)           7
Enchanted (2007)                                                      7
Aelita: The Queen of Mars (Aelita) (1924)                             7
Inception (2010)                                                      7
Tangled (2010)                                     

## Links analysis

### Get imdb information

In [10]:
links = Links(LINKS_CSV)

In [None]:
%timeit links.get_imdb(['1', '3', '5', '7', '15'], ['Director', 'Budget', 'Gross worldwide', 'Runtime'])

In [None]:
fields = ['Director', 'Budget', 'Gross worldwide', 'Runtime']
imdb_info = links.get_imdb(['1', '3', '5', '7', '15'], fields)

print(imdb_info)
print('MovieId', *fields, sep='\t\t')
for movie in imdb_info:
    print(movie, sep='\t\t')

[]
MovieId		Director		Budget		Gross worldwide		Runtime


### Top directors

In [None]:
%timeit -r 1 -n 1 links.top_directors(20)

33.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
top_directors = links.top_directors(20)

print(f'{"Director":<20}Films count')
for key, value in top_directors.items():
    if key is None:
        key = 'Null'
    print(f'{key:<20}{value}')

Director            Films count
1                   17
8                   14
0                   12
9                   12
7                   10
4                   8
6                   6
2                   6
3                   6
5                   2


### Most expensive films

In [15]:
%timeit links.most_expensive(20)

23.9 s ± 544 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
most_expensive = links.most_expensive(20)

print(f'{"Film":<40}Budget')
for key, value in most_expensive.items():
    print(f'{key:<40}{value}')

Film                                    Budget
7                                       Undisputed 2: Last Man Standing
2                                       The King of Masks
4                                       Shallow Hal
1                                       Hear My Song
8                                       Die Schlangengrube
6                                       Capturing the Friedmans


### Most profitable films

In [17]:
%timeit links.most_profitable(20)

ValueError: could not convert string to float: 'Betting on Zero'

In [None]:
most_profitable = links.most_profitable(20)

print(f'{"Film":<40}Profit')
for key, value in most_profitable.items():
    print(f'{key:<40}{value}')

Film                                    Profit
Aliens vs. Predator: Requiem            90290885.0
Law Abiding Citizen                     77944208.0
In the Mood for Love                    14202626.0
Steve Jobs                              4441873.0
Munna Bhai M.B.B.S.                     3137717.0
The Moustache                           3044772.0
Hyena Road                              87769.0
City of Women                           12517.0
Watch Out for the Automobile            10155.0
 -1                                     0.0
Rob Zombie Presents: The Haunted World of El Superbeasto0.0
Ink                                     0.0
Lou Gehrig                              0.0
Diskretni sobar Godfri                  -656001.0
Bring Me the Head of Alfredo Garcia     -1481106.0
Poolhall Junkies                        -3436289.0
Welcome to the Jungle                   -3500001.0
The Whole Ten Yards                     -13829329.0
A Sound of Thunder                      -68334535.0


### Longest films

In [None]:
%timeit links.longest(20)

24 s ± 690 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
longest = links.longest(20)

print(f'{"Film":<40}Runtime')
for key, value in longest.items():
    print(f'{key:<40}{value}')

Film                                    Runtime
Lou Gehrig                              2 hours 8 minutes
Munna Bhai M.B.B.S.                     2 hours 36 minutes
Steve Jobs                              2 hours 2 minutes
City of Women                           2 hours 19 minutes
Hyena Road                              2 hours
Bring Me the Head of Alfredo Garcia     1 hour 52 minutes
A Sound of Thunder                      1 hour 50 minutes
Law Abiding Citizen                     1 hour 49 minutes
Ink                                     1 hour 47 minutes
Poolhall Junkies                        1 hour 39 minutes
The Whole Ten Yards                     1 hour 38 minutes
In the Mood for Love                    1 hour 38 minutes
Welcome to the Jungle                   1 hour 35 minutes
Watch Out for the Automobile            1 hour 34 minutes
Aliens vs. Predator: Requiem            1 hour 34 minutes
Diskretni sobar Godfri                  1 hour 34 minutes
The Moustache                   

### Cost per unit top

In [21]:
%timeit links.top_cost_per_minute(20)




ValueError: could not convert string to float: 

In [25]:
top_cost_per_minute = links.top_cost_per_minute(20)

print(f'{"Film":<40}Cost per minute')
for key, value in top_cost_per_minute.items():
    print(f'{key:<40}{value}')




ValueError: could not convert string to float: 

## Ratings.Movies analysis
### Distribution of ratings count by year

In [18]:
rating = Ratings(RATINGS_CSV, MOVIES_CSV)

In [19]:
%timeit rating.get_movies().dist_by_year()

435 ms ± 6.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
dist_by_year = rating.get_movies().dist_by_year()

print(f'{"Year":<6}Ratings count')
for key, value in dist_by_year.items():
    print(f'{key:<6}{value}')

Year  Ratings count
1998  507
2014  1439
2013  1664
2011  1690
1997  1916
2010  2300
1999  2439
2004  3279
2002  3478
2001  3922
2003  4014
2006  4059
2009  4158
2008  4351
2012  4657
2005  5813
1996  6040
2018  6418
2015  6616
2016  6702
2007  7114
2017  8199
2000  10061


### Distribution of ratings count by rating value

In [None]:
%timeit movies_ratings.dist_by_rating()

260 ms ± 4.44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
dist_by_rating = movies_ratings.dist_by_rating()

print(f'{"Rating value":<15}Ratings count')
for key, value in dist_by_rating.items():
    print(f'{key:<15}{value}')

Rating value   Ratings count
0.5            1370
1.0            2811
1.5            1791
2.0            7551
2.5            5550
3.0            20047
3.5            13136
4.0            26818
4.5            8551
5.0            13211


### Top movies by rating (average)

In [None]:
%timeit movies_ratings.top_by_ratings(30)

281 ms ± 6.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
top_by_ratings = movies_ratings.top_by_ratings(30)

print(f'{"Movie":<75}Average rating')
for key, value in top_by_ratings.items():
    print(f'{key:<75}{value}')

Movie                                                                      Average rating
The Jinx: The Life and Deaths of Robert Durst (2015)                       5.0
Galaxy of Terror (Quest) (1981)                                            5.0
Alien Contamination (1980)                                                 5.0
I'm the One That I Want (2000)                                             5.0
Lesson Faust (1994)                                                        5.0
Assignment, The (1997)                                                     5.0
Mephisto (1981)                                                            5.0
Black Mirror                                                               5.0
Dylan Moran: Monster (2004)                                                5.0
Bill Hicks: Revelations (1993)                                             5.0
My Sassy Girl (Yeopgijeogin geunyeo) (2001)                                5.0
Strictly Sexual (2008)                   

### Top movies by rating (median)

In [None]:
%timeit movies_ratings.top_by_ratings(30, metric=Statistics.median)

296 ms ± 9.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
top_by_ratings = movies_ratings.top_by_ratings(30, metric=Statistics.median)

print(f'{"Movie":<75}Median rating')
for key, value in top_by_ratings.items():
    print(f'{key:<75}{value}')

Movie                                                                      Median rating
The Jinx: The Life and Deaths of Robert Durst (2015)                       5.0
Galaxy of Terror (Quest) (1981)                                            5.0
Alien Contamination (1980)                                                 5.0
Troll 2 (1990)                                                             5.0
I'm the One That I Want (2000)                                             5.0
Chorus Line, A (1985)                                                      5.0
Guess Who's Coming to Dinner (1967)                                        5.0
Children of the Corn IV: The Gathering (1996)                              5.0
Band of Brothers (2001)                                                    5.0
Lesson Faust (1994)                                                        5.0
Assignment, The (1997)                                                     5.0
Mephisto (1981)                           

### Top controversial movies

In [None]:
%timeit movies_ratings.top_controversial(30)

318 ms ± 9.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
top_controversial = movies_ratings.top_controversial(30)

print(f'{"Movie":<75}Rating variance')
for key, value in top_controversial.items():
    print(f'{key:<75}{value}')

Movie                                                                      Rating variance
Troll 2 (1990)                                                             5.42
Ivan's Childhood (a.k.a. My Name is Ivan) (Ivanovo detstvo) (1962)         5.03
The Jinx: The Life and Deaths of Robert Durst (2015)                       5.0
Galaxy of Terror (Quest) (1981)                                            5.0
Alien Contamination (1980)                                                 5.0
I'm the One That I Want (2000)                                             5.0
Assignment, The (1997)                                                     5.0
Mephisto (1981)                                                            5.0
Black Mirror                                                               5.0
Dylan Moran: Monster (2004)                                                5.0
Bill Hicks: Revelations (1993)                                             5.0
My Sassy Girl (Yeopgijeogin geunyeo) (

## Ratings.Users analysis
### Distribution of users by ratings count

In [None]:
users_ratings = Ratings.Users(ratings, movies)

In [None]:
%timeit users_ratings.dist_by_ratings_number()

252 ms ± 4.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
dist_by_ratings_number = users_ratings.dist_by_ratings_number()

print(f'{"User":<8}Number of ratings')
for key, value in dist_by_ratings_number.items():
    print(f'{key:<8}{value}')

User    Number of ratings
53      20
147     20
189     20
194     20
207     20
257     20
278     20
320     20
406     20
431     20
442     20
569     20
576     20
595     20
26      21
37      21
49      21
87      21
157     21
245     21
281     21
293     21
324     21
364     21
439     21
507     21
547     21
549     21
598     21
60      22
118     22
120     22
127     22
138     22
192     22
214     22
407     22
433     22
467     22
478     22
494     22
531     22
544     22
35      23
145     23
163     23
251     23
299     23
329     23
394     23
397     23
423     23
485     23
545     23
568     23
574     23
92      24
175     24
180     24
231     24
289     24
508     24
518     24
55      25
173     25
206     25
228     25
258     25
333     25
360     25
392     25
529     25
25      26
81      26
150     26
158     26
172     26
208     26
218     26
355     26
459     26
515     26
516     26
519     26
548     26
205     27
250     27
296     27
461   

### Distribution of users by ratings values (average)

In [None]:
%timeit users_ratings.dist_by_ratings_values()

237 ms ± 5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
dist_by_ratings_values = users_ratings.dist_by_ratings_values()

print(f'{"User":<8}Average rating value')
for key, value in dist_by_ratings_values.items():
    print(f'{key:<8}{value}')

User    Average rating value
442     1.27
139     2.14
508     2.15
153     2.22
567     2.25
311     2.34
298     2.36
517     2.39
308     2.43
3       2.44
22      2.57
255     2.57
571     2.57
297     2.6
19      2.61
294     2.61
287     2.62
293     2.62
36      2.63
333     2.64
428     2.64
599     2.64
307     2.67
535     2.67
160     2.71
245     2.71
149     2.72
431     2.73
365     2.75
386     2.75
217     2.76
81      2.77
50      2.78
481     2.81
478     2.82
55      2.84
368     2.84
448     2.85
214     2.86
230     2.86
329     2.87
207     2.88
510     2.9
181     2.94
338     2.94
342     2.94
461     2.94
394     2.96
600     2.99
133     3.0
163     3.0
316     3.0
28      3.02
489     3.02
54      3.03
94      3.04
132     3.04
47      3.05
314     3.05
395     3.05
416     3.07
76      3.08
384     3.09
427     3.1
576     3.1
127     3.11
262     3.11
259     3.12
552     3.12
608     3.13
146     3.14
324     3.14
396     3.14
487     3.14
288     3.15
78 

### Distribution of users by ratings values (median)

In [None]:
%timeit users_ratings.dist_by_ratings_values(metric=Statistics.median)

242 ms ± 6.28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
dist_by_ratings_values = users_ratings.dist_by_ratings_values(metric=Statistics.median)

print(f'{"User":<8}Median of rating value')
for key, value in dist_by_ratings_values.items():
    print(f'{key:<8}{value}')

User    Median of rating value
3       0.5
442     1.0
139     2.0
153     2.0
255     2.0
293     2.0
329     2.0
508     2.0
567     2.0
571     2.0
36      2.5
287     2.5
298     2.5
307     2.5
308     2.5
517     2.5
599     2.5
311     2.75
431     2.75
478     2.75
6       3.0
8       3.0
9       3.0
14      3.0
19      3.0
22      3.0
26      3.0
28      3.0
38      3.0
44      3.0
47      3.0
50      3.0
54      3.0
55      3.0
78      3.0
81      3.0
94      3.0
102     3.0
109     3.0
117     3.0
120     3.0
121     3.0
126     3.0
132     3.0
133     3.0
134     3.0
136     3.0
145     3.0
146     3.0
149     3.0
150     3.0
157     3.0
160     3.0
163     3.0
165     3.0
170     3.0
173     3.0
174     3.0
181     3.0
214     3.0
217     3.0
222     3.0
230     3.0
232     3.0
242     3.0
245     3.0
262     3.0
265     3.0
268     3.0
270     3.0
271     3.0
283     3.0
288     3.0
294     3.0
297     3.0
314     3.0
315     3.0
316     3.0
321     3.0
323     3.0
324   

### Top of users by variance of their ratings

In [None]:
%timeit users_ratings.top_by_variance(30)

288 ms ± 9.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
top_by_variance = users_ratings.top_by_variance(30)

print(f'{"User":<8}Variance of ratings')
for key, value in top_by_variance.items():
    print(f'{key:<8}{value}')

User    Variance of ratings
70946   5.42
32892   5.03
131724  5.0
5746    5.0
6835    5.0
3851    5.0
1631    5.0
2075    5.0
176601  5.0
92494   5.0
102217  5.0
27523   5.0
67618   5.0
8804    5.0
26350   5.0
31522   5.0
1140    5.0
6402    5.0
8238    5.0
25887   5.0
34312   5.0
44851   5.0
47736   5.0
50999   5.0
53280   5.0
53355   5.0
53578   5.0
60737   5.0
69211   5.0
69469   5.0


## Tags analysis

### Most words

In [None]:
tags = Tags(TAGS_CSV)

In [None]:
%timeit tags.most_words(30)

1.55 ms ± 40 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
dist_by_release = tags.most_words(30)

print(f'{"Tag":<90}Number of words')
for key, value in dist_by_release.items():
    print(f'{key:<90}{value}')

Tag                                                                                       Number of words
Something for everyone in this one... saw it without and plan on seeing it with kids!     16
the catholic church is the most corrupt organization in history                           10
villain nonexistent or not needed for good story                                          8
It was melodramatic and kind of dumb                                                      7
06 Oscar Nominated Best Movie - Animation                                                 7
Oscar (Best Music - Original Score)                                                       6
Oscar (Best Effects - Visual Effects)                                                     6
stop using useless characters for filler                                                  6
rich guy - poor girl                                                                      5
stop looking at me swan                                         

### Longest

In [None]:
%timeit tags.longest(30)

1 ms ± 17.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
longest = tags.longest(30)

print('Tag\n---------')
for tag in longest:
    print(tag)

Tag
---------
Something for everyone in this one... saw it without and plan on seeing it with kids!
the catholic church is the most corrupt organization in history
villain nonexistent or not needed for good story
r:disturbing violent content including rape
06 Oscar Nominated Best Movie - Animation
stop using useless characters for filler
Academy award (Best Supporting Actress)
Oscar (Best Effects - Visual Effects)
audience intelligence underestimated
r:sustained strong stylized violence
It was melodramatic and kind of dumb
Oscar (Best Music - Original Score)
Oscar (Best Supporting Actress)
start of a beautiful friendship
assassin-in-training (scene)
avant-garde romantic comedy
political right versus left
Everything you want is here
Oscar (Best Cinematography)
r:disturbing violent images
representation of children
Not available from Netflix
stupid is as stupid does
Rita Hayworth can dance!
setting:space/space ship
beautiful cinematography
r:strong bloody violence
coulda been a contender

### Most words and longest

In [None]:
%timeit tags.most_words_and_longest(30)

2.81 ms ± 133 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
most_words_and_longest = tags.most_words_and_longest(30)

print('Tag\n---------')
for tag in most_words_and_longest:
    print(tag)

Tag
---------
r:sustained strong stylized violence
Everything you want is here
Something for everyone in this one... saw it without and plan on seeing it with kids!
r:disturbing violent content including rape
It was melodramatic and kind of dumb
Oscar (Best Music - Original Score)
the catholic church is the most corrupt organization in history
stupid is as stupid does
stop using useless characters for filler
villain nonexistent or not needed for good story
Oscar (Best Effects - Visual Effects)
06 Oscar Nominated Best Movie - Animation
Academy award (Best Supporting Actress)
start of a beautiful friendship
political right versus left


### Most popular

In [None]:
%timeit tags.most_popular(20)

747 µs ± 57.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
most_popular = tags.most_popular(20)

print(f'{"Tag":<30}Usage number')
for key, value in most_popular.items():
    print(f'{key:<30}{value}')

Tag                           Usage number
In Netflix queue              131
atmospheric                   36
superhero                     24
thought-provoking             24
funny                         23
surreal                       23
Disney                        23
religion                      22
sci-fi                        21
dark comedy                   21
psychology                    21
quirky                        21
suspense                      20
twist ending                  19
visually appealing            19
crime                         19
politics                      18
time travel                   16
mental illness                16
music                         16


### Tags with (some word)

In [None]:
word_for_tag = 'history'

In [None]:
%timeit tags.tags_with(word_for_tag)

577 µs ± 18.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
tags_with = tags.tags_with(word_for_tag)

print(f'Tags with {word_for_tag}\n---------')
for tag in tags_with:
    print(tag)

Tags with history
---------
film history
history
the catholic church is the most corrupt organization in history
