In [1]:
import numpy as np
import pandas as pd
import sqlite3
import os

In [2]:
def _read_movielens_dat(path, columns):
    """Load one MovieLens ``.dat`` file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the ``.dat`` file.
    columns : list of str
        Column names to assign; they are passed as ``names=``, so the first
        line of the file is read as data, not as a header.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the given column names.
    """
    return pd.read_csv(
        path,
        # A multi-character separator is only supported by the Python engine.
        sep="::",
        names=columns,
        engine="python",
        # Without an explicit encoding Python 3 assumes UTF-8 and fails on any
        # non-ASCII byte. latin-1 maps every byte to a character, so decoding
        # cannot raise.
        # NOTE(review): assumes the files are ISO-8859-1 encoded — confirm.
        encoding="latin-1",
    )


movies = _read_movielens_dat("ml-1m/movies.dat", ["MovieID", "Title", "Genres"])
users = _read_movielens_dat(
    "ml-1m/users.dat", ["UserID", "Gender", "Age", "Occupation", "ZipCode"]
)
ratings = _read_movielens_dat(
    "ml-1m/ratings.dat", ["UserID", "MovieID", "Rating", "Timestamp"]
)

In [3]:
# Preview the first three rows of the movies table to check the parse.
movies.head(3)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [27]:
# Preview the first three rows of the users table to check the parse.
# NOTE(review): the saved output labels the last column "Zip_code", but the
# loading cell names it "ZipCode" — the output looks stale; re-run to confirm.
users.head(3)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117


In [32]:
# Preview the first three rows of the ratings table to check the parse.
ratings.head(3)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [17]:
# Capture the four-digit year from the "(YYYY)" that ends each title.
# expand=False makes the single capture group come back as a Series rather
# than a one-column DataFrame, which is the natural shape for a new column.
# Titles that do not end in "(YYYY)" yield NaN.
movie_years = movies.Title.str.extract(r"\((\d{4})\)$", expand=False)
movie_years.head()

Unnamed: 0,0
0,1995
1,1995
2,1995
3,1995
4,1995


In [19]:
# Store the release year on the movies table as an integer column.
# astype(int) raises if any title failed to match the year pattern, because
# a missing value cannot be cast to int.
movies["Year"] = movie_years.astype(int)
# Show a ten-row preview rather than rendering the whole table.
movies.head(10)

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
5,6,Heat (1995),Action|Crime|Thriller,1995
6,7,Sabrina (1995),Comedy|Romance,1995
7,8,Tom and Huck (1995),Adventure|Children's,1995
8,9,Sudden Death (1995),Action,1995
9,10,GoldenEye (1995),Action|Adventure|Thriller,1995


In [49]:
# Number of rows in the movies table.
len(movies)

3883

In [51]:
# Add the extracted release year under the column name "Years", which the
# following groupby cell uses. Unlike the "Year" column, no .astype(int) is
# applied here, so the values stay as extracted.
# NOTE(review): "Year" and "Years" are both derived from movie_years —
# presumably only one is needed; confirm before removing either.
movies["Years"] = movie_years

In [54]:
# Count how many movies fall under each distinct "Years" value.
movies.groupby("Years").size()

Years
1919      3
1920      2
1921      1
1922      2
1923      3
1925      6
1926      8
1927      6
1928      3
1929      3
1930      7
1931      7
1932      7
1933      7
1934      7
1935      6
1936      8
1937     11
1938      6
1939     11
1940     19
1941     11
1942     13
1943     10
1944     13
1945     11
1946     13
1947     14
1948     12
1949     10
       ... 
1971     26
1972     22
1973     29
1974     28
1975     21
1976     21
1977     22
1978     30
1979     32
1980     41
1981     43
1982     50
1983     35
1984     60
1985     65
1986    104
1987     71
1988     69
1989     60
1990     77
1991     60
1992    102
1993    165
1994    257
1995    342
1996    345
1997    315
1998    337
1999    283
2000    156
Length: 81, dtype: int64

In [70]:
# Count users for every (Age, Gender) pair and lay the result out as a tidy
# three-column table: Age, Gender, Count.
age_gender_sizes = users.groupby(["Age", "Gender"]).size()
tabela = age_gender_sizes.reset_index(name="Count")
tabela

Unnamed: 0,Age,Gender,Count
0,1,F,78
1,1,M,144
2,18,F,298
3,18,M,805
4,25,F,558
5,25,M,1538
6,35,F,338
7,35,M,855
8,45,F,189
9,45,M,361


In [71]:
# Reshape the tidy counts into an Age x Gender grid.
# The arguments are passed by keyword: positional arguments to
# DataFrame.pivot are deprecated and are rejected by pandas 2.0 and later.
tabela.pivot(index="Age", columns="Gender", values="Count")

Gender,F,M
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,78,144
18,298,805
25,558,1538
35,338,855
45,189,361
50,146,350
56,102,278


In [62]:
# Build the Age x Gender count table directly from the users table with
# pd.crosstab; the saved output matches the pivot of tabela shown earlier.
pd.crosstab(users.Age, users.Gender)

Gender,F,M
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,78,144
18,298,805
25,558,1538
35,338,855
45,189,361
50,146,350
56,102,278


In [72]:
# Preview the first five rows of the movies table, including the added year column.
movies.head()

Unnamed: 0,MovieID,Title,Genres,Years
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [94]:
# Collect every distinct genre label that appears in the pipe-delimited
# Genres column. A set comprehension visits each label once; summing the
# per-row lists would build a new, ever-longer list at every step.
movie_genres = {
    genre
    for genre_list in movies.Genres.str.split("|")
    for genre in genre_list
}
movie_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [98]:
def func(x, genres=None):
    """Count how many known genres appear in a pipe-delimited genre string.

    Parameters
    ----------
    x : str
        Genre labels joined by "|", e.g. "Comedy|Romance".
    genres : iterable of str, optional
        Genre labels to look for. Defaults to the notebook-level
        ``movie_genres`` set.

    Returns
    -------
    int
        Number of labels from ``genres`` that occur as whole entries in ``x``.
    """
    if genres is None:
        genres = movie_genres
    # Compare against whole labels, not substrings, so that one label
    # contained inside another can never produce a false match.
    present = set(x.split("|"))
    return sum(genre in present for genre in genres)

In [96]:
# Preview the raw pipe-delimited genre strings for the first five movies.
movies.Genres.head()

0     Animation|Children's|Comedy
1    Adventure|Children's|Fantasy
2                  Comedy|Romance
3                    Comedy|Drama
4                          Comedy
Name: Genres, dtype: object

In [99]:
# Spot-check func on one genre string; it lists three genres and the saved
# output is 3.
func("Adventure|Children's|Fantasy")

3

In [21]:
# Split each genre string on "|" into one column per position; rows with
# fewer genres are padded with missing values in the trailing columns.
movies.Genres.str.split("|", expand=True)

Unnamed: 0,0,1,2,3,4,5
0,Animation,Children's,Comedy,,,
1,Adventure,Children's,Fantasy,,,
2,Comedy,Romance,,,,
3,Comedy,Drama,,,,
4,Comedy,,,,,
5,Action,Crime,Thriller,,,
6,Comedy,Romance,,,,
7,Adventure,Children's,,,,
8,Action,,,,,
9,Action,Adventure,Thriller,,,


In [22]:
# Flatten the per-position genre columns into a single 1-D object array;
# the padding entries (None) are still present at this stage.
movies.Genres.str.split("|", expand=True).values.flatten()

array(['Animation', "Children's", 'Comedy', ..., None, None, None],
      dtype=object)

In [25]:
# Tally how many movies carry each genre label.
# Step 1: one column per genre position, padded with missing values.
genre_columns = movies.Genres.str.split("|", expand=True)
# Step 2: collapse the grid into a single flat Series of labels.
flat_genres = pd.Series(genre_columns.values.flatten())
# Step 3: count occurrences of each label, most frequent first.
flat_genres.value_counts()

Drama          1603
Comedy         1200
Action          503
Thriller        492
Romance         471
Horror          343
Adventure       283
Sci-Fi          276
Children's      251
Crime           211
War             143
Documentary     127
Musical         114
Mystery         106
Animation       105
Fantasy          68
Western          68
Film-Noir        44
dtype: int64

In [26]:
# Preview the first five rows of the ratings table before joining it to movies.
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [27]:
# Preview the first five rows of the movies table before joining it to ratings.
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [41]:
# Attach a title to every rating by joining on MovieID, keeping only the two
# columns needed for the ranking.
result = ratings.merge(movies, on="MovieID")[["Title", "Rating"]]
# Average rating per title, highest first; show the top 14.
# NOTE(review): this is a plain mean with no minimum number of ratings, so
# titles with very few ratings can presumably reach the top — confirm intent.
mean_rating_by_title = result.groupby("Title").mean()
mean_rating_by_title.sort_values(by="Rating", ascending=False).head(14)

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Ulysses (Ulisse) (1954),5.0
Lured (1947),5.0
Follow the Bitch (1998),5.0
Bittersweet Motel (2000),5.0
Song of Freedom (1936),5.0
One Little Indian (1973),5.0
Smashing Time (1967),5.0
Schlafes Bruder (Brother of Sleep) (1995),5.0
"Gate of Heavenly Peace, The (1995)",5.0
"Baby, The (1973)",5.0
