* GroupLens MovieLens "latest-small" dataset (2018 release, ~100k ratings): https://grouplens.org/datasets/movielens/latest/
* Readme: http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html


In [1]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine
import sqlite3

In [2]:
# open a connection to the SQLite database file (sqlite3 creates the file if it does not exist)
conn = sqlite3.connect('2018Movies.sqlite')


In [3]:
# grab a cursor and remove any previous copy of the table so the notebook can be re-run from the top
c = conn.cursor()
c.execute('drop table if exists movie_ratings')

<sqlite3.Cursor at 0x10fb9e420>

In [4]:

# Create the destination table: the rating keys (userId, movieId), the rating
# itself, and one 0/1 indicator column per genre. Genre column names use
# underscores (no_genre, Film_Noir, Sci_Fi) so they are plain SQL identifiers.
# `rating` is declared REAL because the ratings contain half-star values
# (e.g. 4.5, 2.5), which an INT declaration misdescribes.
c.execute('''CREATE TABLE movie_ratings
    (id INT PRIMARY KEY,
    userId INT,
    movieId INT,
    rating REAL,
    no_genre INT,
    Action INT,
    Adventure INT,
    Animation INT,
    Children INT,
    Comedy INT,
    Crime INT,
    Documentary INT,
    Drama INT,
    Fantasy INT,
    Film_Noir INT,
    Horror INT,
    IMAX INT,
    Musical INT,
    Mystery INT,
    Romance INT,
    Sci_Fi INT,
    Thriller INT,
    War INT,
    Western INT)''')

<sqlite3.Cursor at 0x10fb9e490>

In [5]:
# persist the schema change (the freshly created movie_ratings table)
conn.commit()


In [6]:
# load the ratings file (columns: userId, movieId, rating, timestamp)
# the MovieLens README states the dataset files are UTF-8 encoded
ratings = pd.read_csv('ml-100k/ratings.csv', encoding='utf-8')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# load the movies file (columns: movieId, title, genres)
# the MovieLens README states the dataset files are UTF-8 encoded; reading them
# as latin-1 would garble accented characters in movie titles
items = pd.read_csv('ml-100k/movies.csv', encoding='utf-8')
items.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# attach each movie's title and genres to its ratings (inner join on movieId)
df = ratings.merge(items, on="movieId")
df.sort_values(by="userId").head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
12117,1,2389,2.0,964983094,Psycho (1998),Crime|Horror|Thriller
12126,1,2395,5.0,964981093,Rushmore (1998),Comedy|Drama
12182,1,2406,4.0,964982310,Romancing the Stone (1984),Action|Adventure|Comedy|Romance
776,1,70,3.0,964982400,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller


In [9]:
# check for null values: count missing entries per column of the merged frame
df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [10]:
# one-hot encode the pipe-delimited `genres` string into one 0/1 column per genre;
# Series.str.get_dummies splits on the separator and returns an indicator frame
# aligned to df's index (this avoids DataFrame.sum(level=...), which was removed
# in pandas 2.0)
genres = df.genres.str.get_dummies(sep='|')
genres.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [11]:
# append the genre indicator columns to the ratings frame, aligned on the row index
df = pd.concat(
    [df, genres],
    axis="columns",
)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,(no genres listed),Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# keep only the feature columns: remove timestamp, title and the raw genres string.
# drop(..., inplace=True) returns None, so assigning its result would leave `data`
# empty; take the returned frame instead and bind both names to it.
df = df.drop(columns=['timestamp', 'title', 'genres'])
data = df

In [13]:
# preview the frame after dropping timestamp, title and genres
df.head()

Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,17,1,4.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# optional CSV export (to_csv returns None when given a path, so no assignment is needed):
# df.to_csv("ml-100k/2018100k.csv")

In [14]:

# rename the genre columns whose labels contain spaces, parentheses or hyphens
# so they match the column names of the movie_ratings table.
# Only columns are mapped: passing index=str would convert every row label to a
# string, and the index is later written to the integer `id` column.
df = df.rename(columns={"(no genres listed)":"no_genre", "Film-Noir":"Film_Noir", "Sci-Fi":"Sci_Fi"})

In [15]:
# append the prepared frame to the existing movie_ratings table,
# writing the DataFrame index into the `id` column
df.to_sql(
    name='movie_ratings',
    con=conn,
    if_exists='append',
    index=True,
    index_label='id',
)
