![image.png](attachment:8b79f452-1cc3-432a-a5fe-269e7bfd17d3.png)

In [1]:
from sqlalchemy import create_engine
import pandas as pd

In [2]:
import pymysql
pymysql.install_as_MySQLdb()

In [3]:
from sqlalchemy_utils import database_exists, create_database

In [4]:
# Create the sqlalchemy engine and connection
username = "root"
password = "root" 
# password = quote_plus("Myp@ssword!") # Use the quote function if you have special chars in password
db_name = "movies"
connection = f"mysql+pymysql://root:root@localhost/movies"
engine = create_engine(connection)
conn = engine.connect()

In [5]:
##check if database exists, if not, create it
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created!')

It exists!


# Showing Tables

In [6]:
# Preview the names of all tables 
# I can see that all the ERD tables are showing up here correctly according to instructions and match
q = '''SHOW TABLES;'''
pd.read_sql(q, conn)

Unnamed: 0,Tables_in_movies
0,genres
1,ratings
2,title_basics
3,title_genres


# Importing Data into Notebook

In [7]:
# Read in the ratings csv data and preview
ratings = pd.read_csv('Data/Title_ratings.csv')
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71900 entries, 0 to 71899
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175
2,tt0069049,6.7,7754
3,tt0088751,5.2,336
4,tt0096056,5.6,846


In [8]:
# Read in the title basics dsv data and preview
Title_Basics = pd.read_csv('Data/Title_Basics.csv')
Title_Basics.info()
Title_Basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86979 entries, 0 to 86978
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  int64  
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86979 non-null  int64  
 8   genres          86979 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


______________

# Loading data into tables

__________________________

## - Ratings (table check)

In [9]:
# looking at Database (ERD)/SQL table
# This information will ensure our DataFrame has the correct field names and datatypes before inserting the data into our pre-existing database.
# To easily compare the (ERD/database/SQL table) against our DataFrame (CSV file), we will save the output DataFrame as a variable called "describe."
q = '''DESCRIBE ratings;'''
describe_rt = pd.read_sql(q, conn)
describe_rt

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,int,NO,PRI,,auto_increment
1,average_rating,varchar(45),YES,,,
2,number_of_votes,int,YES,,,


In [10]:
# looking at ERD/database/SQL table
# Let's inspect the names of the fields in the database and compare them to the columns in our DataFrame:
describe_rt['Field'].values

array(['tconst', 'average_rating', 'number_of_votes'], dtype=object)

In [11]:
# Checking dataframe's (csv data) columns
ratings.columns

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')

In [12]:
# Rename columns to match ERD/SQL table  {dataframe(csv):ERD_database}
rename_map = {"averageRating":"average_rating",
             "numVotes":"number_of_votes"}
ratings = ratings.rename(rename_map,axis=1)
ratings.head(2)

Unnamed: 0,tconst,average_rating,number_of_votes
0,tt0035423,6.4,87153
1,tt0062336,6.4,175


In [13]:
# Reviewing (ERD/database/SQL table's) data types (This will compare datatypes in the (ERD)database vs (CSV)our dataframe)
# Note that the datatypes will not be an exact match but a general match. 
# For example, SQL's VARCHAR is equivalent to "object" or "string."
# SQL's Decimal = pandas' float
# SQL's date = pandas' datetime
describe_rt[['Field','Type']]

Unnamed: 0,Field,Type
0,tconst,int
1,average_rating,varchar(45)
2,number_of_votes,int


In [14]:
# Reviewing dataframe's/(csv data) data types
ratings.dtypes

tconst              object
average_rating     float64
number_of_votes      int64
dtype: object

**************** NEED TO UPDATE DTYPES************************

_____________________

# - basics (table check)

In [15]:
# looking at Database (ERD)/SQL table
# This information will ensure our DataFrame has the correct field names and datatypes before inserting the data into our pre-existing database.
# To easily compare the (ERD/database/SQL table) against our DataFrame (CSV file), we will save the output DataFrame as a variable called "describe."
q = '''DESCRIBE Title_Basics;'''
describe = pd.read_sql(q, conn)
describe

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,tconst,int,NO,PRI,,auto_increment
1,primary_title,varchar(45),YES,,,
2,start_year,datetime,YES,,,
3,runtime,int,YES,,,
4,ratings_tconst,int,NO,PRI,,


In [16]:
# looking at title_basics ERD/database/SQL table
# Let's inspect the names of the fields in the database and compare them to the columns in our DataFrame:
describe['Field'].values

array(['tconst', 'primary_title', 'start_year', 'runtime',
       'ratings_tconst'], dtype=object)

In [17]:
# Checking dataframe's (csv data) columns
Title_Basics.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [18]:
# Rename columns to match ERD/SQL table  {dataframe(csv):ERD_database}
rename_map = {"primaryTitle":"primary_title",
             "startYear":"start_year",
             "runtimeMinutes":"runtime"}
Title_Basics = Title_Basics.rename(rename_map,axis=1)
Title_Basics.head(2)

Unnamed: 0,tconst,titleType,primary_title,originalTitle,isAdult,start_year,endYear,runtime,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama


In [19]:
# Reviewing (ERD/database/SQL table's) data types (This will compare datatypes in the (ERD)database vs (CSV)our dataframe)
# Note that the datatypes will not be an exact match but a general match. 
# For example, SQL's VARCHAR is equivalent to "object" or "string."
# SQL's Decimal = pandas' float
# SQL's date = pandas' datetime
describe[['Field','Type']]

Unnamed: 0,Field,Type
0,tconst,int
1,primary_title,varchar(45)
2,start_year,datetime
3,runtime,int
4,ratings_tconst,int


In [20]:
# Reviewing dataframe's/(csv data) data types
# Need this to match the ERD/database
Title_Basics.dtypes

tconst            object
titleType         object
primary_title     object
originalTitle     object
isAdult            int64
start_year       float64
endYear          float64
runtime            int64
genres            object
dtype: object

In [21]:
# Converting start_year to datetime dtype
Title_Basics['start_year'] = pd.to_datetime(Title_Basics['start_year'])
Title_Basics.dtypes

tconst                   object
titleType                object
primary_title            object
originalTitle            object
isAdult                   int64
start_year       datetime64[ns]
endYear                 float64
runtime                   int64
genres                   object
dtype: object

In [25]:
# Converting tconst to datetime dtype
Title_Basics['tconst'] = pd.to_integer(Title_Basics['tconst'])
Title_Basics.dtypes

AttributeError: module 'pandas' has no attribute 'to_integer'

# Inserting Data into the Tables

## - Ratings

In [None]:
# ratings.to_sql("ratings",conn,index=False, if_exists='append')

In [None]:
# confirm the data has been added
#q = """SELECT * FROM ratings;"""
# pd.read_sql(q,conn)

## -Title Basics

In [None]:
# Title_Basics.to_sql("Title_Basics",conn,index=False, if_exists='append')

In [None]:
# confirm the data has been added
#q = """SELECT * FROM Title_Basics;"""
# pd.read_sql(q,conn)