
# Reading Files from Data Lake [MinIO] ~ Kubernetes Object-Storage
> *the most widely used and recommended approach: funnel your files into the data lake*


<br>
  
<img width="1200px" src ='https://owshqblobstg.blob.core.windows.net/stgfiles/png_files/dl_delta_1.png'>
  
<br>


> * generate new files to land in storage ~ data lake  
`python3.9 cli.py 'minio-movies'`

In [0]:
# Configure the S3A connector at the notebook level so Spark can reach the
# MinIO data lake. The same settings can instead be placed in the cluster's
# Spark config, where every key below takes an extra "spark.hadoop." prefix.
#
# Data can be spread across different locations; these parameters point the
# s3a protocol at one specific MinIO deployment.
import os

# Credentials are read from the environment so they are never stored in the
# notebook. A KeyError here means MINIO_ACCESS_KEY / MINIO_SECRET_KEY has not
# been exported on the cluster. The endpoint is not a secret, so it keeps the
# previous value as its default and can be overridden with MINIO_ENDPOINT.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://143.244.222.95")
minio_access_key = os.environ["MINIO_ACCESS_KEY"]
minio_secret_key = os.environ["MINIO_SECRET_KEY"]

s3a_settings = {
    "fs.s3a.endpoint": minio_endpoint,
    "fs.s3a.access.key": minio_access_key,
    "fs.s3a.secret.key": minio_secret_key,
    # Keys written straight into the Hadoop configuration must NOT carry the
    # "spark.hadoop." prefix: Spark only strips that prefix from SparkConf
    # entries, so a prefixed key set here is never read by the S3A connector.
    "fs.s3a.path.style.access": "True",
    "fs.s3a.fast.upload": "True",
    "fs.s3a.connection.ssl.enabled": "False",
    "fs.s3a.connection.timeout": "10000",
    "fs.s3a.connection.establish.timeout": "5000",
    "fs.s3a.attempts.maximum": "3",
    "fs.s3a.disable.chunked.encoding": "True",
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}

# Apply every setting to the Hadoop configuration behind the active SparkContext.
hadoop_conf = sc._jsc.hadoopConfiguration()
for conf_key, conf_value in s3a_settings.items():
    hadoop_conf.set(conf_key, conf_value)

In [0]:
# tip: disable ADLS Gen2 credential passthrough before accessing this storage,
# otherwise compatibility issues arise

# the landing zone of the data lake holds the raw files in json format
# movies and ratings are the datasets used to model our business

# movies: load the json files, then expose the dataframe to the spark sql engine
df_movies = spark.read.format("json").load("s3a://landing/movies/*.json")
df_movies.createOrReplaceTempView("vw_movies")

# ratings: same steps, so both datasets can be queried with sql statements
df_ratings = spark.read.format("json").load("s3a://landing/ratings/*.json")
df_ratings.createOrReplaceTempView("vw_ratings")

In [0]:
%sql

-- df_movies and df_ratings were shared with the sql engine as the temp views
-- vw_movies and vw_ratings (createOrReplaceTempView), so they can be queried here
-- some transformation and munging is applied before the data is stored in bronze:
-- keep the movie attributes of interest and attach every rating whose user_id
-- matches the user_id on the movie row

SELECT
    m.id,
    m.imdb_id,
    m.user_id,
    m.title,
    m.genres,
    m.status,
    m.release_date,
    m.original_language,
    m.popularity,
    m.production_companies,
    m.production_countries,
    m.vote_count,
    r.rating,
    m.dt_current_timestamp
FROM vw_movies m
JOIN vw_ratings r
  ON r.user_id = m.user_id

id,imdb_id,user_id,title,genres,status,release_date,original_language,popularity,production_companies,production_countries,vote_count,rating,dt_current_timestamp
41869,tt0091106,225,The George McKenna Story,"[{'id': 18, 'name': 'Drama'}]",Released,1986-11-11,en,0.345596,[],[],3.0,4.0,1660572699762
37232,tt0378906,845,Travellers and Magicians,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}]",Released,2003-09-09,en,0.635804,"[{'name': 'Prayer Flag Pictures', 'id': 72591}]","[{'iso_3166_1': 'BT', 'name': 'Bhutan'}, {'iso_3166_1': 'GB', 'name': 'United Kingdom'}, {'iso_3166_1': 'AU', 'name': 'Australia'}]",9.0,2.0,1660572699762
37232,tt0378906,845,Travellers and Magicians,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}]",Released,2003-09-09,en,0.635804,"[{'name': 'Prayer Flag Pictures', 'id': 72591}]","[{'iso_3166_1': 'BT', 'name': 'Bhutan'}, {'iso_3166_1': 'GB', 'name': 'United Kingdom'}, {'iso_3166_1': 'AU', 'name': 'Australia'}]",9.0,4.0,1660572699762
37232,tt0378906,845,Travellers and Magicians,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}]",Released,2003-09-09,en,0.635804,"[{'name': 'Prayer Flag Pictures', 'id': 72591}]","[{'iso_3166_1': 'BT', 'name': 'Bhutan'}, {'iso_3166_1': 'GB', 'name': 'United Kingdom'}, {'iso_3166_1': 'AU', 'name': 'Australia'}]",9.0,1.0,1660572699762
11816,tt0082846,470,On Golden Pond,"[{'id': 18, 'name': 'Drama'}]",Released,1981-12-04,en,12.252186,"[{'name': 'Universal Pictures', 'id': 33}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",101.0,2.0,1660572699762
11816,tt0082846,470,On Golden Pond,"[{'id': 18, 'name': 'Drama'}]",Released,1981-12-04,en,12.252186,"[{'name': 'Universal Pictures', 'id': 33}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",101.0,4.0,1660572699762
35896,tt0041172,245,Beyond the Forest,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name': 'Thriller'}]",Released,1949-10-21,en,0.913724,"[{'name': 'Warner Bros.', 'id': 6194}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",9.0,3.0,1660572699762
14785,tt0109206,178,Bandit Queen,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}]",Released,1994-09-09,en,0.976798,"[{'name': 'Channel Four Films', 'id': 181}, {'name': 'Kaleidoscope Entertainment', 'id': 2141}, {'name': 'Kaleidoscope Productions', 'id': 38210}]","[{'iso_3166_1': 'IN', 'name': 'India'}, {'iso_3166_1': 'GB', 'name': 'United Kingdom'}]",11.0,4.0,1660572699762
14785,tt0109206,178,Bandit Queen,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'name': 'Foreign'}]",Released,1994-09-09,en,0.976798,"[{'name': 'Channel Four Films', 'id': 181}, {'name': 'Kaleidoscope Entertainment', 'id': 2141}, {'name': 'Kaleidoscope Productions', 'id': 38210}]","[{'iso_3166_1': 'IN', 'name': 'India'}, {'iso_3166_1': 'GB', 'name': 'United Kingdom'}]",11.0,4.0,1660572699762
431435,tt5778384,892,Blockbuster,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}]",Released,2017-07-13,ru,0.354543,"[{'name': 'Hype Film', 'id': 71156}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",3.0,4.0,1660572699762


In [0]:
# the same query, this time issued from pyspark through spark.sql
# spark.sql is available in every language and runs on the same engine
# behind the scenes, so there are no performance implications
movies_ratings_query = """
SELECT mo.id,
       mo.imdb_id,
       mo.user_id,
       mo.title,
       mo.genres,
       mo.status,
       mo.release_date,
       mo.original_language,
       mo.popularity,
       mo.production_companies,
       mo.production_countries,
       mo.vote_count,
       rt.rating,
       mo.dt_current_timestamp AS dt_current_timestamp 
FROM vw_movies AS mo
INNER JOIN vw_ratings AS rt
ON mo.user_id = rt.user_id
"""

# keep the result as a dataframe for the write steps further down
df_movies_dataset = spark.sql(movies_ratings_query)

In [0]:
# render the joined movies/ratings dataframe as a table in the notebook output
display(df_movies_dataset)

id,imdb_id,user_id,title,genres,status,release_date,original_language,popularity,production_companies,production_countries,vote_count,rating,dt_current_timestamp
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,4.5,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,3.5,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,5.0,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,4.0,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,4.0,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,4.0,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,4.0,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,3.5,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,4.0,1648567988026
209103,tt2444092,53,The Jungle,"[{'id': 53, 'name': 'Thriller'}]",Released,2013-06-30,en,1.164692,[],"[{'iso_3166_1': 'AU', 'name': 'Australia'}]",8.0,4.0,1648567988026



<br>
  
<img width="400px" src ='https://brzluanmoreno.blob.core.windows.net/stgfiles/png_files/dl_pq.png'>
  
<br>

In [0]:
# persist the dataset to the processing zone in parquet format
# overwrite mode replaces whatever is already stored at the target path
(
    df_movies_dataset.write
    .format("parquet")
    .mode("overwrite")
    .save("/mnt/processing/parquet/batch/movies")
)


<br>
  
<img width="900px" src ='https://brzluanmoreno.blob.core.windows.net/stgfiles/png_files/dl_delta_bronze0.png'>
  
<br>


In [0]:
# persist the dataset to the delta architecture zone (bronze) in delta format
# overwrite mode replaces whatever is already stored at the target path
(
    df_movies_dataset.write
    .format("delta")
    .mode("overwrite")
    .save("dbfs:/mnt/owshq/delta/batch/bronze/movies")
)

In [0]:
# count the rows of the joined movies/ratings dataset
# NOTE(review): the figures below look like row counts noted on earlier runs as
# more files arrived in the lake ('.' presumably a thousands separator) — confirm
# 696
# 864
# 1.904
# 2.996
# 3.331
# 102.820
df_movies_dataset.count()