In [None]:
import sys
import yaml
import time
import argparse
import subprocess
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, LongType, StringType, ArrayType

import warnings

# Suppress every Python warning for the rest of this kernel session.
warnings.filterwarnings('ignore')
# Remove pandas' row/column display limits so previews are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# (Re)load IPython's autoreload extension and switch it to mode 2, so edited
# modules are re-imported before code runs.
%reload_ext autoreload
%autoreload 2

In [None]:
def load_config(path):
    """Load a YAML configuration file.

    Args:
        path: Path of the YAML file to read.

    Returns:
        The parsed YAML document. An empty document (which the YAML parser
        reports as ``None``) is returned as an empty dict, so the result can
        always be unpacked with ``**`` by the caller.
    """
    with open(path, 'r') as stream:
        # NOTE(review): FullLoader can construct more than plain scalars and
        # containers; consider yaml.safe_load if this file can ever come from
        # an untrusted source.
        params = yaml.load(stream, Loader=yaml.FullLoader)
    return {} if params is None else params

def init_spark(app_name='Elastic Search', master='local'):
    """Create (or reuse) a SparkSession and print basic debug information.

    Args:
        app_name: Application name handed to the session builder.
        master: Spark master URL handed to the session builder.

    Returns:
        The session object returned by ``SparkSession.builder...getOrCreate()``.
    """
    # Settings applied to the builder, in this order.
    # NOTE(review): "spark.network.timeout" carries no unit suffix -- confirm
    # the intended unit.
    settings = (
        ("spark.executor.memory", "4G"),
        ("spark.executor.instances", "2"),
        ("spark.network.timeout", "500"),
        ("spark.executor.memoryOverhead", "2G"),
    )

    builder = SparkSession.builder.appName(app_name).master(master)
    for key, value in settings:
        builder = builder.config(key, value)
    spark = builder.getOrCreate()

    sc = spark.sparkContext
    print('Debug -- spark init')
    print('Debug -- version:', sc.version)
    print('Debug -- applicationId:', sc.applicationId)
    print('Debug -- uiWebUrl:', sc.uiWebUrl)
    return spark

def stop_spark(spark):
    """Print a debug line, then stop the SparkContext behind ``spark``.

    Args:
        spark: Session object whose ``sparkContext`` is stopped.
    """
    print('Debug -- spark stop')
    context = spark.sparkContext
    context.stop()

def read_dataset(movies_path, ratings_path, users_path, imdb_path, douban_movies_path, spark=None, **kwargs):
    """Read the movies, ratings, users, IMDb and Douban source files.

    Args:
        movies_path: Headerless ``::``-separated file read with the movies schema.
        ratings_path: Headerless ``::``-separated file read with the ratings schema.
        users_path: Headerless ``::``-separated file read with the users schema.
        imdb_path: File with a header row, read without schema inference.
        douban_movies_path: Comma-separated file with a header row, read with
            schema inference.
        spark: Session used for reading. When omitted, the session returned by
            ``SparkSession.builder.getOrCreate()`` is used, so the function
            does not depend on a notebook-global ``spark`` variable.
        **kwargs: Ignored; allows the whole config dict to be passed with ``**``.

    Returns:
        Tuple ``(users, movies, ratings, imdb, douban_movies)``. Note that this
        order differs from the order of the path arguments.
    """
    if spark is None:
        spark = SparkSession.builder.getOrCreate()

    def read_double_colon(path, schema):
        # Shared reader for the three headerless '::'-separated files.
        return spark.read.csv(path, sep='::', inferSchema=False, header=False, schema=schema)

    ### read movies
    movies_schema = StructType([
        StructField("movie_id", LongType(), True),
        StructField("title", StringType(), True),
        StructField("genre", StringType(), True),
    ])
    movies = read_double_colon(movies_path, movies_schema)

    ### read ratings
    ratings_schema = StructType([
        StructField("user_id", LongType(), True),
        StructField("movie_id", LongType(), True),
        StructField("rating", FloatType(), True),
        StructField("timestamp", LongType(), True),
    ])
    ratings = read_double_colon(ratings_path, ratings_schema)

    ### read users
    users_schema = StructType([
        StructField("user_id", LongType(), True),
        StructField("gender", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("occupation", StringType(), True),
        StructField("zip", StringType(), True),
    ])
    users = read_double_colon(users_path, users_schema)

    ### read imdb datasets
    # NOTE(review): the separator is a raw string, i.e. a literal backslash
    # followed by 't'; this relies on the CSV reader interpreting that as a
    # tab -- confirm against the Spark version in use.
    imdb = spark.read.csv(imdb_path, sep=r'\t', inferSchema=False, header=True)
    # Build the title page URL from the "tconst" column.
    imdb = imdb.withColumn(
        'imdb_url',
        F.concat(F.lit("https://www.imdb.com/title/"), F.col("tconst"), F.lit("/")),
    )

    ### read douban movie datasets
    douban_movies = spark.read.csv(douban_movies_path, sep=',', inferSchema=True, header=True)

    return users, movies, ratings, imdb, douban_movies


def write_parquet_dataset(movies, ratings, users, douban_movies,
                          movies_parquet_path, ratings_parquet_path, users_parquet_path, douban_movies_parquet_path,
                          **kwargs):
    """Write the four DataFrames to Parquet, overwriting existing output.

    Args:
        movies, ratings, users, douban_movies: Frames to persist.
        movies_parquet_path, ratings_parquet_path, users_parquet_path,
        douban_movies_parquet_path: Destination of the matching frame.
        **kwargs: Ignored; allows the whole config dict to be passed with ``**``.

    Returns:
        Always ``True`` once all four writes have been issued.
    """
    # (frame, destination) pairs, written in this order.
    outputs = (
        (movies, movies_parquet_path),
        (ratings, ratings_parquet_path),
        (users, users_parquet_path),
        (douban_movies, douban_movies_parquet_path),
    )
    for frame, destination in outputs:
        frame.write.parquet(destination, mode="overwrite")
    return True

In [None]:
# Load the run settings from es_dev.yaml (a relative path, so it is resolved
# against the kernel's current working directory).
params = load_config("es_dev.yaml")
# Bare expression: the notebook renders the loaded settings as the cell output.
params

In [None]:
# Create the Spark session used by the cells below; init_spark() also prints
# the Spark version, application id and UI URL.
spark = init_spark()

In [None]:
# Read every source dataset; the *_path arguments come from the config dict.
# The unpacking order follows read_dataset's return order (users first).
users, movies, ratings, imdb, douban_movies = read_dataset(**params)

In [None]:
# Preview five rows of the users frame as a pandas DataFrame.
users.limit(5).toPandas()

In [None]:
# Preview five rows of the movies frame as a pandas DataFrame.
movies.limit(5).toPandas()

In [None]:
from pyspark.sql.functions import udf  # no longer used in this cell; kept so any other cell relying on `udf` still works


def extract_genres(column):
    """Return a Column that lower-cases ``column`` and splits it on ``|``.

    Built-in column functions replace the former Python UDF: they stay inside
    Spark and pass a null genre through as null, where the UDF's lambda would
    raise on ``None``.

    Args:
        column: Column object or column name holding the ``|``-joined genres.
    """
    if isinstance(column, str):
        column = F.col(column)
    # split() takes a regular expression, so the pipe has to be escaped.
    return F.split(F.lower(column), r"\|")


# Keep the cell re-runnable: only capture `movies` as the raw frame while its
# genre column is still the unsplit string, so a second run does not feed the
# already-split frame back into the transformation.
if dict(movies.dtypes)["genre"] == "string":
    raw_movies = movies
movies = raw_movies.select("movie_id", "title", extract_genres("genre").alias("genre"))
movies.limit(10).toPandas()

In [None]:
# Preview five rows of the ratings frame as a pandas DataFrame.
ratings.limit(5).toPandas()

In [None]:
# Preview five rows of the Douban movies frame as a pandas DataFrame.
douban_movies.limit(5).toPandas()

In [None]:
# Narrow the Douban frame to the MOVIE_ID, NAME and STORYLINE columns; this
# narrowed frame is what the later write cell persists.
douban_movies=douban_movies.select('MOVIE_ID', 'NAME', 'STORYLINE')

In [None]:
# Persist movies, ratings, users and the Douban frame as Parquet; the
# *_parquet_path destinations come from the config dict. The imdb frame is not
# passed here, so it is not written.
write_parquet_dataset(movies, ratings, users, douban_movies, **params)

In [None]:
# Shell escape: list the S3 prefix with the AWS CLI.
# NOTE(review): presumably where the MovieLens data lives -- confirm against es_dev.yaml.
!aws s3 ls s3://dmetasoul-bucket/demo/movielens/ml-1m/

In [None]:
# Shell escape: list the S3 prefix with the AWS CLI.
# NOTE(review): presumably where the Douban movie data lives -- confirm against es_dev.yaml.
!aws s3 ls s3://dmetasoul-bucket/demo/datasets/moviedata-10m/