In [0]:
def file_exists(path):
  """Report whether `path` can be listed through `dbutils.fs.ls`.

  Returns True when the listing call succeeds. Returns False when the call
  raises an exception whose text contains 'java.io.FileNotFoundException'.
  Any other exception is re-raised unchanged.
  """
  try:
    dbutils.fs.ls(path)
  except Exception as err:
    # Only the "file not found" failure is translated into a boolean;
    # everything else propagates to the caller.
    if 'java.io.FileNotFoundException' not in str(err):
      raise
    return False
  return True

import urllib.request

# Remote location of the actors CSV and the file name used for every copy.
actorsUrl = "https://raw.githubusercontent.com/cegladanych/azure_bi_data/main/IMDB_movies/actors.csv"
actorsFile = "actors.csv"

# Target directory, written once without and once with the "dbfs:" scheme.
filePath = "/FileStore/tables/Files/"
dbfsdestination = "dbfs:/FileStore/tables/Files/"

# Scheme-qualified form of the scratch directory the download is written to.
tmp = "file:/tmp/"

dbutils.fs.mkdirs(filePath)

#ACTORS
# Download only when the file is not already present in the target directory:
# fetch it to the local /tmp directory, then move it to the dbfs: destination.
if not file_exists(filePath + actorsFile):
  local_download = "/tmp/" + actorsFile
  urllib.request.urlretrieve(actorsUrl, local_download)
  dbutils.fs.mv(tmp + actorsFile, dbfsdestination + actorsFile)

In [0]:
# Read the actors CSV: the first row is treated as the header and column
# types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(dbfsdestination + actorsFile)
)
#display(df)

In [0]:
 from pyspark.sql.functions import col, explode, regexp_replace, regexp_extract, ifnull, array_contains, count, avg, max, lit, split, nullif

 #fill
# fillna: replace missing values in "job" with the placeholder "unknown".
nonull1 = df.fillna({"job": "unknown"})

# dropna: discard rows that have no value in "characters".
nonull3 = df.dropna(subset=["characters"])

# regexp_replace: strip every "[" and "]" from "characters".
# Written as a raw string so the backslashes reach the regex engine without
# relying on Python passing unrecognised escape sequences through unchanged.
cleared = df.withColumn("characters", regexp_replace(col("characters"), r"[\[\]]", ""))

# split + explode: one output row per entry of "characters".
# The brackets were already removed from `cleared`, so a delimiter that
# contains "[" or "]" can never match there. Split instead on the comma that
# sits between a closing and an opening double quote.
# NOTE(review): assumes entries are double-quoted and comma-separated,
# e.g. "A","B" - confirm against the data.
exploded = cleared.withColumn("character_list", split(col("characters"), r'(?<="),\s*(?=")'))
exploded = exploded.withColumn("exploded_characters", explode(col("character_list")))

# regexp_extract: keep the text before the first " -" (or the whole value
# when there is none).
extracted = cleared.withColumn("main_character", regexp_extract(col("characters"), "^(.*?)( -|$)", 1))

# ifnull: substitute "unknown" where "job" is null.
nonull2 = cleared.withColumn("job", ifnull(col("job"), lit("unknown")))

# nullif: turn the "unknown" placeholder back into null.
with_nulls = nonull2.withColumn("job", nullif(col("job"), lit("unknown")))

# regexp_replace used as a plain replace: "actor" becomes "performer" in "category".
replaced = cleared.withColumn("category", regexp_replace(col("category"), "actor", "performer"))

# array_contains: split "category" on single spaces and flag rows where one
# of the resulting tokens is exactly "writer".
writers = df.withColumn("is_writer", array_contains(split(col("category"), " "), "writer"))


In [0]:
# Use 3 aggregate functions (the ones you find most interesting).

# Per-category statistics. `count`, `avg` and `max` are expected to be the
# pyspark.sql.functions versions imported in an earlier cell, so `max` here is
# the Spark aggregate rather than the Python builtin.
category_stats = [
    count("imdb_name_id").alias("total_actors"),
    avg("ordering").alias("avg_ordering"),
    max("ordering").alias("max_ordering"),
]

aggregated = df.groupBy("category").agg(*category_stats)
aggregated.show()

+-------------------+------------+------------------+------------+
|           category|total_actors|      avg_ordering|max_ordering|
+-------------------+------------+------------------+------------+
|            actress|      133414|2.7871137961533274|          10|
|           producer|      101092| 7.744035136311479|          10|
|             writer|      122793| 6.809052633293429|          10|
|           composer|       66861| 8.345567670241246|          10|
|           director|       88968| 5.096585289092707|          10|
|               self|         909|3.0693069306930694|          10|
|              actor|      222337|2.7522004884477167|          10|
|             editor|       33780| 9.286234458259326|          10|
|    cinematographer|       55423|  8.80094906446782|          10|
|production_designer|        9485|   9.5222983658408|          10|
|    archive_footage|         444|3.7545045045045047|          10|
|      archive_sound|           7|               2.0|         

In [0]:


from pyspark.sql.functions import udf, col, pandas_udf
import pandas as pd
from pyspark.sql.types import IntegerType, StringType

# Create 2 UDFs for the chosen dataset and decide what they should do in the
# context of that data. One function uses the @pandas_udf decorator, the
# other is a standard UDF.

# One function operating on numeric types: int, double
def count_length(characters):
    """Return the summed length of the items obtained by iterating `characters`.

    `None` yields 0. For a plain string each iterated item is a single
    character, so the result equals the length of the string; for a sequence
    of strings it is the total number of characters across all of them.
    """
    return 0 if characters is None else sum(map(len, characters))


# Wrap the plain Python function as a Spark UDF declared to return integers.
count_length_udf = udf(f=count_length, returnType=IntegerType())

# Apply the UDF to "characters" and store the result as "characters_length".
characters_length = count_length_udf(col("characters"))
df = df.withColumn("characters_length", characters_length)


# One function operating on strings.
@pandas_udf(returnType=StringType())
def uppercase_category_udf(category_series: pd.Series) -> pd.Series:
    """Upper-case every string in the incoming pandas Series."""
    upper_cased = category_series.str.upper()
    return upper_cased

# Apply the pandas UDF to "category" and store the result as "category_uppercase".
category_uppercase = uppercase_category_udf(col("category"))
df = df.withColumn("category_uppercase", category_uppercase)
