In [4]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.json4s._
import org.json4s.jackson.JsonMethods._
import java.io.PrintWriter
import scala.util.Random


// Build (or reuse) the Spark session that every cell in this notebook runs against.
val spark = {
  val applicationName = "Handling Incomplete Metadata"
  SparkSession.builder().appName(applicationName).getOrCreate()
}

spark = org.apache.spark.sql.SparkSession@32c5af34


org.apache.spark.sql.SparkSession@32c5af34

In [6]:

// Locations in the GCS bucket used by this notebook.
val moviesPath = "gs://spark_learning_1/notebooks/movies.csv"
val metadataJsonPath = "gs://spark_learning_1/notebooks/generated_metadata.json"

// Load the movies CSV, taking column names from its header line.
val moviesDF = spark.read
  .format("csv")
  .option("header", "true")
  .load(moviesPath)


/**
 * UDF that yields a release year (as a string) for a movie title.
 *
 * Returns the first four-digit number found in parentheses in the title.
 * When the title has no such group, or is null, a random year between
 * 1980 and 2023 (inclusive) is returned instead.
 *
 * The UDF is flagged non-deterministic because of the random fallback, so
 * Spark does not assume repeated evaluations give the same value.
 */
val extractYear = {
  // Compiled once here instead of once per row.
  val yearPattern = raw"\((\d{4})\)".r
  val firstFallbackYear = 1980
  val lastFallbackYear = 2023

  val yearOf: String => String = title =>
    Option(title) // a null title would otherwise throw inside findFirstMatchIn
      .flatMap(t => yearPattern.findFirstMatchIn(t))
      .map(_.group(1))
      .getOrElse {
        (firstFallbackYear + Random.nextInt(lastFallbackYear - firstFallbackYear + 1)).toString
      }

  udf(yearOf).asNondeterministic()
}

// Metadata table: movie id and title, plus a releaseYear column computed
// from the title by extractYear.
val metadataDF = moviesDF.select(
  col("movieId"),
  col("title"),
  extractYear(col("title")).as("releaseYear")
)

// Step 4: persist the metadata as JSON on GCS.
val outputPath = "gs://spark_learning_1/notebooks/movie_lens_data/metadata.json"

// coalesce(1) collapses the data to one partition before writing;
// "overwrite" replaces whatever already exists at outputPath.
metadataDF
  .coalesce(1)
  .write
  .format("json")
  .mode("overwrite")
  .save(outputPath)

println(s"Metadata written successfully to $outputPath!")


Metadata written successfully to gs://spark_learning_1/notebooks/movie_lens_data/metadata.json!


lastException = null
moviesPath = gs://spark_learning_1/notebooks/movies.csv
metadataJsonPath = gs://spark_learning_1/notebooks/generated_metadata.json
moviesDF = [movieId: string, title: string ... 1 more field]
extractYear = SparkUserDefinedFunction($Lambda$5422/0x0000000801f44840@75c75fb7,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
metadataDF = [movieId: string, title: string ... 1 more field]
outputPath = gs://spark_learning_1/notebooks/movie_lens_data/metadata.json


gs://spark_learning_1/notebooks/movie_lens_data/metadata.json

In [7]:
// Read the written metadata back as plain text lines.
val metadataRDD = spark.sparkContext.textFile("gs://spark_learning_1/notebooks/movie_lens_data/metadata.json")

// Parse every line as a JSON document with json4s and keep the two string
// fields needed for the join: (movieId, releaseYear).
val parsedMetadataRDD = metadataRDD.map { jsonLine =>
  implicit val formats: Formats = DefaultFormats
  val record = parse(jsonLine)

  def stringField(name: String): String = (record \ name).extract[String]

  (stringField("movieId"), stringField("releaseYear"))
}

// Name the tuple positions so the result can be joined by movieId.
val metadataFromJsonDF = parsedMetadataRDD.toDF("movieId", "releaseYear")


// Count source titles that do not end with a four-digit year in parentheses.
// For a NULL title, rlike evaluates to NULL and filter() drops the row, so
// NULL titles are matched explicitly and counted as missing a year.
val missingYearCount = moviesDF
  .filter(col("title").isNull || !col("title").rlike("\\(\\d{4}\\)$"))
  .count()

// Report the outcome; this check only prints, it does not stop the notebook.
if (missingYearCount > 0) {
  println(s"Validation failed: $missingYearCount movie(s) are missing years in their titles. Please ensure that all movie titles include the release year.")
} else {
  println("Validation passed: All movies have valid release years in their titles.")
}


Validation failed: 797 movie(s) are missing years in their titles. Please ensure that all movie titles include the release year.


metadataRDD = gs://spark_learning_1/notebooks/movie_lens_data/metadata.json MapPartitionsRDD[25] at textFile at <console>:54
parsedMetadataRDD = MapPartitionsRDD[26] at map at <console>:55
metadataFromJsonDF = [movieId: string, releaseYear: string]
missingYearCount = 797


797

In [8]:
// Join the parsed metadata onto moviesDF and append " (releaseYear)" to every
// title that does not already end with a four-digit year in parentheses.
//
// The rewrite is done with column expressions rather than a per-row map:
//  - the "ends with a year" test is the same rlike pattern the validation
//    cells use, so enrichment and validation agree on what counts as valid;
//  - a NULL title is left as NULL instead of throwing;
//  - a movie with no metadata row (NULL releaseYear after the left join) keeps
//    its title unchanged instead of becoming "Title (null)".
val enrichedMoviesDF = {
  val title = col("title")
  val releaseYear = col("releaseYear")

  val endsWithYear = title.rlike("\\(\\d{4}\\)$")
  val nothingToAppend = title.isNull || releaseYear.isNull
  val titleWithYear = concat(title, lit(" ("), releaseYear, lit(")"))

  moviesDF
    .join(metadataFromJsonDF, Seq("movieId"), "left")
    .withColumn("title", when(endsWithYear || nothingToAppend, title).otherwise(titleWithYear))
    .select("movieId", "title", "genres")
}

// Step 5: store the enriched movies on HDFS as Parquet, replacing any
// previous output at the same path.
val outputPath = "hdfs:///user/casestudies/casestudy3/enriched-movies"

enrichedMoviesDF.write
  .format("parquet")
  .mode("overwrite")
  .save(outputPath)

println("Enriched movies data saved successfully!")

Enriched movies data saved successfully!


enrichedMoviesDF = [movieId: string, title: string ... 1 more field]
outputPath = hdfs:///user/casestudies/casestudy3/enriched-movies


hdfs:///user/casestudies/casestudy3/enriched-movies

In [9]:
// Count enriched titles that still do not end with a four-digit year in
// parentheses. For a NULL title, rlike evaluates to NULL and filter() drops
// the row, so NULL titles are matched explicitly and counted as invalid.
val missingYearCount = enrichedMoviesDF
  .filter(col("title").isNull || !col("title").rlike("\\(\\d{4}\\)$"))
  .count()

// The success branch runs a second count() to report the total row count.
if (missingYearCount > 0) {
  println(s"Validation failed: $missingYearCount movie(s) do not have a valid year format (YYYY) in their titles. Please verify the titles.")
} else {
  println(s"Validation passed: All ${enrichedMoviesDF.count()} movie(s) have valid years in their titles.")
}


Validation passed: All 87585 movie(s) have valid years in their titles.


missingYearCount = 0


0