In [1]:
import findspark

findspark.init()
findspark.find()


'C:\\Program Files\\spark\\spark-3.4.1-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType

spark = SparkSession.builder.master("local[*]").appName("MoviesRating").getOrCreate()
spark


In [3]:
ratings = spark.sparkContext.textFile("./movie_data/ratings.dat")

In [4]:
fields = ["UserID", "MovieID", "Rating", "Timestamp"]
schema = StructType([StructField(field, StringType(), True) for field in fields])


In [5]:
ratings_rdd = ratings.map(lambda x: x.split("::")).map(
    lambda x: (x[0], x[1], x[2], x[3])
)


In [6]:
ratings_df = spark.createDataFrame(data=ratings_rdd, schema=schema)
ratings_df.show()


+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
|     1|    914|     3|978301968|
|     1|   3408|     4|978300275|
|     1|   2355|     5|978824291|
|     1|   1197|     3|978302268|
|     1|   1287|     5|978302039|
|     1|   2804|     5|978300719|
|     1|    594|     4|978302268|
|     1|    919|     4|978301368|
|     1|    595|     5|978824268|
|     1|    938|     4|978301752|
|     1|   2398|     4|978302281|
|     1|   2918|     4|978302124|
|     1|   1035|     5|978301753|
|     1|   2791|     4|978302188|
|     1|   2687|     3|978824268|
|     1|   2018|     4|978301777|
|     1|   3105|     5|978301713|
|     1|   2797|     4|978302039|
+------+-------+------+---------+
only showing top 20 rows



In [7]:
usersRdd = spark.sparkContext.textFile("./movie_data/users.dat")

In [8]:
fields = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
schema = StructType([StructField(field, StringType(), True) for field in fields])


In [9]:
# Create DataFrame
row_rdd = usersRdd.map(lambda x: x.split("::")).map(
    lambda x: (x[0], x[1], x[2], x[3], x[4])
)
users = spark.createDataFrame(data=row_rdd, schema=schema)
users.show()


+------+------+---+----------+--------+
|UserID|Gender|Age|Occupation|Zip-code|
+------+------+---+----------+--------+
|     1|     F|  1|        10|   48067|
|     2|     M| 56|        16|   70072|
|     3|     M| 25|        15|   55117|
|     4|     M| 45|         7|   02460|
|     5|     M| 25|        20|   55455|
|     6|     F| 50|         9|   55117|
|     7|     M| 35|         1|   06810|
|     8|     M| 25|        12|   11413|
|     9|     M| 25|        17|   61614|
|    10|     F| 35|         1|   95370|
|    11|     F| 25|         1|   04093|
|    12|     M| 25|        12|   32793|
|    13|     M| 45|         1|   93304|
|    14|     M| 35|         0|   60126|
|    15|     M| 25|         7|   22903|
|    16|     F| 35|         0|   20670|
|    17|     M| 50|         1|   95350|
|    18|     F| 18|         3|   95825|
|    19|     M|  1|        10|   48073|
|    20|     M| 25|        14|   55113|
+------+------+---+----------+--------+
only showing top 20 rows



In [10]:
# load dataset as dataframe

movies = spark.read.text("./movie_data/movies.dat")
movies.show(truncate=False)


+--------------------------------------------------------+
|value                                                   |
+--------------------------------------------------------+
|1::Toy Story (1995)::Animation|Children's|Comedy        |
|2::Jumanji (1995)::Adventure|Children's|Fantasy         |
|3::Grumpier Old Men (1995)::Comedy|Romance              |
|4::Waiting to Exhale (1995)::Comedy|Drama               |
|5::Father of the Bride Part II (1995)::Comedy           |
|6::Heat (1995)::Action|Crime|Thriller                   |
|7::Sabrina (1995)::Comedy|Romance                       |
|8::Tom and Huck (1995)::Adventure|Children's            |
|9::Sudden Death (1995)::Action                          |
|10::GoldenEye (1995)::Action|Adventure|Thriller         |
|11::American President, The (1995)::Comedy|Drama|Romance|
|12::Dracula: Dead and Loving It (1995)::Comedy|Horror   |
|13::Balto (1995)::Animation|Children's                  |
|14::Nixon (1995)::Drama                                

In [11]:
# Split the columns using '::'
from pyspark.sql import functions as f

fields = movies.withColumn("split_data", f.split(f.col("value"), "::"))


In [12]:
# Extract the MovieID, Title, and Genres (Year) columns
extracted_df = fields.select(
    f.col("split_data")[0].alias("MovieID"),
    f.col("split_data")[1].alias("Title"),
    f.col("split_data")[2].alias("Genres"),
)


In [13]:
years_and_genres_df =(
    extracted_df.withColumn('Year', f.split(f.col('Title')))
)

TypeError: split() missing 1 required positional argument: 'pattern'

In [None]:
# spark.stop()