# Here are some of my PySpark codes

For this notebook I am using data from movieLense
https://files.grouplens.org/datasets/movielens/

Check the data preparation section below this Notebook

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
import matplotlib.pyplot as plt
import cufflinks as cf
cf.go_offline()


In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("dataframPractices").getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-02-28 16:45:52,812 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Add the helper file that contains helper class
spark.sparkContext.addPyFile('hdfs:///user/hadoop/movie_helper.py')
from movie_helper import *

# Before loading data you can check if the folder movieLense100k exists on hadoop HDFS
# !hdfs dfs -ls moviesdb/
df = DataModel(spark, "movieLense100k",delimiter=",")

                                                                                

In [5]:
# check number of partitions
dfrating = df.rating.repartition(10)
print(f"number of partitions before repartitioning is {df.rating.rdd.getNumPartitions()} and after repartitioning is {dfrating.rdd.getNumPartitions()}")

number of partitions before repartitioning is 1 and after repartitioning is 10


In [8]:
df.rating.show(2)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
+------+-------+------+---------+
only showing top 2 rows



# 

# Data preparation

In [16]:
df1 = pd.read_csv("data/movieLense100k/users.dat",sep="|",header=None, names=["age","sex","occupation","zip_code"]) 
df1.reset_index(inplace=True)
df1.rename({"index":"user_id"},axis=1,inplace=True)

In [20]:
# Convert to spark dataframe
dfs = spark.createDataFrame (df1[["user_id","sex","age","occupation","zip_code"]])
dfs.show(2)

+-------+---+---+----------+--------+
|user_id|sex|age|occupation|zip_code|
+-------+---+---+----------+--------+
|      0|  M| 24|technician|   85711|
|      1|  F| 53|     other|   94043|
+-------+---+---+----------+--------+
only showing top 2 rows



In [21]:
!hdfs dfs -ls moviesdb/movieLense100k

2022-02-27 01:47:58,834 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 22 items
-rw-r--r--   3 hadoop supergroup       6750 2022-02-23 07:18 moviesdb/movieLense100k/README
-rw-r--r--   3 hadoop supergroup        716 2022-02-23 07:18 moviesdb/movieLense100k/allbut.pl
-rw-r--r--   3 hadoop supergroup        643 2022-02-23 07:18 moviesdb/movieLense100k/mku.sh
-rw-r--r--   3 hadoop supergroup     243704 2022-02-26 17:30 moviesdb/movieLense100k/movies.dat
-rw-r--r--   3 hadoop supergroup    1979173 2022-02-26 17:54 moviesdb/movieLense100k/ratings.dat
-rw-r--r--   3 hadoop supergroup    1979173 2022-02-26 18:47 moviesdb/movieLense100k/ratings.dat1
-rw-r--r--   3 hadoop supergroup        202 2022-02-23 07:18 moviesdb/movieLense100k/u.genre
-rw-r--r--   3 hadoop supergroup         36 2022-02-23 07:18 moviesdb/movieLense100k/u.info
-rw-r--r--   3 hadoop supergroup        193 2022-02-23 07:18 moviesdb/movieL

Delete unused files

In [25]:
files_to_delete = ["moviesdb/movieLense100k/ub.base","moviesdb/movieLense100k/ua.test","moviesdb/movieLense100k/ua.base","moviesdb/movieLense100k/u5.test",
"moviesdb/movieLense100k/u5.base","moviesdb/movieLense100k/u4.test","moviesdb/movieLense100k/u4.base","moviesdb/movieLense100k/u3.test",
"moviesdb/movieLense100k/u3.base","moviesdb/movieLense100k/u2.test","moviesdb/movieLense100k/u2.base","moviesdb/movieLense100k/u1.test","moviesdb/movieLense100k/u1.base"]
sc = spark.sparkContext
# Prepare a FileSystem manager
fs = (sc._jvm.org
      .apache.hadoop
      .fs.FileSystem
      .get(sc._jsc.hadoopConfiguration())
      )
for path in files_to_delete:
      # use the FileSystem manager to remove the path
      fs.delete(sc._jvm.org.apache.hadoop.fs.Path(path), True)


In [28]:
dfs.write.csv("moviesdb/movieLense100k/users")

In [29]:
!hdfs dfs -ls moviesdb/movieLense100k/users

2022-02-27 01:56:16,222 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 9 items
-rw-r--r--   3 hadoop supergroup          0 2022-02-27 01:56 moviesdb/movieLense100k/users/_SUCCESS
-rw-r--r--   3 hadoop supergroup       2768 2022-02-27 01:56 moviesdb/movieLense100k/users/part-00000-1e4ac3f0-fe67-401e-9c03-ef0b8d43b860-c000.csv
-rw-r--r--   3 hadoop supergroup       2836 2022-02-27 01:56 moviesdb/movieLense100k/users/part-00001-1e4ac3f0-fe67-401e-9c03-ef0b8d43b860-c000.csv
-rw-r--r--   3 hadoop supergroup       2832 2022-02-27 01:56 moviesdb/movieLense100k/users/part-00002-1e4ac3f0-fe67-401e-9c03-ef0b8d43b860-c000.csv
-rw-r--r--   3 hadoop supergroup       2813 2022-02-27 01:56 moviesdb/movieLense100k/users/part-00003-1e4ac3f0-fe67-401e-9c03-ef0b8d43b860-c000.csv
-rw-r--r--   3 hadoop supergroup       2819 2022-02-27 01:56 moviesdb/movieLense100k/users/part-00004-1e4ac3f0-fe67-401e-9c03-ef0b8d43b860-c

## Read from remote and write to remote

In [7]:
# Since this is just data preparation and we write data with no header to HdFS, then no need for header name here as well. 

for fil in ["ratings","movies"]:
    rating = spark.read.csv(
        f"hdfs:///user/hadoop/moviesdb/movieLense100k/{fil}.dat",inferSchema=True,sep="\t")
    rating.write.csv(f"hdfs:///user/hadoop/moviesdb/movieLense100k/{fil}", header=False,sep=",")

## Read from local and write to HDFS

In [40]:
!hdfs dfs -rm -r moviesdb/movieLense100k/movies


2022-02-28 06:09:00,087 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Deleted moviesdb/movieLense100k/movies


In [50]:
# local manipulation
dfrating = pd.read_csv("data/movieLense100k/movies.dat", sep="|", encoding="iso-8859-1",header=None)
dfrating.to_csv("data/movieLense100k/movies.dat",index=False,header=None)

In [54]:
dfrating.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,,1,Toy Story (1995),01-Jan-1995,Unnamed: 3,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0.1,0.2,1.1,...,0.6,0.7,0.8,0.9,0.1,0.11,0.12,0.13,0.14,0.15
1,0.0,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2.0,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [59]:
dfrating = dfrating.drop([0,4],axis=1)
dfrating[dfrating[0].isna()]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,,1,Toy Story (1995),01-Jan-1995,Unnamed: 3,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0.1,0.2,1.1,...,0.6,0.7,0.8,0.9,0.1,0.11,0.12,0.13,0.14,0.15


In [42]:
df_movies.write.csv(f"hdfs:///user/hadoop/moviesdb/movieLense100k/movies", header=False)


+----+-----+----------------+
|  id|title|           genre|
+----+-----+----------------+
|null|    1|Toy Story (1995)|
|   0|    2|GoldenEye (1995)|
+----+-----+----------------+
only showing top 2 rows



In [23]:
dfm = pd.read_csv("data/movieLense100k/movies.dat", sep=",", encoding="iso-8859-1",header=None)
dfm[33] = dfm.iloc[:,4:].apply(list,axis=1)
dfm = dfm[[0,1,2,33]]

# otherwise throws error
dfm[2] = pd.to_datetime(dfm[2])

In [26]:
dfm.head()

Unnamed: 0,0,1,2,33
0,1,Toy Story (1995),01-Jan-1995,"[0.0, 0.1, 0.2, 1.1, 1.2, 1.3, 0.3, 0.4, 0.5, ..."
1,2,GoldenEye (1995),01-Jan-1995,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Four Rooms (1995),01-Jan-1995,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,Get Shorty (1995),01-Jan-1995,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
4,5,Copycat (1995),01-Jan-1995,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ..."


In [62]:
from pyspark.sql.types import StructField, IntegerType, DoubleType, StringType, StructType, ArrayType, FloatType, DateType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("release_date", DateType(), True),
    StructField("genre", ArrayType(FloatType()), True),
])
dfmovie2 = spark.createDataFrame(dfm,schema=schema)

In [63]:
dfmovie2.show(2) 

+---+----------------+------------+--------------------+
| id|           title|release_date|               genre|
+---+----------------+------------+--------------------+
|  1|Toy Story (1995)|  1995-01-01|[0.0, 0.1, 0.2, 1...|
|  2|GoldenEye (1995)|  1995-01-01|[0.0, 1.0, 1.0, 0...|
+---+----------------+------------+--------------------+
only showing top 2 rows



In [64]:
!hdfs dfs -rmr -r moviesdb/movieLense100k/movies

2022-02-28 06:45:20,198 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
rmr: DEPRECATED: Please use '-rm -r' instead.
Deleted moviesdb/movieLense100k/movies


In [65]:
# couldn't wirte to CSV because of the genre column
dfmovie2.write.json(f"hdfs:///user/hadoop/moviesdb/movieLense100k/movies")

In [48]:
ddddd = spark.read.json("hdfs:///user/hadoop/moviesdb/movieLense100k/movies")

In [54]:
ddddd.printSchema()

root
 |-- genre: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- movieId: long (nullable = true)
 |-- name: string (nullable = true)
 |-- release_date: string (nullable = true)

