In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
import os
import sys

# Environment for PySpark: JAVA_HOME points at the local Zulu 8 install and
# PYSPARK_PYTHON is set to the interpreter running this notebook (sys.executable).
# The SPARK_HOME override is currently disabled; to enable it, add
#   "SPARK_HOME": r"C:\spark",
# to the mapping below.
os.environ.update(
    {
        "JAVA_HOME": r"C:\Program Files\Zulu\zulu-8",
        "PYSPARK_PYTHON": sys.executable,
    }
)


In [3]:
# Create (or reuse) the notebook's Spark session: master 'local[1]', app name
# 'PySpark_01', plus the driver-memory and shuffle-partition settings below.
session_builder = SparkSession.builder.master('local[1]').appName('PySpark_01')
session_builder = session_builder.config("spark.driver.memory", "2g")
session_builder = session_builder.config("spark.sql.shuffle.partitions", "1")
spark = session_builder.getOrCreate()

In [4]:
# Load amazon.csv: the first row supplies the column names and Spark is asked
# to infer the column types.
df = spark.read.options(header=True, inferSchema=True).csv('amazon.csv')

In [7]:
# Preview the first 5 rows of the loaded data.
df.show(n=5)

+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|        product_name|            category|discounted_price|actual_price|discount_percentage|rating|rating_count|        about_product|             user_id|           user_name|           review_id|        review_title|      review_content|            img_link|        product_link|
+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B07JW9H4J1|Wayona Nylon Brai...|Computers&Accesso...|            ₹399|      ₹1,099|                64%|   4.2|      2

In [12]:
# Print each column's name, type and nullability.
# NOTE(review): in the recorded output every column is typed as string, even
# numeric-looking ones such as rating and rating_count, so they presumably
# need explicit casts before numeric work -- confirm against the data.
df.printSchema()


root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- discounted_price: string (nullable = true)
 |-- actual_price: string (nullable = true)
 |-- discount_percentage: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- rating_count: string (nullable = true)
 |-- about_product: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_content: string (nullable = true)
 |-- img_link: string (nullable = true)
 |-- product_link: string (nullable = true)



In [None]:
# Per-column missing-value counts via pandas.
# Note: toPandas() materialises the entire DataFrame in local memory, so this
# approach is only suitable for small datasets.
pandas_df = df.toPandas()
pandas_df.isna().sum()

product_id             0
product_name           0
category               0
discounted_price       0
actual_price           0
discount_percentage    0
rating                 0
rating_count           2
about_product          0
user_id                0
user_name              0
review_id              0
review_title           0
review_content         0
img_link               0
product_link           0
dtype: int64

In [16]:
# Null-value counts that scale to large datasets.
#
# All columns are counted in a single aggregation (one Spark job) instead of
# launching a separate filter + count job, i.e. a full scan, for every column.
# count() ignores NULLs and when() without otherwise() yields NULL for rows
# that do not match, so each aggregate counts exactly the rows where that
# column is null.
null_counts = df.select(
    [count(when(df[coluna].isNull(), 1)).alias(coluna) for coluna in df.columns]
).first()

# Same "<column> <count>" output, one line per column, as the per-column loop.
for coluna, n_nulls in zip(df.columns, null_counts):
    print(coluna, n_nulls)

product_id 0
product_name 0
category 0
discounted_price 0
actual_price 0
discount_percentage 0
rating 0
rating_count 2
about_product 0
user_id 0
user_name 0
review_id 0
review_title 0
review_content 0
image_link 0
product_link 0


In [15]:
# Rename img_link to the more explicit image_link.
df = df.withColumnRenamed(existing='img_link', new='image_link')

In [None]:
# Upper-case the "ID" suffix of the identifier columns. The chain is wrapped in
# parentheses so it can span several lines without "\" line continuations.
df = (
    df
    .withColumnRenamed('product_id', 'product_ID')
    .withColumnRenamed('user_id', 'user_ID')
    .withColumnRenamed('review_id', 'review_ID')
)

In [18]:
# Project a subset of columns and preview the first 10 rows.
df.select(['product_name', 'category', 'user_name']).show(n=10)

+--------------------+--------------------+--------------------+
|        product_name|            category|           user_name|
+--------------------+--------------------+--------------------+
|Wayona Nylon Brai...|Computers&Accesso...|Manav,Adarsh gupt...|
|Ambrane Unbreakab...|Computers&Accesso...|ArdKn,Nirbhay kum...|
|Sounce Fast Phone...|Computers&Accesso...|Kunal,Himanshu,vi...|
|boAt Deuce USB 30...|Computers&Accesso...|Omkar dhale,JD,HE...|
|Portronics Konnec...|Computers&Accesso...|rahuls6099,Swasat...|
|pTron Solero TB30...|Computers&Accesso...|Jayesh,Rajesh k.,...|
|boAt Micro USB 55...|Computers&Accesso...|Vivek kumar,Amazo...|
|MI Usb Type-C Cab...|Computers&Accesso...|Pavan A H,Jayesh ...|
|TP-Link USB WiFi ...|Computers&Accesso...|Azhar JuMan,Aniru...|
|Ambrane Unbreakab...|Computers&Accesso...|ArdKn,Nirbhay kum...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [22]:
# Select the same three columns, giving each a display-friendly alias.
df.select(
    [
        col(source_name).alias(display_name)
        for source_name, display_name in (
            ('product_name', 'Product Name'),
            ('category', 'Category Name'),
            ('user_name', 'User Name'),
        )
    ]
).show(n=10)

+--------------------+--------------------+--------------------+
|        Product Name|       Category Name|           User Name|
+--------------------+--------------------+--------------------+
|Wayona Nylon Brai...|Computers&Accesso...|Manav,Adarsh gupt...|
|Ambrane Unbreakab...|Computers&Accesso...|ArdKn,Nirbhay kum...|
|Sounce Fast Phone...|Computers&Accesso...|Kunal,Himanshu,vi...|
|boAt Deuce USB 30...|Computers&Accesso...|Omkar dhale,JD,HE...|
|Portronics Konnec...|Computers&Accesso...|rahuls6099,Swasat...|
|pTron Solero TB30...|Computers&Accesso...|Jayesh,Rajesh k.,...|
|boAt Micro USB 55...|Computers&Accesso...|Vivek kumar,Amazo...|
|MI Usb Type-C Cab...|Computers&Accesso...|Pavan A H,Jayesh ...|
|TP-Link USB WiFi ...|Computers&Accesso...|Azhar JuMan,Aniru...|
|Ambrane Unbreakab...|Computers&Accesso...|ArdKn,Nirbhay kum...|
+--------------------+--------------------+--------------------+
only showing top 10 rows



In [23]:
# Aliasing also works on a column taken from the DataFrame itself.
# show() is called without a row count here, so it uses its default.
df.select(
    df.product_name.alias('Name of the product')
).show()

+--------------------+
| Name of the product|
+--------------------+
|Wayona Nylon Brai...|
|Ambrane Unbreakab...|
|Sounce Fast Phone...|
|boAt Deuce USB 30...|
|Portronics Konnec...|
|pTron Solero TB30...|
|boAt Micro USB 55...|
|MI Usb Type-C Cab...|
|TP-Link USB WiFi ...|
|Ambrane Unbreakab...|
|Portronics Konnec...|
|boAt Rugged v3 Ex...|
|AmazonBasics Flex...|
|Portronics Konnec...|
|Portronics Konnec...|
|MI Braided USB Ty...|
|MI 80 cm (32 inch...|
|Ambrane Unbreakab...|
|boAt Type C A325 ...|
|LG 80 cm (32 inch...|
+--------------------+
only showing top 20 rows



In [30]:
# Filter rows with a SQL expression string (where() is an alias of filter()).
df.where('rating = 4.2').show(n=5)

+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|        product_name|            category|discounted_price|actual_price|discount_percentage|rating|rating_count|       about_product|             user_id|           user_name|           review_id|        review_title|      review_content|          image_link|        product_link|
+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B07JW9H4J1|Wayona Nylon Brai...|Computers&Accesso...|            ₹399|      ₹1,099|                64%|   4.2|      24,2

In [32]:
# Filter with Column expressions: keep rows whose rating equals 4.2 or 4.3.
rating_col = col('rating')
df.where((rating_col == 4.2) | (rating_col == 4.3)).show(n=5)

+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|product_id|        product_name|            category|discounted_price|actual_price|discount_percentage|rating|rating_count|       about_product|             user_id|           user_name|           review_id|        review_title|      review_content|          image_link|        product_link|
+----------+--------------------+--------------------+----------------+------------+-------------------+------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|B07JW9H4J1|Wayona Nylon Brai...|Computers&Accesso...|            ₹399|      ₹1,099|                64%|   4.2|      24,2