In [2]:
import os

# Show the JAVA_HOME value visible to this kernel (prints None when it is unset).
print("JAVA_HOME =", os.getenv("JAVA_HOME"))

JAVA_HOME = None


#### Verified which Python my notebook is using

In [3]:
# Print the path of the Python interpreter that runs this kernel.
import sys
print(sys.executable)

/Users/pernebayarailym/anaconda3/envs/myenv/bin/python


#### Ensured PySpark installs in the same Python environment my notebook is using

In [4]:
import sys
!{sys.executable} -m pip install pyspark



In [5]:
# Confirm that PySpark imports in this kernel and show the installed version.
import pyspark
print(pyspark.__version__)

4.0.1


##### Installed apache-spark with Homebrew using the commands below because Spark could not be found; I then manually set SPARK_HOME to the path printed by the second command:

1) brew install apache-spark
2) brew --prefix apache-spark



In [6]:
import os

from pyspark.sql import SparkSession

# Point JAVA_HOME at the Homebrew OpenJDK 17 install only when the variable is
# not already set, so a JAVA_HOME exported by the shell or the conda
# environment is respected instead of being overwritten.
os.environ.setdefault(
    'JAVA_HOME',
    '/opt/homebrew/opt/openjdk@17/libexec/openjdk.jdk/Contents/Home',
)

# Create the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("MyApp_cleaning")
    .getOrCreate()
)

print(spark.version)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/29 14:50:23 WARN Utils: Your hostname, MacBook-Air--Arailym.local, resolves to a loopback address: 127.0.0.1; using 172.20.10.2 instead (on interface en0)
25/09/29 14:50:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/29 14:50:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


4.0.1


In [7]:
# Import necessary PySpark modules
from pyspark.sql import SparkSession #is the entry point to PySpark
from pyspark.sql import functions as F 
from pyspark.sql import types as T

In [8]:
# Load the sales CSV: the first line supplies the column names and schema
# inference is turned off, so every column is read as a string.
csv_path = '/Users/pernebayarailym/Documents/Portfolio_Projects_AP/Simplon_DE_Projects/Python_Projects/Project_Pyspark_DBT/data/ventes.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=False)

# Inspect the resulting schema and a small sample of rows.
df.printSchema()
df.show(5)

root
 |-- id_transaction: string (nullable = true)
 |-- client_nom: string (nullable = true)
 |-- client_age: string (nullable = true)
 |-- client_ville: string (nullable = true)
 |-- produit_nom: string (nullable = true)
 |-- produit_categorie: string (nullable = true)
 |-- produit_marque: string (nullable = true)
 |-- prix_catalogue: string (nullable = true)
 |-- magasin_nom: string (nullable = true)
 |-- magasin_type: string (nullable = true)
 |-- magasin_region: string (nullable = true)
 |-- date: string (nullable = true)
 |-- quantite: string (nullable = true)
 |-- montant_total: string (nullable = true)

+--------------+----------+----------+------------+----------------+-----------------+--------------+--------------+--------------+------------+--------------------+----------+--------+-------------+
|id_transaction|client_nom|client_age|client_ville|     produit_nom|produit_categorie|produit_marque|prix_catalogue|   magasin_nom|magasin_type|      magasin_region|      date|quanti

In [9]:
# Task 1: normalize string columns (strip leading/trailing spaces, then lowercase).
# Detect which columns hold strings.
string_cols = [f.name for f in df.schema.fields if isinstance(f.dataType, T.StringType)]

# Rewrite every string column in a single projection. Calling withColumn() in
# a loop adds one projection per column and inflates the query plan.
df = df.withColumns({c: F.lower(F.trim(F.col(c))) for c in string_cols})
df.show(5)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------+----------+----------+------------+----------------+-----------------+--------------+--------------+--------------+------------+--------------------+----------+--------+-------------+
|id_transaction|client_nom|client_age|client_ville|     produit_nom|produit_categorie|produit_marque|prix_catalogue|   magasin_nom|magasin_type|      magasin_region|      date|quantite|montant_total|
+--------------+----------+----------+------------+----------------+-----------------+--------------+--------------+--------------+------------+--------------------+----------+--------+-------------+
|             1|     alice|        25|       paris|      ordinateur|     informatique|          dell|           800| boutique lyon|    physique|auvergne-rhône-alpes|2023-03-12|       2|         NULL|
|             2|       bob|        34|        lyon|      smartphone|       téléphonie|         apple|          1200| boutique lyon|    physique|auvergne-rhône-alpes|2023-01-27|       5|         NULL|


                                                                                

In [10]:
# Replace empty strings with None (NULL) so that blank values count as missing.
# All columns are rewritten in one projection rather than one withColumn() call
# per column.
df = df.withColumns({
    c: F.when(F.col(c) == "", None).otherwise(F.col(c)) for c in df.columns
})

# Count the non-NULL values of every column in a single pass. A column that is
# NULL in every row would make na.drop(how="any") discard the whole dataset,
# so such columns are reported and left out of the NULL check.
non_null_counts = (
    df.select([F.count(F.col(c)).alias(c) for c in df.columns]).first().asDict()
)
empty_cols = [c for c in df.columns if non_null_counts[c] == 0]
if empty_cols:
    print("Columns with no values at all (ignored when dropping rows):", empty_cols)

# Drop rows that have a NULL in any column that actually contains data.
required_cols = [c for c in df.columns if c not in empty_cols]
df = df.na.drop(how="any", subset=required_cols)
df.show(5)

+--------------+----------+----------+------------+-----------+-----------------+--------------+--------------+-----------+------------+--------------+----+--------+-------------+
|id_transaction|client_nom|client_age|client_ville|produit_nom|produit_categorie|produit_marque|prix_catalogue|magasin_nom|magasin_type|magasin_region|date|quantite|montant_total|
+--------------+----------+----------+------------+-----------+-----------------+--------------+--------------+-----------+------------+--------------+----+--------+-------------+
+--------------+----------+----------+------------+-----------+-----------------+--------------+--------------+-----------+------------+--------------+----+--------+-------------+

