In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install findspark

In [None]:
!pip install pyspark

In [15]:
import findspark
from pyspark.sql.functions import *
# findspark.init()
from pyspark.sql import SparkSession
# Create a SparkSession
spark = SparkSession.builder.getOrCreate()
# spark = SparkSession.builder.appName("app_name").getOrCreate()
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
df = spark.read.format('csv').option('header', True)\
          .option('inferSchema', True)\
          .option('sep', ',')\
          .load('/content/drive/MyDrive/DATA_ENGINEER/DATABRICKS/AAAA_________IAM___CODING_____Colab Notebooks/Learning/datasset/AFCON/Africa Cup of Nations Players.csv')

df.printSchema()
df.show(10)


root
 |-- _c0: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- ShirtNumber: string (nullable = true)
 |-- PlayerPosition: string (nullable = true)
 |-- PlayerName: string (nullable = true)
 |-- DateofBirth(age): string (nullable = true)
 |-- Caps: string (nullable = true)
 |-- Goals: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Country: string (nullable = true)

+---+----+-----------+--------------+--------------------+--------------------+----+-----+---------+-------+
|_c0|Year|ShirtNumber|PlayerPosition|          PlayerName|    DateofBirth(age)|Caps|Goals|     Club|Country|
+---+----+-----------+--------------+--------------------+--------------------+----+-----+---------+-------+
|  0|1957|       NULL|            GK|            Ali Bakr|                NULL|NULL| NULL|  Zamalek|  Egypt|
|  1|1957|       NULL|            GK|"Paraskos ""Brasc...|                NULL|NULL| NULL| El-Qanah|  Egypt|
|  2|1957|       NULL|            GK| Abdel-Galil

## Remove space in columns and rows

In [None]:
# Create a list of tuples where the first element is the old column name and the second element is the new column name
rename_columns = [(c, c.strip()) for c in df.columns]

# Use the withColumnRenamed method with a dictionary to rename multiple columns

for current_name, new_name in rename_columns:
    df = df.withColumnRenamed(current_name, new_name)
    df = df.withColumn(new_name, trim(df[new_name]))


# Print the schema of the renamed DataFrame
df.printSchema()
df.show(10)

## Create df with array column

In [9]:
array_appliance = [
    ('Loic', ['TV', 'Refigerator', 'Oven', 'AC']),
    ('Andre', ['Refigerator', 'Washing Machine']),
    ('Assogba', ['TV', 'Refigerator', 'Computer']),
    ('XXXXXX', None),
]

df_app = spark.createDataFrame(data=array_appliance, schema=['name', 'Appliances'])

df_app.printSchema()

df_app.show()

root
 |-- name: string (nullable = true)
 |-- Appliances: array (nullable = true)
 |    |-- element: string (containsNull = true)

+-------+--------------------+
|   name|          Appliances|
+-------+--------------------+
|   Loic|[TV, Refigerator,...|
|  Andre|[Refigerator, Was...|
|Assogba|[TV, Refigerator,...|
| XXXXXX|                NULL|
+-------+--------------------+



## Create df with array map

In [17]:
import json
data_json = [
    {
        "Order ID": "GJVSZXTB5M1685168956",
        "Customer Name": "Bertrand Le Renault",
        "Mobile Model": "Apple/iPhone 11 Pro Max/Silver/4GB/512 GB",
        "Quantity": 2,
        "Price per Unit": "1527",
        "Total Price": 3054,
        "Promotion Code": None,
        "Order Amount": 3054,
        "Tax": 610.8000000000001,
        "Order Date": "2020-01-02",
        "Payment Status": "Paid",
        "Shipping Status": "Returned",
        "Payment Method": "Debit Card",
        "Payment Provider": "Master",
        "Phone": "03 25 40 48 49",
        "Delivery Address": "avenue Laetitia Bouvet\n86226 Blanchet"
    },
    {
        "Order ID": "Z141RRR17K1685168956",
        "Customer Name": "St\u00e9phane-Ren\u00e9 Roy",
        "Mobile Model": "Motorola/G6 Play/Fine Gold/3 GB/32 GB",
        "Quantity": 1,
        "Price per Unit": "132",
        "Total Price": 132,
        "Promotion Code": { "fafafafafafa":
            {'TV': 'LG', 'Refigerator': 'Samsung', 'Oven': 'Philipps', 'AC': {'TV': 'LG', 'Refigerator': 'Samsung', 'Oven': 'Philipps', 'AC': 'Voltas'}}
        },
        "Order Amount": 112.2,
        "Tax": 22.44,
        "Order Date": "2020-01-02",
        "Payment Status": "Pending",
        "Shipping Status": "Delivered",
        "Payment Method": "Credit Card",
        "Payment Provider": "American Express",
        "Phone": "04 87 43 61 98",
        "Delivery Address": "73, rue Marc Voisin\n50361 LebonVille"
    },]
map_brand = [
    ('EEEE', None),
    ('Loic', {'TV': 'LG', 'Refigerator': 'Samsung', 'Oven': 'Philipps', 'AC': 'Voltas'}),
    ('Andre', {'Refigerator': 'Ikea', 'Washing Machine': 'Philipps',}),
    ('Assogba', {'TV': 'Samsung', 'Refigerator': 'Sony', 'Computer': 'HP'}),
    ('David', data_json[0]),
    ('USERRR', data_json[1]),
]

df_brand = spark.createDataFrame(data=map_brand, schema=['name', 'Brand'])

df_brand.printSchema()

df_brand.show()




root
 |-- name: string (nullable = true)
 |-- Brand: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+-------+--------------------+
|   name|               Brand|
+-------+--------------------+
|   EEEE|                NULL|
|   Loic|{AC -> Voltas, TV...|
|  Andre|{Refigerator -> I...|
|Assogba|{Computer -> HP, ...|
|  David|{Total Price -> 3...|
| USERRR|{Total Price -> 1...|
+-------+--------------------+



*Explode provides us to don't show null value in column*

*Explode outer provides us to show null value in column*

## Explode Array

In [20]:
df_app.select('*', explode(df_app.Appliances)).show()
df_app.select(df_app.name, explode(df_app.Appliances)).show()

+-------+--------------------+---------------+
|   name|          Appliances|            col|
+-------+--------------------+---------------+
|   Loic|[TV, Refigerator,...|             TV|
|   Loic|[TV, Refigerator,...|    Refigerator|
|   Loic|[TV, Refigerator,...|           Oven|
|   Loic|[TV, Refigerator,...|             AC|
|  Andre|[Refigerator, Was...|    Refigerator|
|  Andre|[Refigerator, Was...|Washing Machine|
|Assogba|[TV, Refigerator,...|             TV|
|Assogba|[TV, Refigerator,...|    Refigerator|
|Assogba|[TV, Refigerator,...|       Computer|
+-------+--------------------+---------------+

+-------+---------------+
|   name|            col|
+-------+---------------+
|   Loic|             TV|
|   Loic|    Refigerator|
|   Loic|           Oven|
|   Loic|             AC|
|  Andre|    Refigerator|
|  Andre|Washing Machine|
|Assogba|             TV|
|Assogba|    Refigerator|
|Assogba|       Computer|
+-------+---------------+



## Explode Map

In [14]:
dfexplode_map = df_brand.select(df_brand.name, explode(df_brand.Brand)).show(200)

+-------+----------------+--------------------+
|   name|             key|               value|
+-------+----------------+--------------------+
|   Loic|              AC|              Voltas|
|   Loic|              TV|                  LG|
|   Loic|            Oven|            Philipps|
|   Loic|     Refigerator|             Samsung|
|  Andre|     Refigerator|                Ikea|
|  Andre| Washing Machine|            Philipps|
|Assogba|        Computer|                  HP|
|Assogba|              TV|             Samsung|
|Assogba|     Refigerator|                Sony|
|  David|     Total Price|                3054|
|  David|  Payment Status|                Paid|
|  David|        Order ID|GJVSZXTB5M1685168956|
|  David|        Quantity|                   2|
|  David|             Tax|   610.8000000000001|
|  David|    Order Amount|                3054|
|  David|    Mobile Model|Apple/iPhone 11 P...|
|  David|Payment Provider|              Master|
|  David|  Price per Unit|              

## Position Outer to get Null Value   

In [18]:
df_brand.select(df_brand.name, explode_outer(df_brand.Brand)).show()

+-------+----------------+--------------------+
|   name|             key|               value|
+-------+----------------+--------------------+
|   EEEE|            NULL|                NULL|
|   Loic|              AC|              Voltas|
|   Loic|              TV|                  LG|
|   Loic|            Oven|            Philipps|
|   Loic|     Refigerator|             Samsung|
|  Andre|     Refigerator|                Ikea|
|  Andre| Washing Machine|            Philipps|
|Assogba|        Computer|                  HP|
|Assogba|              TV|             Samsung|
|Assogba|     Refigerator|                Sony|
|  David|     Total Price|                3054|
|  David|  Payment Status|                Paid|
|  David|        Order ID|GJVSZXTB5M1685168956|
|  David|        Quantity|                   2|
|  David|             Tax|   610.8000000000001|
|  David|    Order Amount|                3054|
|  David|    Mobile Model|Apple/iPhone 11 P...|
|  David|Payment Provider|              

## Position Explodee

In [19]:
df_brand.select(df_brand.name, explode_outer(df_brand.Brand)).show()

+-------+--------------------+----------------+--------------------+
|   name|               Brand|             key|               value|
+-------+--------------------+----------------+--------------------+
|   EEEE|                NULL|            NULL|                NULL|
|   Loic|{AC -> Voltas, TV...|              AC|              Voltas|
|   Loic|{AC -> Voltas, TV...|              TV|                  LG|
|   Loic|{AC -> Voltas, TV...|            Oven|            Philipps|
|   Loic|{AC -> Voltas, TV...|     Refigerator|             Samsung|
|  Andre|{Refigerator -> I...|     Refigerator|                Ikea|
|  Andre|{Refigerator -> I...| Washing Machine|            Philipps|
|Assogba|{Computer -> HP, ...|        Computer|                  HP|
|Assogba|{Computer -> HP, ...|              TV|             Samsung|
|Assogba|{Computer -> HP, ...|     Refigerator|                Sony|
|  David|{Total Price -> 3...|     Total Price|                3054|
|  David|{Total Price -> 3...|  Pa

In [26]:
# array
df_app.select('*', posexplode_outer(df_app.Appliances)).show()

# map
df_brand.select('*', posexplode_outer(df_brand.Brand)).show(200)

+-------+--------------------+----+---------------+
|   name|          Appliances| pos|            col|
+-------+--------------------+----+---------------+
|   Loic|[TV, Refigerator,...|   0|             TV|
|   Loic|[TV, Refigerator,...|   1|    Refigerator|
|   Loic|[TV, Refigerator,...|   2|           Oven|
|   Loic|[TV, Refigerator,...|   3|             AC|
|  Andre|[Refigerator, Was...|   0|    Refigerator|
|  Andre|[Refigerator, Was...|   1|Washing Machine|
|Assogba|[TV, Refigerator,...|   0|             TV|
|Assogba|[TV, Refigerator,...|   1|    Refigerator|
|Assogba|[TV, Refigerator,...|   2|       Computer|
| XXXXXX|                NULL|NULL|           NULL|
+-------+--------------------+----+---------------+

+-------+--------------------+----+----------------+--------------------+
|   name|               Brand| pos|             key|               value|
+-------+--------------------+----+----------------+--------------------+
|   EEEE|                NULL|NULL|            NU