### What is Array Zip

![](Images/36/36 Array Zip.jpg)

### Example

![](Images/36/36 Example.jpg)

### Use Case to Flatten Data

![](Images/36/36 Use Case.jpg)

### Create Sample Dataframe

In [0]:
# Sample records as (Name, Score_1, Score_2) tuples. Each name appears several
# times so that grouping by name later yields multi-element arrays.
array_data = [
    ("John", 4, 1), ("John", 6, 2), ("David", 7, 3),
    ("Mike", 3, 4), ("David", 5, 2), ("John", 7, 3),
    ("John", 9, 7), ("David", 1, 8), ("David", 4, 9),
    ("David", 7, 4), ("Mike", 8, 5), ("Mike", 5, 2),
    ("Mike", 3, 8), ("John", 2, 7), ("David", 1, 9),
]

# Column names, in the same order as the tuple fields above.
array_schema = ["Name", "Score_1", "Score_2"]

arrayDF = spark.createDataFrame(array_data, array_schema)
display(arrayDF)

### Convert Sample Dataframe to Array Dataframe

In [0]:
from pyspark.sql import functions as F

# Collapse the rows to one per Name, gathering each score column into an array.
masterDF = arrayDF.groupBy("Name").agg(
    F.collect_list("Score_1").alias('Array_Score_1'),
    F.collect_list("Score_2").alias('Array_Score_2'),
)

display(masterDF)
masterDF.printSchema()

### Apply arrays_zip function on Array DataFrame

In [0]:
# Merge the two score arrays position by position into a single array of
# structs, stored in a new column alongside the original arrays.
arr_zip_df = masterDF.withColumn(
    "Zipperd_Value",
    F.arrays_zip("Array_Score_1", "Array_Score_2"),
)

display(arr_zip_df)
# Plain-text view of up to 10 rows, with column truncation turned off.
arr_zip_df.show(10, truncate=False)

### Practical Use Case to Flatten Data using arrays_zip and explode

### Create Sample Dataframe

In [0]:
# Nested sample input: one (department, employees) pair per department, where
# employees is a list of per-employee records. Every attribute value is a string.
# NOTE: despite its name, empDF is a plain Python list; df_brand is the DataFrame.
empDF = [
    ('Sales_Dept', [
        dict(Emp_Name='John', Salary='1000', Years_of_Service='10', Age='33'),
        dict(Emp_Name='David', Salary='2000', Years_of_Service='15', Age='40'),
        dict(Emp_Name='Nancy', Salary='8000', Years_of_Service='20', Age='45'),
        dict(Emp_Name='Mike', Salary='3000', Years_of_Service='6', Age='30'),
        dict(Emp_Name='Rosy', Salary='6000', Years_of_Service='8', Age='32'),
    ]),
    ('HR_Dept', [
        dict(Emp_Name='Edwin', Salary='6000', Years_of_Service='8', Age='31'),
        dict(Emp_Name='Thomas', Salary='3000', Years_of_Service='4', Age='26'),
        dict(Emp_Name='Sarah', Salary='12000', Years_of_Service='22', Age='49'),
        dict(Emp_Name='Stella', Salary='15000', Years_of_Service='25', Age='52'),
        dict(Emp_Name='Kevin', Salary='4000', Years_of_Service='5', Age='27'),
    ]),
]

# No explicit types are given, so Spark infers them from the Python values;
# the printed schema shows what was inferred for the nested Employee column.
df_brand = spark.createDataFrame(empDF, ['Department', 'Employee'])
df_brand.printSchema()
display(df_brand)

### Apply arrays_zip

In [0]:
# Wrap each element of the Employee array in a struct (arrays_zip with a single
# input) so that the exploded rows expose it as a named field.
df_brand_zip = df_brand.withColumn(
    "Zip",
    F.arrays_zip(df_brand.Employee),
)
display(df_brand_zip)

### Apply explode

In [0]:
# Produce one output row per element of the Zip array; the element itself is
# placed in the new Explode column.
df_brand_exp = df_brand_zip.withColumn(
    "Explode",
    F.explode(df_brand_zip['Zip']),
)
display(df_brand_exp)

### Flatten Fields from Exploded List


In [0]:
# Promote each employee attribute from the exploded element to its own
# top-level column, then discard the intermediate nested columns.
df_brand_output = (
    df_brand_exp
    .withColumn('Employee_Name', df_brand_exp['Explode.Employee.Emp_Name'])
    .withColumn('Emp_Years_of_Service', df_brand_exp['Explode.Employee.Years_of_Service'])
    .withColumn('Emp_Salary', df_brand_exp['Explode.Employee.Salary'])
    .withColumn('Emp_Age', df_brand_exp['Explode.Employee.Age'])
    .drop('Explode', 'Zip', 'Employee')
)

display(df_brand_output)