In [0]:

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Build the notebook's Spark session on a local master; getOrCreate() hands
# back an already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Data Modelling on AWS Databricks with Spark')
    .getOrCreate()
)

# Raw employee records, one tuple per employee, in the column order used by
# schema_employee: first name, last name, employee id, gender, age,
# salary (USD), e-mail address, years of experience, phone number.
employee_data2 = [
    ("Emmanuel", "Oyekanlu", "6111876", "M", 33, 237000, "manuelbomi@yahoo.com", 11, "640-137-0000"),
    ("Don", "Coder", "387654", "M", 30, 210000, "python-Coder@gmail.com", 12, "540-543-3245"),
    ("Henry", "Charles", "3000127", "M", 42, 210000, "massie@yahoo.com", 7, "127-196-5676"),
    ("Stephen", "Smith", "9087655", "M", 38, 156000, "miutss@karen.com", 5, "234-112-9812"),
    ("Rose", "CarlyWiggle", "5609876", "F", 24, 237000, "iutyrr@yahoo.com", 18, "876-137-0119"),
    ("Diddier", "Thomas", "6347652", "M", 53, 237000, "potyur@yahoo.com", 19, "653-239-9876"),
    ("Carla", "Fisher", "9871234", "F", 28, 121000, "tuyinmg@yahoo.com", 3, "555-666-9876"),
    ("Yinka", "Eromonsele", "547863", "F", 29, 99500, "eromonsele@yahoo.com", 12, "652-653-0987"),
    ("Rod", "BiggerStewart", "698328", "M", 54, 76500, "BiggerS@yahoo.com", 12, "640-137-0000"),
    ("Oliver", "Twist", "7652423", "M", 33, 200000, "Twister@yahoo.com", 7, "764-129-9009"),
    ("Moses", "Aaron", "9876543", "M", 23, 186000, "Moses@cnn.com", 6, "456-987-2324"),
    ("Molly", "Van Modeller", "6487653", "F", 39, 232000, "preacher@yahoo.com", 8, "765-986-2345"),
    ("Barry", "TightFisted", "7864556", "M", 38, 115000, "boxer@yahoo.com", 5, "569-432-7654"),
    ("Ken", "Chang", "9845376", "M", 26, 105890, "bongbonyahoo.com", 10, "434-987-1200"),
    ("Alhaji", "Kareem", "87565234", "M", 44, 65000, "uytrew@yahoo.com", 9, "543-210-3400"),
    ("Islam", "Aboubacar", "8719865", "M", 32, 186100, "westerm@yahoo.com", 4, "540-872-1000"),
    ("Meghan", "Markle", "7645348", "M", 44, 91000, "Missyr@yahoo.com", 2, "569-349-1200"),
]


# Column layout for the employee DataFrames. Every field is nullable; only the
# salary is declared as an integer, all other columns are strings.
# NOTE(review): Age and Experience_(yrs) are supplied as Python ints in
# employee_data2 but declared StringType here — confirm that is intended.
_employee_fields = [
    ("First_name", StringType()),
    ("Last_name", StringType()),
    ("Employee_ID", StringType()),
    ("Gender", StringType()),
    ("Age", StringType()),
    ("Salary_(USD)", IntegerType()),
    ("email_address", StringType()),
    ("Experience_(yrs)", StringType()),
    ("Phone_nos", StringType()),
]
schema_employee = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _employee_fields]
)

# Build the DataFrame from the raw tuples and print the resulting schema.
df_employee = spark.createDataFrame(employee_data2, schema_employee)
df_employee.printSchema()

root
 |-- First_name: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- Employee_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Salary_(USD): integer (nullable = true)
 |-- email_address: string (nullable = true)
 |-- Experience_(yrs): string (nullable = true)
 |-- Phone_nos: string (nullable = true)



Creating Spark Dataframes
PySpark DataFrames are generally preferred over RDDs for most data processing tasks because they offer a high-level abstraction, query optimization, a structured API, schema inference and enforcement, integration with Spark SQL, and support for a wide variety of data sources.


Using CreateOrReplaceTempView

One of the main advantages of Apache PySpark is working with SQL along with DataFrame/Dataset API. So, if an engineer is comfortable with SQL, he/she can create a temporary view on DataFrame/Dataset by using createOrReplaceTempView() and using SQL to select and manipulate the data.

createOrReplaceTempView creates (or replaces, if that view name already exists) a lazily evaluated "view" that you can then use like a Hive table in Spark SQL. It does not persist to memory unless you cache the dataset that underpins the view.


In [0]:
# Register df_employee under a view name that Spark SQL queries can reference;
# a view already registered under this name is replaced.
df_employee.createOrReplaceTempView(name="employee_data_table_name")


In [0]:
# List the column names exposed by the temp view
view_columns_df = spark.sql("show columns from employee_data_table_name")
view_columns_df.display()

col_name
First_name
Last_name
Employee_ID
Gender
Age
Salary_(USD)
email_address
Experience_(yrs)
Phone_nos


In [0]:

# Query the temp view with plain SQL, projecting only the two name columns
employee_names_df = spark.sql("select First_Name, Last_Name from employee_data_table_name")
employee_names_df.display()


First_Name,Last_Name
Emmanuel,Oyekanlu
Don,Coder
Henry,Charles
Stephen,Smith
Rose,CarlyWiggle
Diddier,Thomas
Carla,Fisher
Yinka,Eromonsele
Rod,BiggerStewart
Oliver,Twist


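If you want a temporary view that is shared among all sessions and kept alive until the PySpark application terminates, you can create a global temporary view using createGlobalTempView(). Global temporary views are registered in the system database global_temp, so queries must reference them with that qualifier (for example, global_temp.my_view).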

In [0]:
# Create a global temp view and read some data from it
# (global temp views live in the `global_temp` database, so the query must use that prefix)
# df_employee.createGlobalTempView("employee_GlobalViewdata_table_name")
# spark.sql("select Last_Name, Age from global_temp.employee_GlobalViewdata_table_name").display()


PySpark cache()

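Using the PySpark cache() method we can cache the results of transformations. Unlike persist(), cache() takes no arguments: it always uses the default storage level. For DataFrames that default is MEMORY_AND_DISK (MEMORY_AND_DISK_DESER from Spark 3.0 onward), so cached partitions that do not fit in memory spill to disk; only for RDDs is cache() equivalent to persist() with storage level MEMORY_ONLY.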


# Syntax
DataFrame.cache()


In [0]:
# Keep only the male employees and mark the result for caching.
# The column is addressed through the DataFrame itself so that this cell does
# not rely on `col`, which is only imported much further down the notebook.
df_employee_male_only = df_employee.where(df_employee["Gender"] == "M").cache()

# cache() is lazy: this count() is the first action on the cached DataFrame,
# so it is also what materialises the cached rows.
count_male = df_employee_male_only.count()
count_male



Out[94]: 13

In [0]:
# Narrow the cached male-employee rows to those earning at least 100,000 USD.
# The salary column is addressed through the DataFrame so that this cell does
# not rely on `col`, which is only imported much further down the notebook.
# Note that this rebinds df_employee_male_only to the filtered result.
df_employee_male_only = df_employee_male_only.where(
    df_employee_male_only["Salary_(USD)"] >= 100000
)
count_salary = df_employee_male_only.count()
count_salary

Out[95]: 10

In [0]:
# Render the rows currently bound to df_employee_male_only as a table
df_employee_male_only.display()

First_name,Last_name,Employee_ID,Gender,Age,Salary_(USD),email_address,Experience_(yrs),Phone_nos
Emmanuel,Oyekanlu,6111876,M,33,237000,manuelbomi@yahoo.com,11,640-137-0000
Don,Coder,387654,M,30,210000,python-Coder@gmail.com,12,540-543-3245
Henry,Charles,3000127,M,42,210000,massie@yahoo.com,7,127-196-5676
Stephen,Smith,9087655,M,38,156000,miutss@karen.com,5,234-112-9812
Diddier,Thomas,6347652,M,53,237000,potyur@yahoo.com,19,653-239-9876
Oliver,Twist,7652423,M,33,200000,Twister@yahoo.com,7,764-129-9009
Moses,Aaron,9876543,M,23,186000,Moses@cnn.com,6,456-987-2324
Barry,TightFisted,7864556,M,38,115000,boxer@yahoo.com,5,569-432-7654
Ken,Chang,9845376,M,26,105890,bongbonyahoo.com,10,434-987-1200
Islam,Aboubacar,8719865,M,32,186100,westerm@yahoo.com,4,540-872-1000


In [0]:
# A second, smaller batch of employee records, used below as the other side of
# the join examples. The raw tuples get their own name so that
# `another_employee_data` always refers to the DataFrame, never to the list.
another_employee_rows = [
    ("Casey", "Donner", "1876", "M", 44, 237000, "maryl@yahoo.com", 2, "819-137-060"),
    ("MariGold", "Joelyn", "387654", "F", 32, 219870, "eryl@gmail.com", 34, "540-543-3785"),
    ("Katie", "Henshaw", "304127", "F", 39, 20340, "Kahenshaw@yahoo.com", 5, "497-196-476"),
    ("Derrick", "Smith", "87655", "M", 38, 100000, "gh@karen.com", 5, "674-112-962"),
    ("Debbie", "Aaron", "5667876", "F", 27, 115000, "morel@yahoo.com", 34, "496-47-7919"),
    ("Rex", "Migler", "6647652", "M", 23, 567000, "pouhgr@yahoo.com", 56, "883-239-7776"),
]

# Both datasets share one column layout, so schema_employee from the first
# cell is reused here instead of being declared a second time.
another_employee_data = spark.createDataFrame(data=another_employee_rows, schema=schema_employee)
another_employee_data.printSchema()

root
 |-- First_name: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- Employee_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Salary_(USD): integer (nullable = true)
 |-- email_address: string (nullable = true)
 |-- Experience_(yrs): string (nullable = true)
 |-- Phone_nos: string (nullable = true)



In [0]:
# Render the second employee dataset as a table
another_employee_data.display()

First_name,Last_name,Employee_ID,Gender,Age,Salary_(USD),email_address,Experience_(yrs),Phone_nos
Casey,Donner,1876,M,44,237000,maryl@yahoo.com,2,819-137-060
MariGold,Joelyn,387654,F,32,219870,eryl@gmail.com,34,540-543-3785
Katie,Henshaw,304127,F,39,20340,Kahenshaw@yahoo.com,5,497-196-476
Derrick,Smith,87655,M,38,100000,gh@karen.com,5,674-112-962
Debbie,Aaron,5667876,F,27,115000,morel@yahoo.com,34,496-47-7919
Rex,Migler,6647652,M,23,567000,pouhgr@yahoo.com,56,883-239-7776


In [0]:
# Print the employee table as text; truncate=False keeps long values such as
# e-mail addresses from being cut off
df_employee.show(truncate=False)

+----------+-------------+-----------+------+---+------------+----------------------+----------------+------------+
|First_name|Last_name    |Employee_ID|Gender|Age|Salary_(USD)|email_address         |Experience_(yrs)|Phone_nos   |
+----------+-------------+-----------+------+---+------------+----------------------+----------------+------------+
|Emmanuel  |Oyekanlu     |6111876    |M     |33 |237000      |manuelbomi@yahoo.com  |11              |640-137-0000|
|Don       |Coder        |387654     |M     |30 |210000      |python-Coder@gmail.com|12              |540-543-3245|
|Henry     |Charles      |3000127    |M     |42 |210000      |massie@yahoo.com      |7               |127-196-5676|
|Stephen   |Smith        |9087655    |M     |38 |156000      |miutss@karen.com      |5               |234-112-9812|
|Rose      |CarlyWiggle  |5609876    |F     |24 |237000      |iutyrr@yahoo.com      |18              |876-137-0119|
|Diddier   |Thomas       |6347652    |M     |53 |237000      |potyur@yah

Using User-Defined Functions (UDFs)

In [0]:
## Using User Defined Function
# Define some arbitrary function that will be used to encode a new column
# from pyspark.sql.functions import udf

def encode_employee_gender_age(gender, age):
    """Map an employee's gender (and, failing that, age) to an arbitrary integer code.

    Args:
        gender: Gender marker; "M" and "F" are the recognised values.
        age: Employee age as a number or a numeric string. The Age column is
            declared as a string column, so Spark hands the value over as text.

    Returns:
        20 for "M", 30 for "F"; for any other gender, 40 when the age is below
        20 and 0 otherwise. An age that is missing or not numeric yields 0.
    """
    if gender == "M":
        return 20
    if gender == "F":
        return 30

    # Coerce before comparing: comparing a str (or None) with an int raises
    # TypeError in Python 3, which would fail the whole Spark job.
    try:
        numeric_age = float(age)
    except (TypeError, ValueError):
        return 0

    return 40 if numeric_age < 20 else 0
# Wrap the plain Python function as a Spark UDF that produces an integer column.
# `udf` comes from pyspark.sql.functions (imported with the other pyspark names
# at the top of the notebook).
function_with_udf = udf(f=encode_employee_gender_age, returnType=IntegerType())

# Derive the new column from each row's Gender and Age values
updated_employee_data = df_employee.withColumn(
    "another_column_using_udf",
    function_with_udf(df_employee["Gender"], df_employee["Age"]),
)

updated_employee_data.display()

First_name,Last_name,Employee_ID,Gender,Age,Salary_(USD),email_address,Experience_(yrs),Phone_nos,another_column_using_udf
Emmanuel,Oyekanlu,6111876,M,33,237000,manuelbomi@yahoo.com,11,640-137-0000,20
Don,Coder,387654,M,30,210000,python-Coder@gmail.com,12,540-543-3245,20
Henry,Charles,3000127,M,42,210000,massie@yahoo.com,7,127-196-5676,20
Stephen,Smith,9087655,M,38,156000,miutss@karen.com,5,234-112-9812,20
Rose,CarlyWiggle,5609876,F,24,237000,iutyrr@yahoo.com,18,876-137-0119,30
Diddier,Thomas,6347652,M,53,237000,potyur@yahoo.com,19,653-239-9876,20
Carla,Fisher,9871234,F,28,121000,tuyinmg@yahoo.com,3,555-666-9876,30
Yinka,Eromonsele,547863,F,29,99500,eromonsele@yahoo.com,12,652-653-0987,30
Rod,BiggerStewart,698328,M,54,76500,BiggerS@yahoo.com,12,640-137-0000,20
Oliver,Twist,7652423,M,33,200000,Twister@yahoo.com,7,764-129-9009,20


Sorting, Ordering and Joining Data

In [0]:
# Sort the employees by age, youngest first.
# Age is stored as a string column, so it is cast to int for the sort key:
# ordering the raw strings would compare them character by character
# (for example "9" would be placed after "54").
employee_data_sorted_df = df_employee.orderBy(df_employee["Age"].cast("int"))
employee_data_sorted_df.display()

First_name,Last_name,Employee_ID,Gender,Age,Salary_(USD),email_address,Experience_(yrs),Phone_nos
Moses,Aaron,9876543,M,23,186000,Moses@cnn.com,6,456-987-2324
Rose,CarlyWiggle,5609876,F,24,237000,iutyrr@yahoo.com,18,876-137-0119
Ken,Chang,9845376,M,26,105890,bongbonyahoo.com,10,434-987-1200
Carla,Fisher,9871234,F,28,121000,tuyinmg@yahoo.com,3,555-666-9876
Yinka,Eromonsele,547863,F,29,99500,eromonsele@yahoo.com,12,652-653-0987
Don,Coder,387654,M,30,210000,python-Coder@gmail.com,12,540-543-3245
Islam,Aboubacar,8719865,M,32,186100,westerm@yahoo.com,4,540-872-1000
Emmanuel,Oyekanlu,6111876,M,33,237000,manuelbomi@yahoo.com,11,640-137-0000
Oliver,Twist,7652423,M,33,200000,Twister@yahoo.com,7,764-129-9009
Barry,TightFisted,7864556,M,38,115000,boxer@yahoo.com,5,569-432-7654


In [0]:
# Inner join of the two employee datasets on their shared "Age" column:
# only ages present on both sides produce rows.
inner_join_df = df_employee.join(another_employee_data, on="Age", how="inner")
inner_join_df.display()

Age,First_name,Last_name,Employee_ID,Gender,Salary_(USD),email_address,Experience_(yrs),Phone_nos,First_name.1,Last_name.1,Employee_ID.1,Gender.1,Salary_(USD).1,email_address.1,Experience_(yrs).1,Phone_nos.1
23,Moses,Aaron,9876543,M,186000,Moses@cnn.com,6,456-987-2324,Rex,Migler,6647652,M,567000,pouhgr@yahoo.com,56,883-239-7776
32,Islam,Aboubacar,8719865,M,186100,westerm@yahoo.com,4,540-872-1000,MariGold,Joelyn,387654,F,219870,eryl@gmail.com,34,540-543-3785
38,Stephen,Smith,9087655,M,156000,miutss@karen.com,5,234-112-9812,Derrick,Smith,87655,M,100000,gh@karen.com,5,674-112-962
38,Barry,TightFisted,7864556,M,115000,boxer@yahoo.com,5,569-432-7654,Derrick,Smith,87655,M,100000,gh@karen.com,5,674-112-962
39,Molly,Van Modeller,6487653,F,232000,preacher@yahoo.com,8,765-986-2345,Katie,Henshaw,304127,F,20340,Kahenshaw@yahoo.com,5,497-196-476
44,Alhaji,Kareem,87565234,M,65000,uytrew@yahoo.com,9,543-210-3400,Casey,Donner,1876,M,237000,maryl@yahoo.com,2,819-137-060
44,Meghan,Markle,7645348,M,91000,Missyr@yahoo.com,2,569-349-1200,Casey,Donner,1876,M,237000,maryl@yahoo.com,2,819-137-060


In [0]:
# Right outer join on the last name: every row of another_employee_data is
# kept, and the df_employee columns are null where no last name matches.
# The key is spelled exactly as in the schema ("Last_name") so the join does
# not depend on Spark's case-insensitive column resolution.
right_outer_join_df = df_employee.join(another_employee_data, 'Last_name', 'rightouter')
right_outer_join_df.display()

Last_name,First_name,Employee_ID,Gender,Age,Salary_(USD),email_address,Experience_(yrs),Phone_nos,First_name.1,Employee_ID.1,Gender.1,Age.1,Salary_(USD).1,email_address.1,Experience_(yrs).1,Phone_nos.1
Donner,,,,,,,,,Casey,1876,M,44,237000,maryl@yahoo.com,2,819-137-060
Joelyn,,,,,,,,,MariGold,387654,F,32,219870,eryl@gmail.com,34,540-543-3785
Henshaw,,,,,,,,,Katie,304127,F,39,20340,Kahenshaw@yahoo.com,5,497-196-476
Smith,Stephen,9087655.0,M,38.0,156000.0,miutss@karen.com,5.0,234-112-9812,Derrick,87655,M,38,100000,gh@karen.com,5,674-112-962
Aaron,Moses,9876543.0,M,23.0,186000.0,Moses@cnn.com,6.0,456-987-2324,Debbie,5667876,F,27,115000,morel@yahoo.com,34,496-47-7919
Migler,,,,,,,,,Rex,6647652,M,23,567000,pouhgr@yahoo.com,56,883-239-7776


In [0]:
# Left outer join on the last name: every row of df_employee is kept, and the
# another_employee_data columns are null where no last name matches.
# The key is spelled exactly as in the schema ("Last_name") so the join does
# not depend on Spark's case-insensitive column resolution.
left_outer_join_df = df_employee.join(another_employee_data, 'Last_name', 'leftouter')
left_outer_join_df.display()

Last_name,First_name,Employee_ID,Gender,Age,Salary_(USD),email_address,Experience_(yrs),Phone_nos,First_name.1,Employee_ID.1,Gender.1,Age.1,Salary_(USD).1,email_address.1,Experience_(yrs).1,Phone_nos.1
Coder,Don,387654,M,30,210000,python-Coder@gmail.com,12,540-543-3245,,,,,,,,
Oyekanlu,Emmanuel,6111876,M,33,237000,manuelbomi@yahoo.com,11,640-137-0000,,,,,,,,
Smith,Stephen,9087655,M,38,156000,miutss@karen.com,5,234-112-9812,Derrick,87655.0,M,38.0,100000.0,gh@karen.com,5.0,674-112-962
Charles,Henry,3000127,M,42,210000,massie@yahoo.com,7,127-196-5676,,,,,,,,
CarlyWiggle,Rose,5609876,F,24,237000,iutyrr@yahoo.com,18,876-137-0119,,,,,,,,
Thomas,Diddier,6347652,M,53,237000,potyur@yahoo.com,19,653-239-9876,,,,,,,,
Fisher,Carla,9871234,F,28,121000,tuyinmg@yahoo.com,3,555-666-9876,,,,,,,,
Eromonsele,Yinka,547863,F,29,99500,eromonsele@yahoo.com,12,652-653-0987,,,,,,,,
BiggerStewart,Rod,698328,M,54,76500,BiggerS@yahoo.com,12,640-137-0000,,,,,,,,
Twist,Oliver,7652423,M,33,200000,Twister@yahoo.com,7,764-129-9009,,,,,,,,


Collating/Imploding Multiple Data Columns into a Single Column

In [0]:
# Convert columns to a map:
# merge the salary, experience and age columns into a single map column named
# "EmployeesDetails", then drop the three source columns.
from pyspark.sql.functions import col,lit,create_map

_detail_columns = ("Salary_(USD)", "Experience_(yrs)", "Age")

# df_employee is replaced by the reshaped DataFrame, so the transformation is
# guarded: once the source columns have been dropped, running this cell again
# would otherwise fail because they can no longer be resolved.
if "EmployeesDetails" not in df_employee.columns:
    df_employee = df_employee.withColumn(
        "EmployeesDetails",
        create_map(
            lit("salary"), col("Salary_(USD)"),
            lit("experience"), col("Experience_(yrs)"),
            lit("age"), col("Age"),
        ),
    ).drop(*_detail_columns)
df_employee.printSchema()



root
 |-- First_name: string (nullable = true)
 |-- Last_name: string (nullable = true)
 |-- Employee_ID: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- email_address: string (nullable = true)
 |-- Phone_nos: string (nullable = true)
 |-- EmployeesDetails: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [0]:
# Print the current df_employee rows as text; truncate=False keeps long values
# from being cut off
df_employee.show(truncate=False)

+----------+-------------+-----------+------+----------------------+------------+-----------------------------------------------+
|First_name|Last_name    |Employee_ID|Gender|email_address         |Phone_nos   |EmployeesDetails                               |
+----------+-------------+-----------+------+----------------------+------------+-----------------------------------------------+
|Emmanuel  |Oyekanlu     |6111876    |M     |manuelbomi@yahoo.com  |640-137-0000|{salary -> 237000, experience -> 11, age -> 33}|
|Don       |Coder        |387654     |M     |python-Coder@gmail.com|540-543-3245|{salary -> 210000, experience -> 12, age -> 30}|
|Henry     |Charles      |3000127    |M     |massie@yahoo.com      |127-196-5676|{salary -> 210000, experience -> 7, age -> 42} |
|Stephen   |Smith        |9087655    |M     |miutss@karen.com      |234-112-9812|{salary -> 156000, experience -> 5, age -> 38} |
|Rose      |CarlyWiggle  |5609876    |F     |iutyrr@yahoo.com      |876-137-0119|{salary -

In [0]:
# spark.stop()



In [0]:
#### Other data
# import pyspark
# from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# spark = SparkSession.builder.master("local[1]") \
#                     .appName('Data_Modelling_on_AWS_Databricks_with_Spark') \
#                     .getOrCreate()

# employee_data2 = [("Emmanuel", "Oyekanlu", "6111876", "M", 33, "330 CI Way, Kentucky, KI 7634", 237000, "Married", "manuelbomi@yahoo.com", 11, "640-137-0000"),
#                   ("Don", "Coder", "387654", "M", 30, "118 College Drive, Kentucky, KI 76542", 210000, "Single", "python-Coder@gmail.com", 12, "540-543-3245"), 
#                   ("Henry", "Charles", "3000127", "M", 42, "54 College Drive, Kentucky, KI 76542", 210000, "Single", "massie@yahoo.com", 7, "127-196-5676"),
#                   ("Stephen", "Smith", "9087655", "M", 38, "2204 Market Lane, Kentucky, KI 9087", 156000, "Relationship", "miutss@karen.com", 5, "234-112-9812"),
#                   ("Rose", "CarlyWiggle", "5609876", "F", 24, "1902 Washington Way, Kentucky, KI 5467", 237000, "Single", "iutyrr@yahoo.com", 18, "876-137-0119"),
#                   ("Diddier", "Thomas", "6347652", "M", 53, "184 College Drive, Kentucky, KI 76542", 237000, "Married", "potyur@yahoo.com", 19, "653-239-9876"),
#                   ("Carla", "Fisher", "9871234", "F", 28, "184 CI Way, Kentucky, KI 7634", 121000, "Single", "tuyinmg@yahoo.com", 3, "555-666-9876"),
#                   ("Yinka", "Eromonsele", "547863", "F", 29, "660 CI Way, Kentucky, KI 7634", 99500, "Married", "eromonsele@yahoo.com", 12, "652-653-0987"),
#                   ("Rod", "BiggerStewart", "698328", "M", 54, "12 West Pine Stree, Kentucky, KI 7645", 76500, "Relationship", "BiggerS@yahoo.com", 12, "640-137-0000"),
#                   ("Oliver", "Twist", "7652423", "M", 33, "22 West Pine Stree, Kentucky, KI 7645", 200000, "Married", "Twister@yahoo.com", 7, "764-129-9009"),
#                   ("Moses", "Aaron", "9876543", "M", 23, "1006 East Pine Stree, Kentucky, KI 7645", 186000, "Relationship", "Moses@cnn.com", 6, "456-987-2324"),
#                   ("Molly", "Van Modeller", "6487653", "F", 39, "2987 Market Lane, Kentucky, KI 9087", 232000, "Married", "preacher@yahoo.com", 8, "765-986-2345"),
#                   ("Barry", "TightFisted", "7864556", "M", 38, "187 College Drive, Kentucky, KI 76542", 115000, "Single", "boxer@yahoo.com", 5, "569-432-7654"),
#                   ("Ken", "Chang", "9845376", "M", 26, "61 Horseheads Dr, Kentucky, KI 78759", 105890, "Married", "bongbonyahoo.com", 10, "434-987-1200"),
#                   ("Alhaji", "Kareem", "87565234", "M", 44, "I8 Painted Post Dr, Kentucky, KI 7634", 65000, "Relationship", "uytrew@yahoo.com", 9, "543-210-3400"),
#                   ("Islam", "Aboubacar", "8719865", "M", 32, "Vero Candy Way, Kentucky, KI 7634", 186100, "Single", "westerm@yahoo.com", 4, "540-872-1000"),
#                   ("Meghan", "Markle", "7645348", "M", 44, "1007 CI Way, Kentucky, KI 7634", 91000, "Married", "Missyr@yahoo.com", 2, "569-349-1200")
#                   ]


# schema_employee = StructType([ \
#     StructField("First_name",    StringType(), True), \
#     StructField("Last_name",     StringType(), True), \
#     StructField("Employee_ID",   StringType(), True), \
#     StructField("Gender",        StringType(), True), \
#     StructField("Age",           StringType(), True), \
#     StructField("Address",       StringType(), True), \
#     StructField("Salary (USD)",     IntegerType(), True), \
#     StructField("Marital_Status",   StringType(), True), \
#     StructField("email_address",        StringType(), True), \
#     StructField("Years_of_experience",  StringType(), True), \
#     StructField("Phone_nos",           StringType(), True), \
                                                                                      
  
   

#   ])

# df_employee = spark.createDataFrame(data=employee_data2 ,schema=schema_employee)
# df_employee.printSchema()

