In [9]:
import warnings
warnings.filterwarnings("ignore")

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, ArrayType, Row

In [4]:
# Obtain the notebook's SparkSession: reuse an already-running session if one
# exists, otherwise start a new one under the given application name.
spark = (
    SparkSession.builder
    .appName("sparksql-tutorial")
    .getOrCreate()
)

In [7]:
# Explicit schema for the employee DataFrame.
# Every column is declared non-nullable; "Technologies" is an array of strings.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("FirstName", StringType()),
        ("LastName", StringType()),
        ("Age", IntegerType()),
        ("Place", StringType()),
        ("Salary", LongType()),
        ("Department", StringType()),
        ("Technologies", ArrayType(StringType())),
    )
])

In [10]:
# Sample employee records. Values are positional and must follow the column
# order of `schema`: FirstName, LastName, Age, Place, Salary, Department,
# Technologies.
rows = [
    Row(*record)
    for record in (
        ("Pavan", "Mantha", 36, "Hyderabad", 273567, "SPS",
         ["java", "spring boot", "data science", "react", "node"]),
        ("Arun", "Boppudi", 36, "Guntur", 303567, "Aero",
         ["java", "spring boot", "cloud", "react", "node"]),
        ("Ravi", "Vadlamani", 26, "Visakapatnam", 213567, "Aero",
         ["express", "data structures", "react"]),
        ("Mahender", "M", 21, "Hyderabad", 153567, "Aero",
         ["java", "spring boot", "express", "react", "node"]),
        ("Manoj", "Manoj", 21, "Guntur", 183567, "Aero",
         ["express", "react"]),
        ("Manoj", "Velecheti", 21, "Visakapatnam", 223567, "Aero",
         ["java", "spring boot", "express", "react"]),
    )
]

In [11]:
# Turn the local list of Row objects into an RDD; the number of partitions is
# left to Spark's default (numSlices=None).
parallel_rows = spark.sparkContext.parallelize(rows, numSlices=None)

In [13]:
# Build the DataFrame from the RDD using the explicit schema.
# verifySchema=True asks Spark to check every row against the declared types.
df = spark.createDataFrame(
    parallel_rows,
    schema=schema,
    verifySchema=True,
)

In [14]:
# Print the first 20 rows as a text table; long cell values (such as the
# Technologies array) are truncated for display.
df.show(n=20, truncate=True)

                                                                                

+---------+---------+---+------------+------+----------+--------------------+
|FirstName| LastName|Age|       Place|Salary|Department|        Technologies|
+---------+---------+---+------------+------+----------+--------------------+
|    Pavan|   Mantha| 36|   Hyderabad|273567|       SPS|[java, spring boo...|
|     Arun|  Boppudi| 36|      Guntur|303567|      Aero|[java, spring boo...|
|     Ravi|Vadlamani| 26|Visakapatnam|213567|      Aero|[express, data st...|
| Mahender|        M| 21|   Hyderabad|153567|      Aero|[java, spring boo...|
|    Manoj|    Manoj| 21|      Guntur|183567|      Aero|    [express, react]|
|    Manoj|Velecheti| 21|Visakapatnam|223567|      Aero|[java, spring boo...|
+---------+---------+---+------------+------+----------+--------------------+



In [15]:
# Print the DataFrame's schema as a tree: one line per column with its type
# and nullability, plus the element type for array columns.
df.printSchema()

root
 |-- FirstName: string (nullable = false)
 |-- LastName: string (nullable = false)
 |-- Age: integer (nullable = false)
 |-- Place: string (nullable = false)
 |-- Salary: long (nullable = false)
 |-- Department: string (nullable = false)
 |-- Technologies: array (nullable = false)
 |    |-- element: string (containsNull = true)



## END