### Create a DataFrame using a 2D list

In [0]:
# Evaluate the `spark` object as the cell's last expression so the notebook
# renders it; the cells below use it for createDataFrame / read.json.
spark

In [0]:
# Each inner list is one record, laid out as [name, age, email].
data_2d_list = [
    ["Alice", 30, "Alice@gmail.com"],
    ["Bob", 25, "Bob@hotmail.com"],
    ["Charlie", 35, "Charlie@gmail.com"],
    ["Jenie", 25, "Jenie@hotmail.com"],
]

# Only column names are supplied here (no types), passed as the schema argument.
df = spark.createDataFrame(data_2d_list, ["name", "age", "email"])
display(df)

In [0]:
# Print the schema of the most recently assigned `df`.
df.printSchema()

### Create a DataFrame using a list of dictionaries

In [0]:
# One dictionary per record; the keys ("name", "age") act as the column names.
data_dict = [
    dict(name="Alice", age=30),
    dict(name="Bob", age=25),
    dict(name="Charlie", age=35),
]

# No schema is passed, so it is derived from the dictionaries themselves.
df = spark.createDataFrame(data=data_dict)
display(df)

### Create a DataFrame using Row

**Row** refers to the `pyspark.sql.Row` class, which is a fundamental data structure representing a single row of data within a DataFrame. It is an _immutable_, dynamically typed object containing a set of key-value pairs, where the keys correspond to the column names in the DataFrame. 

In [0]:
from pyspark.sql import Row

# Build one Row per (name, age) pair; the keyword names become the fields.
data = [
    Row(name=person, age=years)
    for person, years in (("Alice", 30), ("Bob", 25), ("Charlie", 35))
]

# No schema argument is passed: the Row objects carry the field names.
df = spark.createDataFrame(data=data)
display(df)

In [0]:
# Column-oriented input: each key maps to the list of values for that column.
data_dict = {"name": ["George", "Hannah", "Ian"], "age": [22, 29, 34]}

# Pair the two column lists element by element to get one Row per record.
rows = [
    Row(name=person, age=years)
    for person, years in zip(data_dict["name"], data_dict["age"])
]

df_dict = spark.createDataFrame(data=rows)
display(df_dict)

### Create a DataFrame with an explicit schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import Row

# Records to load, one Row per (name, age) pair.
data = [
    Row(name=person, age=years)
    for person, years in (("Alice", 30), ("Bob", 25), ("Charlie", 35), ("Jenie", 25))
]

# Explicit schema: a nullable string column "name" and a nullable integer
# column "age", built field by field.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
)

In [0]:
# Apply the explicit `schema` to `data`.
df = spark.createDataFrame(data, schema=schema)

# Show the resulting schema, then the rows themselves.
df.printSchema()
display(df)

### Create a DataFrame from a NumPy array

In [0]:
import numpy as np

# A NumPy array has a single dtype, so mixing ints and strings stores every
# element (including the ids) as a string.
data = np.array([[1, "Alice"], [2, "Bob"], [3, "Charlie"]])

# Convert each array row back to plain Python values: id -> int, name -> str.
data_list = [(int(id_value), str(name_value)) for id_value, name_value in data]

df = spark.createDataFrame(data_list, ["id", "name"])

display(df)

In [0]:
# Create a DataFrame from an in-memory JSON string.
# `json_data` holds a JSON array of objects, each with "name" and "age" keys.
json_data = '''
[
    {"name": "Alice", "age": 30},
    {"name": "Bob", "age": 25},
    {"name": "Charlie", "age": 35},
    {"name": "Jenie", "age": 25}
]
'''

# The whole JSON document is wrapped in a one-element RDD of strings and
# handed to the JSON reader. The SparkContext is taken from `spark` instead
# of the implicit `sc` global, which this notebook never defines; every
# other cell already depends only on `spark`.
df_json = spark.read.json(spark.sparkContext.parallelize([json_data]))

display(df_json)