# 1_How to create a PySpark DataFrame using a Python list

### Creating spark session

In [0]:
from pyspark.sql import SparkSession

# Obtain the session used by every cell below, registered under the
# application name "Spark DataFrames".
spark = (
    SparkSession.builder
    .appName("Spark DataFrames")
    .getOrCreate()
)

### 1_Create a DataFrame from a list of tuples

In [0]:
# Each tuple is one row: (id, name, age, department)
data = [
    (1, "Mustaq", 32, "ADF"),
    (2, "Ali", 30, "Databricks"),
    (3, "Ahmed", 35, "Spark"),
    (4, "Adnan", 28, "Python"),
    (5, "Ishan", 25, "Java"),
]

# Column names, in the same order as the fields of each tuple
columns = [
    "id",
    "name",
    "age",
    "department",
]


In [0]:
# Build the DataFrame from the list of tuples; `columns` supplies the
# column names and the column types are left for Spark to infer.
df = spark.createDataFrame(
    data,
    schema=columns,
)

# Render the result
display(df)

In [0]:
# Each tuple is one row: (id, name, age, department)
data = [
    (1, "Mustaq", 32, "ADF"),
    (2, "Ali", 30, "Databricks"),
    (3, "Ahmed", 35, "Spark"),
    (4, "Adnan", 28, "Python"),
    (5, "Ishan", 25, "Java"),
]

# Build the DataFrame, giving both column names and column types
# as a single DDL-formatted string.
df1 = spark.createDataFrame(
    data,
    schema='Id int, Name string, Age int, Department string',
)
display(df1)

### 2_Create DataFrame from a List of Lists
  - Use this when your list contains lists instead of tuples

In [0]:
# List of lists: each inner list is one row (id, name, age, department)
data = [[1, "Mustaq", 32, "ADF"],
        [2, "Ali", 30, "Databricks"],
        [3, "Ahmed", 35, "Spark"],
        [4, "Adnan", 28, "Python"],
        [5, "Ishan", 25, "Java"]]

# Define column names: one name per field in each row.
# Every row has four fields, so four names are required; with only three
# names the department column would be left without a meaningful name.
columns = ["id", "name", "age", "department"]

# Create DataFrame (column types are inferred from the values)
df2 = spark.createDataFrame(data, schema=columns)
display(df2)

### 3_Create a DataFrame using Dictionary

In [0]:
# One dictionary per row; the dictionary keys act as the column names.
data = [
    dict(Name='Mustaq', Id='A001', Country='India'),
    dict(Name='Ali', Id='A002', Country='USA'),
    dict(Name='Ahmed', Id='A003', Country='Canada'),
    dict(Name='Adnan', Id='A004', Country='UK'),
    dict(Name='Ishan', Id='A005', Country='Australia'),
]

# No schema argument is given: it is derived from the dictionaries.
df_dict = spark.createDataFrame(data)
display(df_dict)

### 4_Create a DataFrame from a simple list
- If your list contains a single column, you can still use createDataFrame

In [0]:
# A flat list of plain integers (no tuples): one value per row
data = [
    1, 23, 35, 45, 58,
    69, 72, 80, 91, 3,
]

# The schema is just a type name, so the DataFrame has a single int column.
df3 = spark.createDataFrame(data, schema='int')
display(df3)

In [0]:
# Rename the column called 'value' to 'Numbers'
df3 = df3.withColumnRenamed(existing='value', new='Numbers')
display(df3)

In [0]:
# A flat list of plain integers
data = [
    1, 23, 35, 45, 58,
    69, 72, 80, 91, 3,
]

# zip() over a single list yields one 1-tuple per element, i.e. one
# single-field row per number; the column is named "Numbers" up front.
df4 = spark.createDataFrame(list(zip(data)), ["Numbers"])
display(df4)

In [0]:
# Sample data: the integers 1 through 10, each wrapped in a 1-tuple (one row each)
data = [(number,) for number in range(1, 11)]

# Single-column DataFrame with the column named "Id"
df41 = spark.createDataFrame(data, schema=["Id"])
display(df41)

### 5_Create a DataFrame with an Explicit Schema
- You can define the schema explicitly using **StructType** and **StructField**

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Each tuple is one row: (ID, Name, Age, Department)
data = [
    (1, "Mustaq", 32, "ADF"),
    (2, "Ali", 30, "Databricks"),
    (3, "Ahmed", 35, "Spark"),
    (4, "Adnan", 28, "Python"),
    (5, "Ishan", 25, "Java"),
]

# Explicit schema: one StructField per column, giving its name, its type
# and whether it may hold nulls (all four columns are nullable here).
schema = StructType(
    [
        StructField("ID", IntegerType(), nullable=True),
        StructField("Name", StringType(), nullable=True),
        StructField("Age", IntegerType(), nullable=True),
        StructField("Department", StringType(), nullable=True),
    ]
)

# With an explicit schema, names and types are taken from it as written.
df_schema = spark.createDataFrame(data, schema=schema)
display(df_schema)

In [0]:
# Student records: 6 rows, each with 6 attributes
# (roll number, name, age, height, weight, city)
students = [
    ["001", "Mustaq", 23, 5.67, 67, "Chennai"],
    ["002", "Ali", 25, 3.79, 34, "Hyderabad"],
    ["003", "Ahmed", 27, 4.56, 17, "Bangalore"],
    ["004", "Adnan", 29, 3.69, 28, "Mumbai"],
    ["005", "Ishan", 23, 4.12, 54, "Delhi"],
    ["006", "Atif", 25, 3.79, 25, "Pune"],
]


In [0]:
# Schema for the student rows, written as a DDL-formatted string:
# one "column_name type" pair per line, separated by commas.

schema = "\n".join([
    "",
    "rollno string,",
    "name string,",
    "age int,",
    "height float,",
    "weight int,",
    "city string",
])

In [0]:
# Build the DataFrame from the student rows, applying the schema defined above
df_schema_string = spark.createDataFrame(
    students,
    schema=schema,
)
display(df_schema_string)

In [0]:
# Each row carries a list of marks; the sample deliberately includes an
# empty list and a list containing None.
data = [
    (1, 'Mustaq', [20, 30, 40]),
    (2, 'Ali', [30, 40, 50]),
    (3, 'Ahmed', []),
    (4, 'Adnan', [50, 60, None]),
]

# The DDL string declares Marks as an array of ints.
df_schema_def = spark.createDataFrame(
    data,
    schema="Id int, Name string, Marks array<int>",
)
display(df_schema_def)

### 6_Create a DataFrame directly from a list using Row

In [0]:
from pyspark.sql import Row

# One Row object per record; the keyword names become the column names.
data = [
    Row(
        Id=1, Name='Mustaq', Age=25,
        Department='Testing', Technology='ETL',
    ),
    Row(
        Id=2, Name='Ali', Age=23,
        Department='Automation Testing', Technology='Java Selenium',
    ),
    Row(
        Id=3, Name='Ahmed', Age=27,
        Department='Manual Testing', Technology='Python',
    ),
    Row(
        Id=4, Name='Adnan', Age=29,
        Department='Testing', Technology='Python',
    ),
    Row(
        Id=5, Name='Ishan', Age=23,
        Department='Testing', Technology='Java',
    ),
]

# No schema argument is needed: the Row objects already carry field names.
df_row = spark.createDataFrame(data)
display(df_row)

### 7_toDF()

In [0]:
# Employee rows as plain tuples; no column names are given at this point.
employees = [
    (1, "Mustaq", 32000, 20, "New York"),
    (2, "Ali", 43000, 30, "California"),
    (3, "Ahmed", 36000, 40, "Texas"),
    (4, "Adnan", 36000, 50, "Florida"),
    (5, "Ishan", 36000, 60, "New York"),
]

# Create the DataFrame first, then assign all five column names
# in a single toDF() call.
df_todf = (
    spark.createDataFrame(employees)
    .toDF("Id", "Name", "Salary", "Dept Id", "City")
)
display(df_todf)