# PySpark - Create Data Frame from List or RDD on the fly

In [1]:
# Create Spark Session

from pyspark.sql import SparkSession

# Build (or reuse, if one is already running) a session named
# "Create DF on fly" against the local master using all available cores.
spark = (
    SparkSession.builder
    .appName("Create DF on fly")
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell, so the notebook displays the session
spark

### SparkSession range() method

In [7]:
# Create a DataFrame with a single `id` column from a range of values;
# a lone argument is the exclusive upper bound, so this yields 0 through 4
df_range_1 = spark.range(5)
df_range_1.show(n=5, truncate=False)

+---+
|id |
+---+
|0  |
|1  |
|2  |
|3  |
|4  |
+---+



In [8]:
# start, end (exclusive) and step can optionally be given explicitly
df_range_2 = spark.range(
    start=1,
    end=10,
    step=2,
)
df_range_2.show(n=10, truncate=False)

+---+
|id |
+---+
|1  |
|3  |
|5  |
|7  |
|9  |
+---+



### SparkSession createDataFrame() method

In [44]:
# Column names, listed in the same order as the values inside each row
_cols = ["id", "name"]

# Rows as native Python lists of [id, name]; the last row has no name
_data = [[1, "Ram"], [2, "Shyam"], [3, "Asraf"], [4, None]]

In [47]:
# Build the DataFrame from the Python list; column names are taken from
# _cols and the column types are inferred from the row values
df_users = spark.createDataFrame(
    data=_data,
    schema=_cols,
)

# Print the inferred schema, then the rows without truncating cell values
df_users.printSchema()
df_users.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

+---+-----+
|id |name |
+---+-----+
|1  |Ram  |
|2  |Shyam|
|3  |Asraf|
|4  |null |
+---+-----+



### RDD toDF() method

In [55]:
# Turn the same Python list (_data) into a new RDD

_data_rdd = spark.sparkContext.parallelize(_data)
# collect() returns the RDD's elements as a Python list; being the cell's
# last expression, that list is what the notebook displays
_data_rdd.collect()

# To check the number of partitions of the data, uncomment:
# _data_rdd.getNumPartitions()

[[1, 'Ram'], [2, 'Shyam'], [3, 'Asraf'], [4, None]]

In [60]:
# Create a DataFrame from the RDD

# Convert the RDD to a DataFrame, naming its columns with _cols
df_users_new = _data_rdd.toDF(schema=_cols)

# show() with its default arguments
df_users_new.show()

+---+-----+
| id| name|
+---+-----+
|  1|  Ram|
|  2|Shyam|
|  3|Asraf|
|  4| null|
+---+-----+

