# PySpark - Create DataFrame

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]

In [4]:
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
rdd = spark.sparkContext.parallelize(data)

dfFromRDD1.printSchema()

#  toDF() 

In [5]:
dfFromRDD1 = rdd.toDF(columns)
dfFromRDD1.printSchema()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)



# createDataFrame()

In [6]:
dfFromRDD2 = spark.createDataFrame(rdd).toDF(*columns)
dfFromRDD2.printSchema()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)



In [7]:
from pyspark.sql import Row
rowData = map(lambda x: Row(*x), data) 
dfFromData3 = spark.createDataFrame(rowData,columns)

In [8]:
dfFromData3.show(2)
#dfFromData3.head(2)

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
+--------+-----------+
only showing top 2 rows



# Create DataFrame with schema

In [9]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
data2 = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",2500)
  ]

schema = StructType([ \
    StructField("firstname",StringType(),True), \
    StructField("middlename",StringType(),True), \
    StructField("lastname",StringType(),True), \
    StructField("id", StringType(), True), \
    StructField("gender", StringType(), True), \
    StructField("salary", IntegerType(), True) \
  ])
 
df = spark.createDataFrame(data=data2,schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |2500  |
+---------+----------+--------+-----+------+------+



# Creating DataFrame from CSV

In [10]:
df2 = spark.read.csv("C:\Interpreter\mockd2.csv",header=True)

In [11]:
df2.show(5)

+------------+-----------+
|subcriber_sk|     gsm_no|
+------------+-----------+
|          74|867-29-4047|
|       32271|169-08-4615|
|       54456|408-58-2028|
|         929|230-71-4272|
|       01526|246-72-3378|
+------------+-----------+
only showing top 5 rows



In [12]:
df2 = spark.read.json("C:\Interpreter\mockd3.json")

In [13]:
df2.show()

+--------------------+--------------------+----------+------+----+---------------+----------+
|     _corrupt_record|               email|first_name|gender|  id|     ip_address| last_name|
+--------------------+--------------------+----------+------+----+---------------+----------+
|[{"id":1,"first_n...|                null|      null|  null|null|           null|      null|
|                null|  kspitell1@yelp.com|     Kalli|Female|   2|253.133.197.252|   Spitell|
|                null|ttwelves2@tinypic...|  Trumaine|  Male|   3|151.170.243.211|   Twelves|
|                null|afilippozzi3@goog...|    Ashley|Female|   4| 137.189.42.195|Filippozzi|
|                null|     geltun4@loc.gov|  Georgeta|Female|   5|    29.36.64.20|     Eltun|
|                null|  ccotgrove5@mail.ru|      Chad|Female|   6| 159.145.44.217|  Cotgrove|
|                null|nlilion6@feedburn...|  Northrop|  Male|   7|    31.21.59.43|    Lilion|
|                null|bcrocetti7@dagond...|       Bay|  Male

#  4. Other sources (Avro, Parquet, ORC, Kafka)
https://sparkbyexamples.com/pyspark/different-ways-to-create-dataframe-in-pyspark/