In [None]:
from pyspark.sql import SparkSession


# Create (or reuse) the notebook-wide SparkSession. Hive support is enabled so
# that metastore tables can be queried later via spark.sql(...) / spark.table(...).
spark = (
    SparkSession.builder
    .appName('Dataframe Creation in Spark')
    .enableHiveSupport()
    .getOrCreate()
)

# Spark Read

In [3]:
# Read the customers CSV, treating the first line as column names.
# No schema is supplied and inferSchema is not enabled, so every column
# (including is_active and registration_date) is loaded as a string.
df_csv = spark.read.csv('/data/customers_100.csv', header=True)
df_csv.show()

                                                                                

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    False|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     True|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     True|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    False|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    False|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    False|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    False|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     True|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|   

# Spark SQL

In [4]:
# List the tables visible in the current database of the Hive metastore.
tables_df = spark.sql('show tables')
tables_df.show()

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used


+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|external_customers_2|      false|
+---------+--------------------+-----------+



In [6]:
# Run a SQL query against the Hive table; spark.sql returns the result as a DataFrame.
active_customers_query = 'select * from external_customers_2 where is_active=True'
df_sql = spark.sql(active_customers_query)
df_sql.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|     true|
|          8| Customer_8|     Pune|  Karnataka|  India|       2023-06-22|     true|
|          9| Customer_9|   Mumbai|  Telangana|  India|       2023-01-05|     true|
|         10|Customer_10|     Pune|    Gujarat|  India|       2023-08-05|     true|
|         13|Customer_13|  Chennai|  Karnataka|  India|       2023-11-06|     true|
|         15|Customer_15|   Mumbai|    Gujarat|  India|       2023-03-02|     true|
|         18|Customer_18|     Pune|      Delhi|  India|       2023-10-04|   

# spark.table

In [7]:
# Load the whole metastore table as a DataFrame without writing a SQL string.
df_table = spark.table(tableName='external_customers_2')

In [8]:
# Preview the first rows of the table (show() prints 20 rows by default).
# NOTE(review): the first row returned is the file's header line parsed as data
# (customer_id and is_active come back null, the other columns hold the column
# names). Presumably the external table's definition does not skip the CSV
# header -- verify the table DDL or filter that row out before using df_table.
df_table.show()

+-----------+-----------+---------+-----------+-------+-----------------+---------+
|customer_id|       name|     city|      state|country|registration_date|is_active|
+-----------+-----------+---------+-----------+-------+-----------------+---------+
|       null|       name|     city|      state|country|registration_date|     null|
|          0| Customer_0|     Pune|Maharashtra|  India|       2023-06-29|    false|
|          1| Customer_1|Bangalore| Tamil Nadu|  India|       2023-12-07|     true|
|          2| Customer_2|Hyderabad|    Gujarat|  India|       2023-10-27|     true|
|          3| Customer_3|Bangalore|  Karnataka|  India|       2023-10-17|    false|
|          4| Customer_4|Ahmedabad|  Karnataka|  India|       2023-03-14|    false|
|          5| Customer_5|Hyderabad|  Karnataka|  India|       2023-07-28|    false|
|          6| Customer_6|     Pune|      Delhi|  India|       2023-08-29|    false|
|          7| Customer_7|Ahmedabad|West Bengal|  India|       2023-12-28|   

# spark.range

In [10]:
# Generate a single-column DataFrame of consecutive integers; `end` is exclusive.
df_range = spark.range(start=0, end=10)

In [11]:
# Display the generated rows: one column named `id` holding 0 through 9.
df_range.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [13]:

# Build a DataFrame directly from in-memory Python tuples.
# Only column names are supplied, so Spark infers each column's type from the
# Python values; registration_date therefore stays a plain string.
columns = ["customer_id", "name", "city", "registration_date", "is_active"]

data = [
    (1, "Alice", "Mumbai", "2023-01-15", True),
    (2, "Bob", "Delhi", "2023-03-25", False),
    (3, "Charlie", "Chennai", "2023-05-10", True),
]

df_list = spark.createDataFrame(data, schema=columns)
df_list.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----------+-------+-------+-----------------+---------+
|customer_id|   name|   city|registration_date|is_active|
+-----------+-------+-------+-----------------+---------+
|          1|  Alice| Mumbai|       2023-01-15|     true|
|          2|    Bob|  Delhi|       2023-03-25|    false|
|          3|Charlie|Chennai|       2023-05-10|     true|
+-----------+-------+-------+-----------------+---------+



                                                                                

# Homework - We can also impose an explicit schema using `.schema(...)`

# RDD --> DataFrame


In [14]:
# Distribute a small Python list of (customer_id, name) tuples as an RDD.
sc = spark.sparkContext
rdd = sc.parallelize([(1, 'Alice'), (2, "Bob")])

In [16]:
# Fetch the first two elements of the RDD back to the driver as a Python list.
rdd.take(2)

[(1, 'Alice'), (2, 'Bob')]

In [17]:
# Convert the RDD of tuples into a DataFrame, naming the two tuple positions.
df_rdd = rdd.toDF(schema=["customer_id", "name"])

In [18]:
# Display the DataFrame that was built from the RDD.
df_rdd.show()

+-----------+-----+
|customer_id| name|
+-----------+-----+
|          1|Alice|
|          2|  Bob|
+-----------+-----+



In [19]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()