<h1>Data Frame</h1>

In [1]:
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Spark configuration: application name "Trying", master "local[*]".
sparkConf = SparkConf().setAppName("Trying").setMaster("local[*]")
# getOrCreate returns the already-running SparkContext if there is one, so
# re-executing this cell in the same kernel no longer raises
# "Cannot run multiple SparkContexts at once". The conf is only applied when a
# new context actually has to be created.
sc = SparkContext.getOrCreate(conf=sparkConf)

In [3]:
# Entry point for the DataFrame API; getOrCreate() returns the active session
# if one already exists.
spark = (
    SparkSession.builder
    .appName("Trying")
    .master("local[*]")
    .getOrCreate()
)

<h3>Convert RDDs to DataFrames</h3>

#### Using toDF()

In [16]:
# Read the employee text file from the local filesystem as an RDD of lines.
rdd1 = sc.textFile("file:///home/saif/LFS/datasets/emp.txt")
# collect() brings every line back to the driver before printing.
emp_lines = rdd1.collect()
print(emp_lines)
# Last expression of the cell: displays the type of rdd1.
type(rdd1)

['id,name,city', '101,saif,mumbai', '102,mitali,pune', '103,ram,balewadi']


pyspark.rdd.RDD

In [17]:
# using toDF()
# rdd1 is left untouched (still the raw lines) so this cell can be re-run
# safely; reassigning rdd1 to its own filtered/split result made a second run
# fail because .split would then be called on lists instead of strings.
header_line = 'id,name,city'
emp_rows_rdd = (
    rdd1
    .filter(lambda line: line != header_line)   # drop the header row
    .map(lambda line: line.split(','))          # one list of fields per record
)
df = emp_rows_rdd.toDF(['id', 'name', 'city'])
df.printSchema()
df.show(truncate=False)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)

+---+------+--------+
|id |name  |city    |
+---+------+--------+
|101|saif  |mumbai  |
|102|mitali|pune    |
|103|ram   |balewadi|
+---+------+--------+



In [27]:
# In-memory sample data: one (name, number) tuple per department.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
# Turn the list into an RDD, then name the two tuple positions to obtain a DataFrame.
rdd = sc.parallelize(dept)
df = rdd.toDF(["dept", "deptno"])
df.printSchema()
df.show(truncate=False)

root
 |-- dept: string (nullable = true)
 |-- deptno: long (nullable = true)

+---------+------+
|dept     |deptno|
+---------+------+
|Finance  |10    |
|Marketing|20    |
|Sales    |30    |
|IT       |40    |
+---------+------+



#### Using createDataFrame(data=, schema=)

In [21]:
# Column names for the DataFrame, in the same order as the tuple fields below.
dataSchema = ["dept", "deptno"]
deptdata = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
# Build the DataFrame directly from the Python list, without an intermediate RDD.
df = spark.createDataFrame(deptdata, dataSchema)
df.printSchema()
df.show(truncate=False)

root
 |-- dept: string (nullable = true)
 |-- deptno: long (nullable = true)

+---------+------+
|dept     |deptno|
+---------+------+
|Finance  |10    |
|Marketing|20    |
|Sales    |30    |
|IT       |40    |
+---------+------+



<h3>Reading the df from the source</h3>

In [24]:
# Read a '|'-delimited file with a header row, letting Spark infer column types.
# NOTE(review): the recorded output shows a column literally named "name,sal"
# and numeric values under "country" - the header line of the source file
# appears to mix ',' and '|' separators; verify the file.
df = (
    spark.read.format('csv')
    .options(delimiter='|', header='True', inferSchema='True')
    .load('file:///home/saif/LFS/datasets/emp_all.txt')
)
# for HDFS hdfs://localhost:9000/user/saif/HFS/Output/....
df.show(5, truncate=False)
df.printSchema()

+---+--------+-------+
|id |name,sal|country|
+---+--------+-------+
|101|sohail  |1000   |
|102|Saif    |2000   |
|103|Mitali  |3000   |
|104|Manas   |4000   |
|105|Ram     |5000   |
+---+--------+-------+
only showing top 5 rows

root
 |-- id: integer (nullable = true)
 |-- name,sal: string (nullable = true)
 |-- country: integer (nullable = true)



<h3>Save the DF to a file <br><br> Write the df </h3>

In [25]:
# Write the current df as CSV to the given local path, using the 'append' save mode.
(
    df.write
    .format('csv')
    .mode('append')
    .save('file:///home/saif/LFS/datasets/emp_all_write.txt')
)
# for HDFS hdfs://localhost:9000/user/saif/HFS/Output/....

<h3>Saving modes:</h3>
<b>PySpark's DataFrameWriter has a <code>mode()</code> method to specify the saving mode.</b>
<ul>
    <li><b>error:</b> The default option. Returns an error when the file already exists.</li>
    <li><b>ignore:</b> Skips the write operation when the file already exists.</li>
    <li><b>append:</b> Adds the data to the existing file.</li>
    <li><b>overwrite:</b> Overwrites the existing file.</li>
</ul>

<h3>Select</h3>

In [None]:
# The df loaded above has the columns id, "name,sal" and country, so the
# examples use id and country (firstname/lastname do not exist in this df).
# Single & Multiple Columns:
df.select("id").show()
df.select("id", "country").show()
# Using Dataframe object name:
df.select(df.id, df.country).show()
# Using col function (col comes from pyspark.sql.functions):
df.select(col("id"), col("country")).show()

### Melwin, you should look into
## DataFrame Partitions & Executors:

<h2>*****Spark Jobs, Stages & Tasks *******</h2>
<a href="https://www.udemy.com/course/apache-spark-programming-in-python-for-beginners/learn/lecture/20192582#content">link Udemy (login with your account)</a>

## **** Data Frame Partitions and Executors *****
<a href="https://www.udemy.com/course/apache-spark-programming-in-python-for-beginners/learn/lecture/20192576#content">link Udemy (login with your account)</a>

### Melwin, remember: every action has at least one job, every job has at least one stage, and every stage has at least one task.
### The number of tasks depends on the number of partitions: if there are n partitions, n tasks are created for the respective job.
### Watch the video to understand.

In [22]:
def add_to_list(item,list_param=[]):
    """Append ``item`` to ``list_param`` and return that same list object.

    NOTE: the default ``[]`` is evaluated once, when the function is defined,
    so every call that omits ``list_param`` appends to one shared list. This is
    left as-is on purpose: the cell contrasts it with ``add_to_list2``.
    """
    list_param.append(item)
    return list_param

def add_to_list2(item, list_param=None):
    """Append ``item`` to ``list_param`` and return the list.

    Args:
        item: Value to append.
        list_param: List to append to. When omitted (``None``), a fresh list
            is created for this call, so calls never share state.

    Returns:
        The list that received ``item`` - the caller's own list when one was
        passed, otherwise the newly created one.
    """
    # Compare against None rather than truthiness: `not list_param` is also
    # true for a caller-supplied empty list, which would then be silently
    # swapped for a new list and never receive the item.
    if list_param is None:
        list_param = []
    list_param.append(item)
    return list_param
 
# Call the two variants alternately and collect what each call returns.
# Elements are evaluated left to right, i.e. in the same order as the calls.
master = [
    add_to_list(4),
    add_to_list2(5),
    add_to_list(8),
    add_to_list2(10),
]

print(master)

[[4, 8], [5], [4, 8], [10]]
