## Create a DataFrame

In [18]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this notebook.
spark = (
    SparkSession.builder
    .appName("demo")
    .getOrCreate()
)

# Sample rows as (name, age) tuples, with the matching column names.
data = [
    ('ali', 13),
    ('ram', 22),
    ('sonu', 25),
]
columns = ['name', 'age']

# Build the DataFrame from the in-memory rows and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+----+---+
|name|age|
+----+---+
| ali| 13|
| ram| 22|
|sonu| 25|
+----+---+



In [20]:
# Project the name column and a derived column computed with a SQL expression.
age_plus_one_df = df.selectExpr(
    "Name",
    "Age + 1 as AgePlusOne",
)
age_plus_one_df.show()



+----+----------+
|Name|AgePlusOne|
+----+----------+
| ali|        14|
| ram|        23|
|sonu|        26|
+----+----------+



### Creating a temp view and querying it with SQL

In [21]:
# Register the DataFrame under a view name so it can be referenced from SQL.
df.createOrReplaceTempView("example")

# Run the SQL query against the view and print the matching rows.
over_15_df = spark.sql("select * from example where age>15")
over_15_df.show()

+----+---+
|name|age|
+----+---+
| ram| 22|
|sonu| 25|
+----+---+



## Reading from csv


In [15]:
# Pipe-delimited text file with a header row; column types are inferred.
file_path = "../data/input_real_estate.txt"

pipe_reader = spark.read.option('delimiter', '|')
df = pipe_reader.csv(
    file_path,
    header=True,
    inferSchema=True,
)
df.show()


+-----------+--------------+------+--------+---------+----+-----------+----------+
|Property_ID|      Location| Price|Bedrooms|Bathrooms|Size|Price_SQ_FT|    Status|
+-----------+--------------+------+--------+---------+----+-----------+----------+
|    1461262| Arroyo Grande|795000|       3|        3|2371|      365.3|Short Sale|
|    1478004|   Paulo Pablo|399000|       4|        3|2818|     163.59|Short Sale|
|    1486551|   Paulo Pablo|545000|       4|        3|3032|     179.75|Short Sale|
|    1492832|     Santa Bay|909000|       4|        4|3540|     286.78|Short Sale|
|    1499102|Thomas Country|109900|       3|        1|1249|      98.99|Short Sale|
|    1489132|Thomas Country|109000|       2|        1|1129|      93.99|Short Sale|
|    1467262|    Fort Worth|987000|       4|        3|2771|      465.3|Short Sale|
|    1478114|   Paulo Pablo|409000|       4|        3|2918|     223.19|Short Sale|
|    1402551|     Nashville|545000|       4|        3|2932|     169.75|Short Sale|
|   

In [14]:
# Read the record file. No header option is passed, so Spark assigns the
# default column names (_c0, _c1, ...).
file_path = "../record.csv"
df = spark.read.csv(file_path)
df.show()

# Collapse to a single partition before writing.
coalesce_df = df.coalesce(1)

# Spark's default save mode raises an error when the target path already
# exists, so this cell failed on every re-run. Overwriting makes the cell
# idempotent under Restart & Run All.
coalesce_df.write.mode("overwrite").csv("../data/record.csv")


+------+-----+--------+---+---+
|   _c0|  _c1|     _c2|_c3|_c4|
+------+-----+--------+---+---+
|Robert| null|Williams|  3|  M|
| Rajiv| Mary|   Kumar|  5|  F|
|Oliver|Queen|    null|  2|  M|
| Berry| null|   Allen|  1|  M|
|  Tony| null|   Stark|  4|  F|
+------+-----+--------+---+---+



## Difference between groupBy and reduceBy

In [1]:
from pyspark.sql import SparkSession

In [10]:
# Start (or reuse) the Spark session for the group-by examples.
spark = (
    SparkSession.builder
    .appName("GroupByExample")
    .getOrCreate()
)

In [11]:
# Sample employee rows as (name, department, salary) tuples.
data = [
    ("Alice", "HR", 3000),
    ("Bob", "Engineering", 4000),
    ("carol", "HR", 3500),
    ("David", "Engineering", 4500),
]
# Column names, in the same order as the tuple fields above.
columns = ["Name", "Department", "Salary"]

In [12]:
# Build the employee DataFrame from the in-memory rows and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()


+-----+-----------+------+
| Name| Department|Salary|
+-----+-----------+------+
|Alice|         HR|  3000|
|  Bob|Engineering|  4000|
|carol|         HR|  3500|
|David|Engineering|  4500|
+-----+-----------+------+



In [13]:

# Average salary per department: group first, then aggregate.
by_department = df.groupBy("Department")
grouped_df = by_department.avg("Salary")
grouped_df.show()


+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|         HR|     3250.0|
|Engineering|     4250.0|
+-----------+-----------+



In [14]:
# Highest salary per department, using the grouped-data shortcut method.
by_department = df.groupBy("Department")
max_df = by_department.max("Salary")
max_df.show()

+-----------+-----------+
| Department|max(Salary)|
+-----------+-----------+
|         HR|       3500|
|Engineering|       4500|
+-----------+-----------+



In [19]:
# Import the functions module under a namespace. Importing `max` directly
# would replace Python's builtin `max` for the rest of the kernel session.
from pyspark.sql import functions as F

# Highest salary per department; agg() allows the result column to be aliased.
max_df = df.groupBy("Department").agg(F.max("Salary").alias("max_s"))
max_df.show()

+-----------+-----+
| Department|max_s|
+-----------+-----+
|         HR| 3500|
|Engineering| 4500|
+-----------+-----+

