## Create a DataFrame

In [18]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start one named "demo".
spark = (
    SparkSession.builder
    .appName("demo")
    .getOrCreate()
)

# Three (name, age) records and the matching column labels.
data = [
    ('ali', 13),
    ('ram', 22),
    ('sonu', 25),
]
columns = ['name', 'age']

# Build the DataFrame from the local rows and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+----+---+
|name|age|
+----+---+
| ali| 13|
| ram| 22|
|sonu| 25|
+----+---+



In [20]:
# Project the name column plus a derived column (age incremented by one),
# both written as SQL expression strings, and print the result.
(
    df
    .selectExpr("Name", "Age + 1 as AgePlusOne")
    .show()
)



+----+----------+
|Name|AgePlusOne|
+----+----------+
| ali|        14|
| ram|        23|
|sonu|        26|
+----+----------+



In [None]:
# Same projection as the two-column version, but the original age is kept
# next to the derived AgePlusOne column.
(
    df
    .selectExpr("Name", "Age", "Age + 1 as AgePlusOne")
    .show()
)


### Creating a tempview using SQL

In [21]:
# Register the DataFrame under the name "example" so SQL text can refer to it
# (an existing temp view with that name is replaced).
df.createOrReplaceTempView("example")

# Query the view with plain SQL, keeping only rows whose age exceeds 15.
(
    spark
    .sql("select * from example where age>15")
    .show()
)

+----+---+
|name|age|
+----+---+
| ram| 22|
|sonu| 25|
+----+---+



## Reading from csv


In [48]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("Demo")
    .getOrCreate()
)

# Pipe-delimited text file with a header row; column types are inferred.
file_path = "../data/input_real_estate.txt"
df = (
    spark.read
    .option('delimiter', '|')
    .option('header', True)
    .option('inferSchema', True)
    .csv(file_path)
)
df.show()


+-----------+--------------+------+--------+---------+----+-----------+----------+
|Property_ID|      Location| Price|Bedrooms|Bathrooms|Size|Price_SQ_FT|    Status|
+-----------+--------------+------+--------+---------+----+-----------+----------+
|    1461262| Arroyo Grande|795000|       3|        3|2371|      365.3|Short Sale|
|    1478004|   Paulo Pablo|399000|       4|        3|2818|     163.59|Short Sale|
|    1486551|   Paulo Pablo|545000|       4|        3|3032|     179.75|Short Sale|
|    1492832|     Santa Bay|909000|       4|        4|3540|     286.78|Short Sale|
|    1499102|Thomas Country|109900|       3|        1|1249|      98.99|Short Sale|
|    1489132|Thomas Country|109000|       2|        1|1129|      93.99|Short Sale|
|    1467262|    Fort Worth|987000|       4|        3|2771|      465.3|Short Sale|
|    1478114|   Paulo Pablo|409000|       4|        3|2918|     223.19|Short Sale|
|    1402551|     Nashville|545000|       4|        3|2932|     169.75|Short Sale|
|   

In [14]:
# Read the raw CSV with the reader defaults; no header option is set, which is
# why the columns show up as _c0 ... _c4 in the output.
file_path = "../record.csv"
df = spark.read.csv(file_path)
df.show()

# Collapse to a single partition before exporting.
coalesce_df = df.coalesce(1)

# Spark writes a *directory* at this path. The default save mode raises an
# error when the path already exists, so re-running this cell used to fail;
# overwrite mode keeps the cell re-runnable.
coalesce_df.write.mode("overwrite").csv("../data/record.csv")


+------+-----+--------+---+---+
|   _c0|  _c1|     _c2|_c3|_c4|
+------+-----+--------+---+---+
|Robert| null|Williams|  3|  M|
| Rajiv| Mary|   Kumar|  5|  F|
|Oliver|Queen|    null|  2|  M|
| Berry| null|   Allen|  1|  M|
|  Tony| null|   Stark|  4|  F|
+------+-----+--------+---+---+



## Difference between groupBy and reduceByKey

In [1]:
from pyspark.sql import SparkSession

In [10]:
# Create (or fetch the already-running) Spark session for the group-by examples.
spark = (
    SparkSession.builder
    .appName("GroupByExample")
    .getOrCreate()
)

In [11]:
# Sample employee rows: (Name, Department, Salary).
data = [
    ("Alice", "HR", 3000),
    ("Bob", "Engineering", 4000),
    ("carol", "HR", 3500),
    ("David", "Engineering", 4500),
]
# Column labels, in the same order as the tuple fields above.
columns = ["Name", "Department", "Salary"]

In [12]:
# Turn the local rows into a Spark DataFrame, labelling columns from `columns`.
df = spark.createDataFrame(data, schema=columns)
df.show()


+-----+-----------+------+
| Name| Department|Salary|
+-----+-----------+------+
|Alice|         HR|  3000|
|  Bob|Engineering|  4000|
|carol|         HR|  3500|
|David|Engineering|  4500|
+-----+-----------+------+



In [13]:

# Average salary per department: one output row for each distinct Department.
grouped_df = (
    df
    .groupBy("Department")
    .avg("Salary")
)
grouped_df.show()


+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|         HR|     3250.0|
|Engineering|     4250.0|
+-----------+-----------+



In [14]:
# Highest salary per department, using the grouped-data shortcut method;
# the result column is named max(Salary).
max_df = (
    df
    .groupBy("Department")
    .max("Salary")
)
max_df.show()

+-----------+-----------+
| Department|max(Salary)|
+-----------+-----------+
|         HR|       3500|
|Engineering|       4500|
+-----------+-----------+



In [19]:
# Import the functions module under an alias rather than `from ... import max`:
# importing `max` by name replaces Python's built-in max() for every cell that
# runs afterwards in this kernel.
from pyspark.sql import functions as F

# Highest salary per department, with the result column renamed to "max_s".
max_df = df.groupBy("Department").agg(F.max("Salary").alias("max_s"))
max_df.show()

+-----------+-----+
| Department|max_s|
+-----------+-----+
|         HR| 3500|
|Engineering| 4500|
+-----------+-----+



## Printing the Schema


In [2]:
# `df` is rebound to other sample data by the cells above (the record.csv read
# and the employee group-by example), so reload the real-estate file here.
# Without this, running the notebook top to bottom prints the employee schema
# and the Price/Location cells that follow fail on missing columns.
df = (
    spark.read
    .option('delimiter', '|')
    .csv("../data/input_real_estate.txt", header=True, inferSchema=True)
)
df.printSchema()

root
 |-- Property_ID: integer (nullable = true)
 |-- Location: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Bedrooms: integer (nullable = true)
 |-- Bathrooms: integer (nullable = true)
 |-- Size: integer (nullable = true)
 |-- Price_SQ_FT: double (nullable = true)
 |-- Status: string (nullable = true)



## Selecting a column

In [6]:
# Keep only the Price column and print it.
(
    df
    .select("Price")
    .show()
)

+------+
| Price|
+------+
|795000|
|399000|
|545000|
|909000|
|109900|
|109000|
|987000|
|409000|
|545000|
|980000|
|119900|
|129000|
|798000|
|789000|
|896000|
|987000|
|219900|
|107200|
+------+



## Filter a row

In [8]:
# Keep only the rows whose Price is above 500000 and print them.
(
    df
    .filter(df['Price'] > 500000)
    .show()
)

+-----------+-------------+------+--------+---------+----+-----------+----------+
|Property_ID|     Location| Price|Bedrooms|Bathrooms|Size|Price_SQ_FT|    Status|
+-----------+-------------+------+--------+---------+----+-----------+----------+
|    1461262|Arroyo Grande|795000|       3|        3|2371|      365.3|Short Sale|
|    1486551|  Paulo Pablo|545000|       4|        3|3032|     179.75|Short Sale|
|    1492832|    Santa Bay|909000|       4|        4|3540|     286.78|Short Sale|
|    1467262|   Fort Worth|987000|       4|        3|2771|      465.3|Short Sale|
|    1402551|    Nashville|545000|       4|        3|2932|     169.75|Short Sale|
|    1405832|     San Jose|980000|       4|        4|3340|     290.98|Short Sale|
|    1469062|Arroyo Grande|798000|       3|        4|2321|      235.9|Short Sale|
|    1498004|    Nashville|789000|       4|        3|2419|     263.59|Short Sale|
|    1586751|    Nashville|896000|       4|        3|3132|     199.75|Short Sale|
|    1433232|   

## Group By Aggregation
<u>count()</u>
<u>countDistinct()</u>
<u>sum()</u>
<u>avg()</u>
<u>min()</u>
<u>max()</u>
<u>last()</u>
<u>variance()</u>
<u>stddev()</u>
<u>skewness()</u>
<u>kurtosis()</u>
<u>approx_count_distinct()</u>
<u>collect_list()</u>
<u>collect_set()</u>





In [27]:
# Total price per Location, using the {column: aggregate-name} dict form of agg().
# The lower-case 'price' key is what produces the sum(price) header in the output.
(
    df
    .groupBy("Location")
    .agg({'price': "sum"})
    .show()
)

+--------------+----------+
|      Location|sum(price)|
+--------------+----------+
|     Nashville|   2230000|
|   Paulo Pablo|   1353000|
| Arroyo Grande|   1593000|
|      Glendale|    987000|
|     Santa Bay|    909000|
|      San Jose|   1087200|
|Thomas Country|    347900|
|    Fort Worth|   1326800|
+--------------+----------+



In [26]:
# Count of Bathrooms values per Location, via the dict form of agg().
(
    df
    .groupBy("Location")
    .agg({"Bathrooms": "count"})
    .show()
)

+--------------+----------------+
|      Location|count(Bathrooms)|
+--------------+----------------+
|     Nashville|               3|
|   Paulo Pablo|               3|
| Arroyo Grande|               2|
|      Glendale|               1|
|     Santa Bay|               1|
|      San Jose|               2|
|Thomas Country|               3|
|    Fort Worth|               3|
+--------------+----------------+



In [33]:
# Disabled alternative, kept for reference; enabling it would require importing
# countDistinct from pyspark.sql.functions first:
# df.select(countDistinct("Bathrooms").alias("dist_count")).show()

# Active statement: count of Bathrooms values per Location (same aggregation
# as the cell before this one).
(
    df
    .groupBy("Location")
    .agg({"Bathrooms": "count"})
    .show()
)



+--------------+----------------+
|      Location|count(Bathrooms)|
+--------------+----------------+
|     Nashville|               3|
|   Paulo Pablo|               3|
| Arroyo Grande|               2|
|      Glendale|               1|
|     Santa Bay|               1|
|      San Jose|               2|
|Thomas Country|               3|
|    Fort Worth|               3|
+--------------+----------------+



In [40]:
from pyspark.sql.functions import last

# Take the last Bathrooms value across the whole DataFrame as a one-row result.
# NOTE(review): no ordering is imposed here, so "last" follows whatever row
# order Spark happens to process — confirm that is acceptable.
last_df = df.select(
    last("Bathrooms").alias("last_bath")
)
last_df.show()


+---------+
|last_bath|
+---------+
|        1|
+---------+



In [41]:
from pyspark.sql.functions import variance

# Variance of Price over all rows; the printed header shows Spark labels the
# result var_samp(Price).
(
    df
    .select(variance('Price'))
    .show()
)

+--------------------+
|     var_samp(Price)|
+--------------------+
|1.230593021241830...|
+--------------------+



In [44]:
from pyspark.sql.functions import stddev

# Standard deviation of Price over all rows, shown under a readable column name.
(
    df
    .select(stddev('Price').alias("deviation_in_price"))
    .show()
)

+------------------+
|deviation_in_price|
+------------------+
| 350798.0931022616|
+------------------+



In [46]:
from pyspark.sql.functions import skewness

# Skewness of the Price distribution over all rows.
(
    df
    .select(skewness('Price'))
    .show()
)

+--------------------+
|     skewness(Price)|
+--------------------+
|-0.07496267236398088|
+--------------------+



In [51]:
from pyspark.sql.functions import collect_list, collect_set

# Per Location, gather every Price into a list (duplicates are kept).
(
    df
    .groupBy('Location')
    .agg(collect_list('Price').alias("price_range"))
    .show(truncate=False)
)

# Same grouping, gathered into a set instead (duplicates are dropped).
(
    df
    .groupBy('Location')
    .agg(collect_set('Price').alias("price_range"))
    .show(truncate=False)
)


+--------------+------------------------+
|Location      |price_range             |
+--------------+------------------------+
|Nashville     |[545000, 789000, 896000]|
|Paulo Pablo   |[399000, 545000, 409000]|
|Arroyo Grande |[795000, 798000]        |
|Glendale      |[987000]                |
|Santa Bay     |[909000]                |
|San Jose      |[980000, 107200]        |
|Thomas Country|[109900, 109000, 129000]|
|Fort Worth    |[987000, 119900, 219900]|
+--------------+------------------------+

+--------------+------------------------+
|Location      |price_range             |
+--------------+------------------------+
|Nashville     |[545000, 896000, 789000]|
|Paulo Pablo   |[545000, 399000, 409000]|
|Arroyo Grande |[798000, 795000]        |
|Glendale      |[987000]                |
|Santa Bay     |[909000]                |
|San Jose      |[980000, 107200]        |
|Thomas Country|[109900, 109000, 129000]|
|Fort Worth    |[987000, 119900, 219900]|
+--------------+-----------------

In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list

# Obtain the Spark session (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("collect_list Example")
    .getOrCreate()
)

# Small (name, value) sample with two rows per name.
data = [
    ("Alice", 1),
    ("Bob", 2),
    ("Alice", 3),
    ("Bob", 4),
]
df = spark.createDataFrame(data, schema=["name", "value"])

# Gather each name's values into a single list column.
collect_list_df = (
    df
    .groupBy("name")
    .agg(collect_list("value").alias("value_list"))
)
collect_list_df.show()


+-----+----------+
| name|value_list|
+-----+----------+
|Alice|    [1, 3]|
|  Bob|    [2, 4]|
+-----+----------+

