In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('introduction').getOrCreate()

In [0]:
employees_data = [[1, 'mahendra', 30000, 40, 'IT'], [2, 'mahi', 40000, 50, 'DataSciecne'], 
                  [3, 'sam', 54000, 43, 'IT'], [4, 'ram', 23000,56, 'DataAnalytics'],[5, 'rajesh', 30000, 39, 'DataAnalytics']]

In [0]:
df= spark.createDataFrame(employees_data, ['id', 'name', 'salary', 'age','department'])

In [0]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df.columns

Out[30]: ['id', 'name', 'salary', 'age', 'department']

In [0]:
df.show()

+---+--------+------+---+-------------+
| id|    name|salary|age|   department|
+---+--------+------+---+-------------+
|  1|mahendra| 30000| 40|           IT|
|  2|    mahi| 40000| 50|  DataSciecne|
|  3|     sam| 54000| 43|           IT|
|  4|     ram| 23000| 56|DataAnalytics|
|  5|  rajesh| 30000| 39|DataAnalytics|
+---+--------+------+---+-------------+



In [0]:
df.rdd.getNumPartitions()

Out[32]: 8

In [0]:
print(df)

DataFrame[id: bigint, name: string, salary: bigint, age: bigint, department: string]


In [0]:
df.show(2)

+---+--------+------+---+-----------+
| id|    name|salary|age| department|
+---+--------+------+---+-----------+
|  1|mahendra| 30000| 40|         IT|
|  2|    mahi| 40000| 50|DataSciecne|
+---+--------+------+---+-----------+
only showing top 2 rows



In [0]:
df = df.repartition(5) 

In [0]:
df.rdd.getNumPartitions()

Out[37]: 5

In [0]:
df= df.coalesce(2)

In [0]:
df.rdd.getNumPartitions()

Out[41]: 2

In [0]:
from pyspark.sql.types import IntegerType, StringType, DoubleType

df = df.withColumn("Id", df["Id"].cast(IntegerType()))


In [0]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)



In [0]:
df = df.withColumn("salary", df["salary"].cast(IntegerType()))

In [0]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Define the schema
schema = StructType([
    StructField("Id", IntegerType(), True), 
    StructField("Name", StringType(), True), 
    StructField("Salary", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("department", StringType(), True)
])


In [0]:
df = spark.createDataFrame(employees_data, schema)

In [0]:
df.show()

+---+--------+------+---+-------------+
| Id|    Name|Salary|age|   department|
+---+--------+------+---+-------------+
|  1|mahendra| 30000| 40|           IT|
|  2|    mahi| 40000| 50|  DataSciecne|
|  3|     sam| 54000| 43|           IT|
|  4|     ram| 23000| 56|DataAnalytics|
|  5|  rajesh| 30000| 39|DataAnalytics|
+---+--------+------+---+-------------+



In [0]:
df.createOrReplaceTempView('employees')

In [0]:
%sql
select * from employees

Id,Name,Salary,age,department
1,mahendra,30000,40,IT
2,mahi,40000,50,DataSciecne
3,sam,54000,43,IT
4,ram,23000,56,DataAnalytics
5,rajesh,30000,39,DataAnalytics


In [0]:
%sql
select * from employees

Id,Name,Salary,age,department
1,mahendra,30000,40,IT
2,mahi,40000,50,DataSciecne
3,sam,54000,43,IT
4,ram,23000,56,DataAnalytics
5,rajesh,30000,39,DataAnalytics


Databricks visualization. Run in Databricks to view.

In [0]:
df.rdd.getNumPartitions()

Out[63]: 8

In [0]:
df=df.repartition(3)

In [0]:
df.rdd.getNumPartitions()

Out[67]: 3

In [0]:
df=df.coalesce(2)

In [0]:
df.rdd.getNumPartitions()

Out[70]: 2

In [0]:
df.show()

+---+--------+------+---+-------------+
| Id|    Name|Salary|age|   department|
+---+--------+------+---+-------------+
|  2|    mahi| 40000| 50|  DataSciecne|
|  4|     ram| 23000| 56|DataAnalytics|
|  1|mahendra| 30000| 40|           IT|
|  3|     sam| 54000| 43|           IT|
|  5|  rajesh| 30000| 39|DataAnalytics|
+---+--------+------+---+-------------+



In [0]:
from pyspark.sql.types import *
from pyspark.sql import functions as f
from pyspark.sql.functions import *

In [0]:
customers = spark.read.csv('dbfs:/FileStore/Customers.csv', inferSchema=True, header=True)

In [0]:
customers.head()

Out[74]: Row(customer_id='hCT0x9JiGXBQ', customer_zip_code_prefix=58125, customer_city='varzea paulista', customer_state='SP')

In [0]:
customers.show()

+------------+------------------------+------------------+--------------+
| customer_id|customer_zip_code_prefix|     customer_city|customer_state|
+------------+------------------------+------------------+--------------+
|hCT0x9JiGXBQ|                   58125|   varzea paulista|            SP|
|PxA7fv9spyhx|                    3112|armacao dos buzios|            RJ|
|g3nXeJkGI0Qw|                    4119|           jandira|            SP|
|EOEsCQ6QlpIg|                   18212|        uberlandia|            MG|
|mVz5LO2Vd6cL|                   88868|          ilhabela|            SP|
|UkqnhxmX7YMP|                   25902|       porto uniao|            SC|
|85jiDiGSfhTu|                    4762|         guarulhos|            SP|
|gDdkaN8b9s1g|                   75870|        mogi-guacu|            SP|
|9Csx6oXlpLl1|                   69068|         bebedouro|            SP|
|gZTPKLPRnreg|                   20921|         sao paulo|            SP|
|TDEBQrhIwApB|                    1223

In [0]:
customers.count()

Out[86]: 89316

In [0]:
len(customers.columns)

Out[87]: 4

In [0]:
customers.createOrReplaceTempView("customers_data")

In [0]:
%sql
select * from customers_data

customer_id,customer_zip_code_prefix,customer_city,customer_state
hCT0x9JiGXBQ,58125,varzea paulista,SP
PxA7fv9spyhx,3112,armacao dos buzios,RJ
g3nXeJkGI0Qw,4119,jandira,SP
EOEsCQ6QlpIg,18212,uberlandia,MG
mVz5LO2Vd6cL,88868,ilhabela,SP
UkqnhxmX7YMP,25902,porto uniao,SC
85jiDiGSfhTu,4762,guarulhos,SP
gDdkaN8b9s1g,75870,mogi-guacu,SP
9Csx6oXlpLl1,69068,bebedouro,SP
gZTPKLPRnreg,20921,sao paulo,SP


In [0]:
customers.orderBy(f.desc("customer_zip_code_prefix")).show()

+------------+------------------------+--------------------+--------------+
| customer_id|customer_zip_code_prefix|       customer_city|customer_state|
+------------+------------------------+--------------------+--------------+
|7qqTmdyVGDMP|                   99990|         sao goncalo|            RJ|
|ZHjtMtaZiUx9|                   99990|         sao goncalo|            RJ|
|CtQYAzn1JrrE|                   99990|         sao goncalo|            RJ|
|XMrrIKxrnnsR|                   99980|           sao paulo|            SP|
|n8BqyzA7DwrF|                   99980|           sao paulo|            SP|
|SVoMuhaA5wJW|                   99980|           sao paulo|            SP|
|XqV2U8UDcDXZ|                   99970|            vermelho|            MG|
|Vwg3QX1Ym0xW|                   99960|           guarulhos|            SP|
|u1xLQAEOhaNw|                   99960|           guarulhos|            SP|
|yUaesnGpw2ff|                   99960|           guarulhos|            SP|
|BBT0uDEg6Mg

In [0]:
customers.orderBy(col("customer_zip_code_prefix").desc()).limit(4).show()

+------------+------------------------+-------------+--------------+
| customer_id|customer_zip_code_prefix|customer_city|customer_state|
+------------+------------------------+-------------+--------------+
|ZHjtMtaZiUx9|                   99990|  sao goncalo|            RJ|
|7qqTmdyVGDMP|                   99990|  sao goncalo|            RJ|
|CtQYAzn1JrrE|                   99990|  sao goncalo|            RJ|
|n8BqyzA7DwrF|                   99980|    sao paulo|            SP|
+------------+------------------------+-------------+--------------+



In [0]:
customers2 = customers.drop(col('customer_id'))

In [0]:
customers2.show()

+------------------------+------------------+--------------+
|customer_zip_code_prefix|     customer_city|customer_state|
+------------------------+------------------+--------------+
|                   58125|   varzea paulista|            SP|
|                    3112|armacao dos buzios|            RJ|
|                    4119|           jandira|            SP|
|                   18212|        uberlandia|            MG|
|                   88868|          ilhabela|            SP|
|                   25902|       porto uniao|            SC|
|                    4762|         guarulhos|            SP|
|                   75870|        mogi-guacu|            SP|
|                   69068|         bebedouro|            SP|
|                   20921|         sao paulo|            SP|
|                    1223|         itaperuna|            RJ|
|                   11703|             serra|            ES|
|                   32407|       sao goncalo|            RJ|
|                   1384

In [0]:
customers.show()

+------------+------------------------+------------------+--------------+
| customer_id|customer_zip_code_prefix|     customer_city|customer_state|
+------------+------------------------+------------------+--------------+
|hCT0x9JiGXBQ|                   58125|   varzea paulista|            SP|
|PxA7fv9spyhx|                    3112|armacao dos buzios|            RJ|
|g3nXeJkGI0Qw|                    4119|           jandira|            SP|
|EOEsCQ6QlpIg|                   18212|        uberlandia|            MG|
|mVz5LO2Vd6cL|                   88868|          ilhabela|            SP|
|UkqnhxmX7YMP|                   25902|       porto uniao|            SC|
|85jiDiGSfhTu|                    4762|         guarulhos|            SP|
|gDdkaN8b9s1g|                   75870|        mogi-guacu|            SP|
|9Csx6oXlpLl1|                   69068|         bebedouro|            SP|
|gZTPKLPRnreg|                   20921|         sao paulo|            SP|
|TDEBQrhIwApB|                    1223

In [0]:
orders=spark.read.csv('dbfs:/FileStore/Orders.csv', header=True, inferSchema=True)

In [0]:
orders.show()

+------------+------------+------------+------------------------+-------------------+-------------------------+-----------------------------+
|    order_id| customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_timestamp|order_estimated_delivery_date|
+------------+------------+------------+------------------------+-------------------+-------------------------+-----------------------------+
|Axfy13Hk4PIk|hCT0x9JiGXBQ|   delivered|     2017-10-22 18:57:54|2017-10-22 19:14:13|      2017-10-26 22:19:52|                   2017-11-09|
|v6px92oS8cLG|PxA7fv9spyhx|   delivered|     2018-06-20 21:40:31|2018-06-20 22:20:20|      2018-07-03 22:51:22|                   2018-07-24|
|Ulpf9skrhjfm|g3nXeJkGI0Qw|   delivered|     2018-02-16 16:19:31|2018-02-17 16:15:35|      2018-02-27 01:29:50|                   2018-03-08|
|bwJVWupf2keN|EOEsCQ6QlpIg|   delivered|     2018-08-18 18:04:29|2018-08-18 18:15:16|      2018-08-27 20:03:51|                   2018-09-19|
|Dd0Qn

In [0]:
orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_timestamp: timestamp (nullable = true)
 |-- order_estimated_delivery_date: date (nullable = true)



In [0]:
df.show()

+---+--------+------+---+-------------+
| Id|    Name|Salary|age|   department|
+---+--------+------+---+-------------+
|  2|    mahi| 40000| 50|  DataSciecne|
|  4|     ram| 23000| 56|DataAnalytics|
|  1|mahendra| 30000| 40|           IT|
|  3|     sam| 54000| 43|           IT|
|  5|  rajesh| 30000| 39|DataAnalytics|
+---+--------+------+---+-------------+



In [0]:
df.filter(df.Salary < 40000).show()

+---+--------+------+---+-------------+
| Id|    Name|Salary|age|   department|
+---+--------+------+---+-------------+
|  4|     ram| 23000| 56|DataAnalytics|
|  1|mahendra| 30000| 40|           IT|
|  5|  rajesh| 30000| 39|DataAnalytics|
+---+--------+------+---+-------------+



In [0]:
df.select("Id", "Name", "Salary", "department").show()

+---+--------+------+-------------+
| Id|    Name|Salary|   department|
+---+--------+------+-------------+
|  2|    mahi| 40000|  DataSciecne|
|  4|     ram| 23000|DataAnalytics|
|  1|mahendra| 30000|           IT|
|  3|     sam| 54000|           IT|
|  5|  rajesh| 30000|DataAnalytics|
+---+--------+------+-------------+



In [0]:
df.select("department", 'salary').groupBy("department").\
    agg(sum("salary").alias("total_salary")).show()

+-------------+------------+
|   department|total_salary|
+-------------+------------+
|DataAnalytics|       53000|
|  DataSciecne|       40000|
|           IT|       84000|
+-------------+------------+



In [0]:
df.write.format('csv').mode('overwrite').save("dbfs:/FileStore/semployees_data")

In [0]:
dff=spark.read.csv('dbfs:/FileStore/semployees_data')

In [0]:
dff.show()

+---+--------+-----+---+-------------+
|_c0|     _c1|  _c2|_c3|          _c4|
+---+--------+-----+---+-------------+
|  1|mahendra|30000| 40|           IT|
|  3|     sam|54000| 43|           IT|
|  5|  rajesh|30000| 39|DataAnalytics|
|  2|    mahi|40000| 50|  DataSciecne|
|  4|     ram|23000| 56|DataAnalytics|
+---+--------+-----+---+-------------+

