In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

**PROBLEM 2**: Fetch total orders made by married male customers belonging to Central America occupied in a clerical job

In [2]:
spark = SparkSession.builder.appName('learning').master('local')\
                    .config('spark.jars','/home/saif/LFS/jars/spark-xml_2.12-0.5.0.jar')\
                    .getOrCreate()
sc = spark.sparkContext
# Note for the 1st Method config option is not required

## Method 1
## Reading excel file Using Pandas then converting it to Data frame

In [3]:
import pandas as pd

In [4]:
customer_pd_df = pd.read_excel('file:///home/saif/LFS/datasets/datasets_pavan/sales/customers.xlsx')

In [5]:
customer_pd_df.head(5)

Unnamed: 0,CustomerKey,Prefix,FirstName,LastName,BirthDate,MaritalStatus,Gender,AnnualIncome,TotalChildren,EducationLevel,Occupation,HomeOwner
0,11000.0,MR.,JON,YANG,1966-04-08,M,M,90000.0,2.0,Bachelors,Professional,Y
1,11001.0,MR.,EUGENE,HUANG,1965-05-14,S,M,60000.0,3.0,Bachelors,Professional,N
2,11002.0,MR.,RUBEN,TORRES,1965-08-12,M,M,60000.0,3.0,Bachelors,Professional,Y
3,11003.0,MS.,CHRISTY,ZHU,1968-02-15,S,F,70000.0,0.0,Bachelors,Professional,N
4,11004.0,MRS.,ELIZABETH,JOHNSON,1968-08-08,S,F,80000.0,5.0,Bachelors,Professional,Y


In [6]:
customer_pd_df['Occupation'].value_counts

<bound method IndexOpsMixin.value_counts of 0        Professional
1        Professional
2        Professional
3        Professional
4        Professional
             ...     
18143        Clerical
18144        Clerical
18145        Clerical
18146        Clerical
18147        Clerical
Name: Occupation, Length: 18148, dtype: object>

In [7]:
customer_pd_df.dtypes

CustomerKey              float64
Prefix                    object
FirstName                 object
LastName                  object
BirthDate         datetime64[ns]
MaritalStatus             object
Gender                    object
AnnualIncome             float64
TotalChildren            float64
EducationLevel            object
Occupation                object
HomeOwner                 object
dtype: object

In [8]:
from pyspark.sql.types import StructType, StringType, StructField,IntegerType,FloatType, DateType

In [9]:
data_schema = StructType([
            StructField('CustomerKey',FloatType(),True ),
            StructField('Prefix',StringType(),True ),
            StructField('FirstName',StringType(),True ),
            StructField('LastName',StringType(),True ),
            StructField('BirthDate',DateType(),True ),
            StructField('MaritalStatus',StringType(),True ),
            StructField('Gender',StringType(),True ),
            StructField('AnnualIncome',FloatType(),True ),
            StructField('TotalChildren',FloatType(),True ),
            StructField('EducationLevel',StringType(),True ),
            StructField('Occupation',StringType(),True ),
            StructField('HomeOwner',StringType(),True )  
])

In [10]:
customer_df = spark.createDataFrame(customer_pd_df,data_schema)
customer_df.printSchema()
customer_df.show(5)

root
 |-- CustomerKey: float (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: date (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- AnnualIncome: float (nullable = true)
 |-- TotalChildren: float (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- HomeOwner: string (nullable = true)

+-----------+------+---------+--------+----------+-------------+------+------------+-------------+--------------+------------+---------+
|CustomerKey|Prefix|FirstName|LastName| BirthDate|MaritalStatus|Gender|AnnualIncome|TotalChildren|EducationLevel|  Occupation|HomeOwner|
+-----------+------+---------+--------+----------+-------------+------+------------+-------------+--------------+------------+---------+
|    11000.0|   MR.|      JON|    YANG|1966-04-08|            M|     M|     90000.0|

## Method 2 Using a liberary 

==============================================================================================

In [10]:
data_schema = StructType([
            StructField('CustomerKey',FloatType(),True ),
            StructField('Prefix',StringType(),True ),
            StructField('FirstName',StringType(),True ),
            StructField('LastName',StringType(),True ),
            StructField('BirthDate',DateType(),True ),
            StructField('MaritalStatus',StringType(),True ),
            StructField('Gender',StringType(),True ),
            StructField('AnnualIncome',FloatType(),True ),
            StructField('TotalChildren',FloatType(),True ),
            StructField('EducationLevel',StringType(),True ),
            StructField('Occupation',StringType(),True ),
            StructField('HomeOwner',StringType(),True )  
])

In [16]:
customer_df=spark.read.format("com.databricks.spark.xml") \
          .option('header', 'True') \
          .option('inferSchema','True')\
          .schema(data_schema) \
          .load('file:///home/saif/LFS/datasets/datasets_pavan/sales/customers.xlsx')

customer_df.printSchema()
customer_df.show(5)

root
 |-- CustomerKey: float (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: date (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- AnnualIncome: float (nullable = true)
 |-- TotalChildren: float (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- HomeOwner: string (nullable = true)

+-----------+------+---------+--------+---------+-------------+------+------------+-------------+--------------+----------+---------+
|CustomerKey|Prefix|FirstName|LastName|BirthDate|MaritalStatus|Gender|AnnualIncome|TotalChildren|EducationLevel|Occupation|HomeOwner|
+-----------+------+---------+--------+---------+-------------+------+------------+-------------+--------------+----------+---------+
+-----------+------+---------+--------+---------+-------------+------+------------+----------

==============================================================================================

In [11]:
orders_returns_df = spark.read.format('csv')\
          .option('header', 'True') \
          .option('inferSchema', 'True') \
          .load('file:///home/saif/LFS/datasets/datasets_pavan/sales/order_returns.csv')
orders_returns_df.printSchema()
orders_returns_df.show(5)

root
 |-- ReturnDate: string (nullable = true)
 |-- TerritoryKey: integer (nullable = true)
 |-- ProductKey: integer (nullable = true)
 |-- ReturnQuantity: integer (nullable = true)

+----------+------------+----------+--------------+
|ReturnDate|TerritoryKey|ProductKey|ReturnQuantity|
+----------+------------+----------+--------------+
| 1/18/2015|           9|       312|             1|
| 1/18/2015|          10|       310|             1|
| 1/21/2015|           8|       346|             1|
| 1/22/2015|           4|       311|             1|
|  2/2/2015|           6|       312|             1|
+----------+------------+----------+--------------+
only showing top 5 rows



In [12]:
territories_df = spark.read.format('csv')\
          .option('header', 'True') \
          .option('inferSchema', 'True') \
          .load('file:///home/saif/LFS/datasets/datasets_pavan/sales/territories.csv')
territories_df.printSchema()
territories_df.show(5)


root
 |-- SalesTerritoryKey: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Continent: string (nullable = true)

+-----------------+---------+-------------+-------------+
|SalesTerritoryKey|   Region|      Country|    Continent|
+-----------------+---------+-------------+-------------+
|                1|Northwest|United States|North America|
|                2|Northeast|United States|North America|
|                3|  Central|United States|North America|
|                4|Southwest|United States|North America|
|                5|Southeast|United States|North America|
+-----------------+---------+-------------+-------------+
only showing top 5 rows



In [13]:
territories_df.select("Continent").distinct().show()

+-------------+
|    Continent|
+-------------+
|       Europe|
|North America|
|      Pacific|
+-------------+



In [14]:
territories_df.select("Region").distinct().show()

+--------------+
|        Region|
+--------------+
|       Germany|
|        France|
|     Northwest|
|     Southeast|
|       Central|
|        Canada|
|     Southwest|
|     Australia|
|United Kingdom|
|     Northeast|
+--------------+



In [16]:
sales_df = spark.read.format('csv')\
          .option('header', 'True') \
          .option('inferSchema', 'True') \
          .load('file:///home/saif/LFS/datasets/datasets_pavan/sales/sales.csv')
sales_df.printSchema()
sales_df.show(5)

root
 |-- OrderDate: string (nullable = true)
 |-- StockDate: string (nullable = true)
 |-- OrderNumber: string (nullable = true)
 |-- ProductKey: integer (nullable = true)
 |-- CustomerKey: integer (nullable = true)
 |-- TerritoryKey: integer (nullable = true)
 |-- OrderLineItem: integer (nullable = true)
 |-- OrderQuantity: integer (nullable = true)

+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
| OrderDate| StockDate|OrderNumber|ProductKey|CustomerKey|TerritoryKey|OrderLineItem|OrderQuantity|
+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
|01-01-2015| 9/21/2001|    SO45080|       332|      14657|           1|            1|            1|
|01-01-2015|12-05-2001|    SO45079|       312|      29255|           4|            1|            1|
|01-01-2015|10/29/2001|    SO45082|       350|      11455|           9|            1|            1|
|01-01-2015|11/16/2001|    SO45081|       338

In [None]:
(customer_df.where((col('MaritalStatus') == 'M') & (col('Gender') == 'M'))).join

In [32]:
result_df = sales_df.join(customer_df, customer_df.CustomerKey == sales_df.CustomerKey, 'inner')\
        .join(territories_df, sales_df.TerritoryKey == territories_df.SalesTerritoryKey , 'inner')\
        .where( 
        (customer_df.MaritalStatus == 'M') & (customer_df.Gender == 'M') & 
        (customer_df.Occupation == 'Clerical')  
        & (territories_df.Country == 'United States')
        #& (territories_df.Region == 'Central')
        )\
        .select(customer_df.MaritalStatus,customer_df.Gender,customer_df.Occupation,territories_df.Continent,
               sales_df.OrderNumber)
#         .groupby(sales_df.OrderNumber)\
#         .count(sales_df.OrderNumber)\
        
# Melwin Please Note when I add the condition   (territories_df.Region == 'Central') I am getting empty data set  
# as per the Question we need to add them 

In [33]:
result_df.select(col('OrderNumber')).groupby('OrderNumber').count().show()

+-----------+-----+
|OrderNumber|count|
+-----------+-----+
|    SO67006|    4|
|    SO51242|    3|
|    SO66963|    3|
|    SO57642|    3|
|    SO70660|    4|
|    SO61289|    3|
|    SO67984|    3|
|    SO57440|    3|
|    SO48526|    1|
|    SO64113|    3|
|    SO73559|    3|
|    SO51552|    3|
|    SO65630|    1|
|    SO73465|    2|
|    SO73377|    2|
|    SO60533|    1|
|    SO56391|    2|
|    SO66219|    3|
|    SO68973|    4|
|    SO73152|    4|
+-----------+-----+
only showing top 20 rows

