In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, to_timestamp

**PROBLEM 3**: List customers who ordered same product more than once in a month 

In [2]:
spark = SparkSession.builder.appName('learning').master('local').getOrCreate()
sc = spark.sparkContext

In [3]:
import pandas as pd
from pyspark.sql.types import StructType, StringType, StructField,IntegerType,FloatType, DateType
customer_pd_df = pd.read_excel('file:///home/saif/LFS/datasets/datasets_pavan/sales/customers.xlsx')

data_schema = StructType([
            StructField('CustomerKey',FloatType(),True ),
            StructField('Prefix',StringType(),True ),
            StructField('FirstName',StringType(),True ),
            StructField('LastName',StringType(),True ),
            StructField('BirthDate',DateType(),True ),
            StructField('MaritalStatus',StringType(),True ),
            StructField('Gender',StringType(),True ),
            StructField('AnnualIncome',FloatType(),True ),
            StructField('TotalChildren',FloatType(),True ),
            StructField('EducationLevel',StringType(),True ),
            StructField('Occupation',StringType(),True ),
            StructField('HomeOwner',StringType(),True )  
])


customer_df = spark.createDataFrame(customer_pd_df,data_schema)
customer_df.printSchema()
customer_df.show(5)

root
 |-- CustomerKey: float (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: date (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- AnnualIncome: float (nullable = true)
 |-- TotalChildren: float (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- HomeOwner: string (nullable = true)

+-----------+------+---------+--------+----------+-------------+------+------------+-------------+--------------+------------+---------+
|CustomerKey|Prefix|FirstName|LastName| BirthDate|MaritalStatus|Gender|AnnualIncome|TotalChildren|EducationLevel|  Occupation|HomeOwner|
+-----------+------+---------+--------+----------+-------------+------+------------+-------------+--------------+------------+---------+
|    11000.0|   MR.|      JON|    YANG|1966-04-08|            M|     M|     90000.0|

In [4]:
sales_df = spark.read.format('csv')\
          .option('header', 'True') \
          .option('inferSchema', 'True') \
          .load('file:///home/saif/LFS/datasets/datasets_pavan/sales/sales.csv')
sales_df.printSchema()
sales_df.show(5)

root
 |-- OrderDate: string (nullable = true)
 |-- StockDate: string (nullable = true)
 |-- OrderNumber: string (nullable = true)
 |-- ProductKey: integer (nullable = true)
 |-- CustomerKey: integer (nullable = true)
 |-- TerritoryKey: integer (nullable = true)
 |-- OrderLineItem: integer (nullable = true)
 |-- OrderQuantity: integer (nullable = true)

+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
| OrderDate| StockDate|OrderNumber|ProductKey|CustomerKey|TerritoryKey|OrderLineItem|OrderQuantity|
+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
|01-01-2015| 9/21/2001|    SO45080|       332|      14657|           1|            1|            1|
|01-01-2015|12-05-2001|    SO45079|       312|      29255|           4|            1|            1|
|01-01-2015|10/29/2001|    SO45082|       350|      11455|           9|            1|            1|
|01-01-2015|11/16/2001|    SO45081|       338

In [5]:
products_df = spark.read.format('csv')\
          .option('header', 'True') \
          .option('inferSchema', 'True') \
          .load('file:///home/saif/LFS/datasets/datasets_pavan/sales/products.csv')
products_df.printSchema()
products_df.show(5)

root
 |-- ProductKey: integer (nullable = true)
 |-- ProductSubcategoryKey: integer (nullable = true)
 |-- ProductSKU: string (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- ModelName: string (nullable = true)
 |-- ProductDescription: string (nullable = true)
 |-- ProductColor: string (nullable = true)
 |-- ProductSize: string (nullable = true)
 |-- ProductStyle: string (nullable = true)
 |-- ProductCost: double (nullable = true)
 |-- ProductPrice: double (nullable = true)

+----------+---------------------+----------+--------------------+-------------------+--------------------+------------+-----------+------------+-----------+------------+
|ProductKey|ProductSubcategoryKey|ProductSKU|         ProductName|          ModelName|  ProductDescription|ProductColor|ProductSize|ProductStyle|ProductCost|ProductPrice|
+----------+---------------------+----------+--------------------+-------------------+--------------------+------------+-----------+------------+-----------+----

In [None]:
# PROBLEM 3: List customers who ordered same product more than once in a month

In [29]:
sales_df.select(
        year(to_timestamp(col('OrderDate'),'MM-dd-yyyy')).alias('OrderYear'),
        month(to_timestamp(col('OrderDate'),'MM-dd-yyyy')).alias('OrderMonth'),
        col('CustomerKey'),
        col('ProductKey'),
        col('OrderNumber')
        )\
        .groupby(col('OrderYear'),col('OrderMonth'),col('CustomerKey'),col('ProductKey'))\
        .count()\
        .where(col('count')>1)

+---------+----------+-----------+----------+-----+
|OrderYear|OrderMonth|CustomerKey|ProductKey|count|
+---------+----------+-----------+----------+-----+
|     null|      null|      11330|       530|    2|
|     null|      null|      11091|       214|    2|
|     null|      null|      13179|       477|    2|
|     null|      null|      11215|       529|    2|
|     null|      null|      11300|       528|    3|
+---------+----------+-----------+----------+-----+
only showing top 5 rows



In [34]:
agg_result = sales_df.select(
        year(to_timestamp(col('OrderDate'),'MM-dd-yyyy')).alias('OrderYear'),
        month(to_timestamp(col('OrderDate'),'MM-dd-yyyy')).alias('OrderMonth'),
        col('CustomerKey'),
        col('ProductKey'),
        col('OrderNumber')
        )\
        .groupby(col('OrderYear'),col('OrderMonth'),col('CustomerKey'),col('ProductKey'))\
        .count()\
        .where(col('count')>1)

result_df = agg_result.join(customer_df,customer_df.CustomerKey == agg_result.CustomerKey,'left')\
          .join(products_df, agg_result.ProductKey == products_df.ProductKey, 'left')\
          .select(col('Prefix'),col('FirstName'),col('LastName'), col('ProductName'))

result_df.show(5)

+------+---------+--------+--------------------+
|Prefix|FirstName|LastName|         ProductName|
+------+---------+--------+--------------------+
|  MRS.|     SARA|   BAKER| Patch Kit/8 Patches|
|   MR.|FREDERICK|  PRASAD|        AWC Logo Cap|
|   MR.|    ETHAN|  BRYANT|    Road Bottle Cage|
|   MR.|    ETHAN|  BRYANT|Water Bottle - 30...|
|   MR.|    LOUIS|      XU| Patch Kit/8 Patches|
+------+---------+--------+--------------------+
only showing top 5 rows

