In [None]:
!pip install pandas
!pip install pyspark
!pip install pyarrow

In [None]:
from pyspark.sql import SparkSession
 
# Create (or reuse) the SparkSession for this notebook; the application
# is named 'pandas to spark'.
spark = (
    SparkSession.builder
    .appName("pandas to spark")
    .getOrCreate()
)

# Switch on Arrow for the pandas <-> Spark DataFrame conversions done below.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [4]:
import pandas as pd

# Sample transaction records: one inner list per row, in column order.
data = [
    [121, 'US', 'approved', 1000, '2018-12-18'],
    [122, 'US', 'declined', 2000, '2018-12-19'],
    [123, 'US', 'approved', 2000, '2019-01-01'],
    [124, 'DE', 'approved', 2000, '2019-01-07'],
]

# Column names and the dtype each column is cast to.
column_names = ['id', 'country', 'state', 'amount', 'trans_date']
column_dtypes = {
    'id': 'Int64',
    'country': 'object',
    'state': 'object',
    'amount': 'Int64',
    'trans_date': 'datetime64[ns]',
}

# Build the pandas frame, then apply the explicit dtypes in one cast.
transactions = pd.DataFrame(data, columns=column_names).astype(column_dtypes)

In [5]:
# Convert the pandas frame into a Spark DataFrame, keeping the same name so
# later cells keep working.
# The isinstance guard makes this cell safe to re-run: once `transactions` has
# been rebound to a Spark DataFrame, passing it to createDataFrame again would
# raise, so the conversion is only performed while it is still a pandas frame.
if isinstance(transactions, pd.DataFrame):
    transactions = spark.createDataFrame(transactions)

# Print the rows of the Spark DataFrame as a text table.
transactions.show()

+---+-------+--------+------+-------------------+
| id|country|   state|amount|         trans_date|
+---+-------+--------+------+-------------------+
|121|     US|approved|  1000|2018-12-18 00:00:00|
|122|     US|declined|  2000|2018-12-19 00:00:00|
|123|     US|approved|  2000|2019-01-01 00:00:00|
|124|     DE|approved|  2000|2019-01-07 00:00:00|
+---+-------+--------+------+-------------------+



In [14]:
from pyspark.sql.functions import col, when, date_format, count, sum

# Per-month, per-country summary of transactions.
# NOTE: `sum` below is pyspark.sql.functions.sum (imported at the top of this
# cell), which shadows Python's builtin `sum` from here on.

# 1 for rows whose state is 'approved', 0 for everything else.
approved_flag = when(col('state') == 'approved', 1).otherwise(0)

monthly_summary = (
    transactions
    .withColumn('approved_state', approved_flag)
    # Multiplying by the 0/1 flag keeps the amount only for approved rows.
    .withColumn('approved_amount', col('approved_state') * col('amount'))
    # Month key formatted as "yyyy-MM", used for grouping.
    .withColumn('month', date_format(col('trans_date'), "yyyy-MM"))
    .groupBy('month', 'country')
    .agg(
        sum('approved_state').alias('approved_count'),
        sum('approved_amount').alias('approved_total_amount'),
        count('amount').alias('trans_count'),
        sum('amount').alias('trans_total_amount'),
    )
)

monthly_summary.show()

+-------+-------+--------------+---------------------+-----------+------------------+
|  month|country|approved_count|approved_total_amount|trans_count|trans_total_amount|
+-------+-------+--------------+---------------------+-----------+------------------+
|2018-12|     US|             1|                 1000|          2|              3000|
|2019-01|     US|             1|                 2000|          1|              2000|
|2019-01|     DE|             1|                 2000|          1|              2000|
+-------+-------+--------------+---------------------+-----------+------------------+

