In [0]:
# Number of calls and total call duration between each pair of persons (direction of the call ignored)

In [0]:
from pyspark.sql.functions import col,sum,when,count

# Sample call log: one (from_id, to_id, duration) tuple per call.
# The same two ids can appear in either direction (e.g. 10 -> 20 and 20 -> 10).
data = [
    (10, 20, 58),
    (20, 10, 12),
    (10, 30, 20),
    (30, 40, 100),
    (30, 40, 200),
    (30, 40, 200),
    (40, 30, 500),
]

# `spark` is not defined in this notebook; presumably it is the SparkSession
# injected by the notebook runtime.
df = spark.createDataFrame(data, schema=["from_id", "to_id", "duration"])

df.display()

from_id,to_id,duration
10,20,58
20,10,12
10,30,20
30,40,100
30,40,200
30,40,200
40,30,500


In [0]:
# Normalise every call to an unordered pair: person1 receives the smaller id and
# person2 the larger one, so 10 -> 20 and 20 -> 10 map to the same (person1, person2).

# Shared Column condition; it must be built with col(...) so Spark evaluates it per row.
caller_is_smaller = col("from_id") < col("to_id")

swap_df = df.select(
    when(caller_is_smaller, col("from_id")).otherwise(col("to_id")).alias("person1"),
    when(caller_is_smaller, col("to_id")).otherwise(col("from_id")).alias("person2"),
    col("duration"),
)

swap_df.display()

person1,person2,duration
10,20,58
10,20,12
10,30,20
30,40,100
30,40,200
30,40,200
30,40,500


In [0]:
# Notes: common mistakes when writing the id-swap expression with when()

"""
1. The expression when(col("from_id") < col("to_id"), "from_id") is problematic because the value "from_id" is treated as a literal string rather than as a reference to the column from_id; wrap it as col("from_id").
"""

# In PySpark, the condition passed to when() must be a Column expression, not a raw Python boolean.
# For example, when("from_id" < "to_id", col("to_id")) compares two Python strings, so when() receives a plain bool instead of a Column.
"""
2. When using when(), make sure every condition is built from column expressions rather than bare strings.
Example: col("column_name") or df.column_name.
"""


Out[14]: '\n2. When using when(), make sure every condition is built from column expressions rather than bare strings.\nExample: col("column_name") or df.column_name.\n'

In [0]:

# Aggregate per unordered pair: how many calls were made and the summed duration.
#
# count("*") counts rows, matching count(1) / count(*) in the SQL cells of this
# notebook. count("person1") would only count non-null person1 values, so a pair
# whose smaller id is NULL would report call_count = 0.
# `sum` here is pyspark.sql.functions.sum (imported at the top), not Python's builtin.
# groupBy(...).agg(...) already yields person1, person2, call_count, total_duration
# in that order, so no trailing select is needed.
result_df = (
    swap_df.groupBy("person1", "person2")
    .agg(
        count("*").alias("call_count"),
        sum("duration").alias("total_duration"),
    )
)

result_df.display()

person1,person2,call_count,total_duration
10,20,2,70
10,30,1,20
30,40,4,1000


#using SPARK SQL

In [0]:
# Register `df` as the temporary view "calls" so the %sql cells can query it by name.

df.createOrReplaceTempView("calls")

In [0]:
%sql
-- Preview the raw call log exposed through the "calls" temp view.
SELECT from_id, to_id, duration
FROM calls

from_id,to_id,duration
10,20,58
20,10,12
10,30,20
30,40,100
30,40,200
30,40,200
40,30,500


In [0]:
%sql
-- Approach 1: normalise each call to (smaller id, larger id) inside a CTE,
-- then count calls and total the duration per pair.
WITH paired_calls AS (
  SELECT
    CASE WHEN from_id < to_id THEN from_id ELSE to_id END AS person1,
    CASE WHEN from_id < to_id THEN to_id ELSE from_id END AS person2,
    duration
  FROM calls
)
SELECT
  person1,
  person2,
  COUNT(1) AS call_count,
  SUM(duration) AS total_duration
FROM paired_calls
GROUP BY person1, person2

person1,person2,call_count,total_duration
10,20,2,70
10,30,1,20
30,40,4,1000


##SQL approach 2:

In [0]:
%sql
-- Approach 2: LEAST/GREATEST order the two ids directly, so no CTE is needed.
-- The grouping keys are spelled out instead of using ordinal positions.
SELECT
  LEAST(from_id, to_id) AS person1,
  GREATEST(from_id, to_id) AS person2,
  COUNT(*) AS call_count,
  SUM(duration) AS total_duration
FROM calls
GROUP BY LEAST(from_id, to_id), GREATEST(from_id, to_id)

person1,person2,call_count,total_duration
10,20,2,70
10,30,1,20
30,40,4,1000
