In [None]:
!pip install pandas
!pip install pyspark
!pip install pyarrow

In [None]:
from pyspark.sql import SparkSession
 
# Create (or reuse) a SparkSession whose application name is
# "pandas to spark".
spark = (
    SparkSession.builder
    .appName("pandas to spark")
    .getOrCreate()
)

# Enable Arrow for pandas <-> Spark DataFrame conversions.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
import pandas as pd

# Project assignments: each row pairs a project_id with an employee_id.
data = [
    [1, 1],
    [1, 2],
    [1, 3],
    [2, 1],
    [2, 4],
]
project = pd.DataFrame(
    data,
    columns=['project_id', 'employee_id'],
).astype({
    'project_id': 'Int64',
    'employee_id': 'Int64',
})

# Employees: id, name and years of experience. `data` is rebound here,
# so after this cell it holds the employee rows.
data = [
    [1, 'Khaled', 3],
    [2, 'Ali', 2],
    [3, 'John', 1],
    [4, 'Doe', 2],
]
employee = pd.DataFrame(
    data,
    columns=['employee_id', 'name', 'experience_years'],
).astype({
    'employee_id': 'Int64',
    'name': 'object',
    'experience_years': 'Int64',
})

In [4]:
# Convert the pandas frames into Spark DataFrames and print them.
#
# `project` and `employee` are rebound to the Spark versions, so the
# conversion is guarded: when this cell is re-run the names already hold
# Spark DataFrames, which spark.createDataFrame does not accept as input.
if isinstance(project, pd.DataFrame):
    project = spark.createDataFrame(project)
project.show()

if isinstance(employee, pd.DataFrame):
    employee = spark.createDataFrame(employee)
employee.show()

+----------+-----------+
|project_id|employee_id|
+----------+-----------+
|         1|          1|
|         1|          2|
|         1|          3|
|         2|          1|
|         2|          4|
+----------+-----------+

+-----------+------+----------------+
|employee_id|  name|experience_years|
+-----------+------+----------------+
|          1|Khaled|               3|
|          2|   Ali|               2|
|          3|  John|               1|
|          4|   Doe|               2|
+-----------+------+----------------+



In [7]:
# NOTE: importing `sum` and `round` by name shadows the Python builtins
# of the same name for the rest of the notebook.
from pyspark.sql.functions import count, sum, col, round

# Per project: total experience years divided by the number of employee
# rows, rounded to two decimals; only project_id and the average are shown.
(
    project
    .join(employee, 'employee_id', 'inner')
    .groupby('project_id')
    .agg(
        sum('experience_years').alias('total_years'),
        count('employee_id').alias('total_employees'),
    )
    .withColumn(
        'average_years',
        round(col('total_years') / col('total_employees'), 2),
    )
    .select(['project_id', 'average_years'])
    .show()
)


+----------+-------------+
|project_id|average_years|
+----------+-------------+
|         1|          2.0|
|         2|          2.5|
+----------+-------------+

