In [None]:
!pip install pandas
!pip install pyspark
!pip install pyarrow

In [None]:
from pyspark.sql import SparkSession
 
# Create the SparkSession (or reuse an already-running one) under the
# application name "pandas to spark".
spark = (
    SparkSession.builder
    .appName("pandas to spark")
    .getOrCreate()
)

# Enable PySpark's Arrow execution setting on this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
import pandas as pd

# Sample rows, one per (query, result) pair:
# [query_name, result, position, rating]
data = [
    ['Dog', 'Golden Retriever', 1, 5],
    ['Dog', 'German Shepherd', 2, 5],
    ['Dog', 'Mule', 200, 1],
    ['Cat', 'Shirazi', 5, 2],
    ['Cat', 'Siamese', 3, 3],
    ['Cat', 'Sphynx', 7, 4],
]

# Build the pandas frame and pin the column dtypes explicitly
# (plain objects for the text columns, nullable Int64 for the numeric ones).
queries = pd.DataFrame(
    data,
    columns=['query_name', 'result', 'position', 'rating'],
).astype(
    {
        'query_name': 'object',
        'result': 'object',
        'position': 'Int64',
        'rating': 'Int64',
    }
)

In [5]:
# Convert the pandas frame into a Spark DataFrame, keeping the name `queries`.
# The isinstance guard makes this cell safe to re-run: once `queries` has been
# rebound to a Spark DataFrame, the conversion is skipped instead of handing a
# Spark DataFrame back to createDataFrame.
if isinstance(queries, pd.DataFrame):
    queries = spark.createDataFrame(queries)

# Print the Spark DataFrame as a text table.
queries.show()

+----------+----------------+--------+------+
|query_name|          result|position|rating|
+----------+----------------+--------+------+
|       Dog|Golden Retriever|       1|     5|
|       Dog| German Shepherd|       2|     5|
|       Dog|            Mule|     200|     1|
|       Cat|         Shirazi|       5|     2|
|       Cat|         Siamese|       3|     3|
|       Cat|          Sphynx|       7|     4|
+----------+----------------+--------+------+



In [18]:
# Spark's `sum` aggregate must be imported explicitly: without it the name
# resolves to Python's builtin sum(), which fails on a column name on a fresh
# kernel. It is aliased so the builtin stays usable.
# Note: `round` here is Spark's column function and shadows the builtin round().
from pyspark.sql.functions import col, count, round, sum as spark_sum, when

# Per query_name, report:
#   quality               - sum of rating / position divided by the number of
#                           results, rounded to 2 decimals
#   poor_query_percentage - percentage of rows whose rating is below 3,
#                           rounded to 2 decimals
(
    queries
    .withColumn('ratio', col('rating') / col('position'))
    # Flag each row: 1 when the rating is below 3, otherwise 0.
    .withColumn('poor_rating', when(col('rating') < 3, 1).otherwise(0))
    .groupby('query_name')
    .agg(
        spark_sum('poor_rating').alias('poor_rating'),
        count('result').alias('total'),
        spark_sum('ratio').alias('ratio'),
    )
    .withColumn('quality', round(col('ratio') / col('total'), 2))
    .withColumn('poor_query_percentage', round(col('poor_rating') / col('total') * 100, 2))
    .select(['query_name', 'quality', 'poor_query_percentage'])
    .show()
)

+----------+-------+---------------------+
|query_name|quality|poor_query_percentage|
+----------+-------+---------------------+
|       Dog|    2.5|                33.33|
|       Cat|   0.66|                33.33|
+----------+-------+---------------------+

