In [4]:
from pyspark.sql import SparkSession

# Create a SparkSession with a proper application name
spark = SparkSession.builder.appName("StudentDataFrame").getOrCreate()

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Sample student data
data_student = [
    ('mahesh', 'Male', 'Physics', 80, 'P', 90),
    ('mahesh', 'Male', 'Chemistry', 67, 'P', 90),
    ('mahesh', 'Male', 'Mathematics', 67, 'P', 90),
    ('Shruthi', 'Female', 'Physics', 60, 'P', 86),
    ('shruthi', 'female', 'Chemistry', 67, 'P', 86),
    ('shruthi', 'female', 'Mathematics', 52, 'P', 86),
    ('Prachi', 'female', 'Physics', 72, 'P', 72),
    ('Prachi', 'female', 'Chemistry', 80, 'P', 72),
    ('Prachi', 'female', 'Mathematics', 90, 'P', 72),
    ('Vaishanavi', 'female', 'Physics', 50, 'P', 70),
    ('Vaishanavi', 'female', 'Chemistry', 86, 'P', 70),
    ('Vaishanavi', 'female', 'Mathematics', 70, 'P', 70),
    ('Avanti', 'female', 'Physics', 77, 'P', 90),
    ('Avanti', 'female', 'Chemistry', 95, 'P', 90),
    ('Avanti', 'female', 'Mathematics', 78, 'P', 90)
]

# Define schema using StructField for explicit data types
schema1 = StructType([
    StructField("name", StringType(), True),  # String for name
    StructField("gender", StringType(), True),  # String for gender
    StructField("subject", StringType(), True),  # String for subject
    StructField("mark", IntegerType(), True),  # Integer for mark
    StructField("status", StringType(), True),  # String for status
    StructField("attendance", IntegerType(), True)  # Integer for attendance
])

# Create DataFrame with schema
studf = spark.createDataFrame(data_student, schema1)

# Show the DataFrame
studf.show()

TypeError: 'JavaPackage' object is not callable

In [8]:
# Create rank within each group of name

from pyspark.sql.window import Window
from pyspark.sql.functions import col,row_number

windowdept = Window.partitionBy('name').orderBy(col('mark').desc())

df2 = studf.withColumn('row',row_number().over(windowdept)).orderBy('name','row')
df2.show()

+----------+------+-----------+----+------+----------+---+
|      name|gender|    subject|mark|status|attendance|row|
+----------+------+-----------+----+------+----------+---+
|    Avanti|female|  Chemistry|  95|     P|        90|  1|
|    Avanti|female|Mathematics|  78|     P|        90|  2|
|    Avanti|female|    Physics|  77|     P|        90|  3|
|    Prachi|female|Mathematics|  90|     P|        72|  1|
|    Prachi|female|  chemistry|  80|     P|        72|  2|
|    Prachi|female|    Physics|  72|     P|        72|  3|
|   Shruthi|Female|    Physics|  60|     P|        86|  1|
|Vaishanavi|female|  chemistry|  86|     P|        70|  1|
|Vaishanavi|female|Mathematics|  70|     P|        70|  2|
|Vaishanavi|female|    Physics|  50|     P|        70|  3|
|    mahesh|  Male|    Physics|  80|     P|        90|  1|
|    mahesh|  Male|  chemistry|  67|     P|        90|  2|
|    mahesh|  Male|Mathematics|  67|     P|        90|  3|
|   shruthi|female|  Chemistry|  67|     P|        86|  

In [9]:
# get top N rows per group of name

df3 = df2.filter(col("row") <= 1)
df3.show()

+----------+------+-----------+----+------+----------+---+
|      name|gender|    subject|mark|status|attendance|row|
+----------+------+-----------+----+------+----------+---+
|    Avanti|female|  Chemistry|  95|     P|        90|  1|
|    Prachi|female|Mathematics|  90|     P|        72|  1|
|   Shruthi|Female|    Physics|  60|     P|        86|  1|
|Vaishanavi|female|  chemistry|  86|     P|        70|  1|
|    mahesh|  Male|    Physics|  80|     P|        90|  1|
|   shruthi|female|  Chemistry|  67|     P|        86|  1|
+----------+------+-----------+----+------+----------+---+



In [10]:
# create rank within each grop of subject

windowdept = Window.partitionBy('subject').orderBy(col('mark').desc())

df4 = studf.withColumn('row',row_number().over(windowdept)).orderBy('name','row')
df4.show()

+----------+------+-----------+----+------+----------+---+
|      name|gender|    subject|mark|status|attendance|row|
+----------+------+-----------+----+------+----------+---+
|    Avanti|female|  Chemistry|  95|     P|        90|  1|
|    Avanti|female|Mathematics|  78|     P|        90|  2|
|    Avanti|female|    Physics|  77|     P|        90|  2|
|    Prachi|female|Mathematics|  90|     P|        72|  1|
|    Prachi|female|  chemistry|  80|     P|        72|  2|
|    Prachi|female|    Physics|  72|     P|        72|  3|
|   Shruthi|Female|    Physics|  60|     P|        86|  4|
|Vaishanavi|female|  chemistry|  86|     P|        70|  1|
|Vaishanavi|female|Mathematics|  70|     P|        70|  3|
|Vaishanavi|female|    Physics|  50|     P|        70|  5|
|    mahesh|  Male|    Physics|  80|     P|        90|  1|
|    mahesh|  Male|  chemistry|  67|     P|        90|  3|
|    mahesh|  Male|Mathematics|  67|     P|        90|  4|
|   shruthi|female|  Chemistry|  67|     P|        86|  

In [12]:
# get top N rows per group of name

df5 = df4.filter(col("row") <= 1)
df5.show()

+----------+------+-----------+----+------+----------+---+
|      name|gender|    subject|mark|status|attendance|row|
+----------+------+-----------+----+------+----------+---+
|    Avanti|female|  Chemistry|  95|     P|        90|  1|
|    Prachi|female|Mathematics|  90|     P|        72|  1|
|Vaishanavi|female|  chemistry|  86|     P|        70|  1|
|    mahesh|  Male|    Physics|  80|     P|        90|  1|
+----------+------+-----------+----+------+----------+---+



In [14]:
# create rank within each grop of subject

windowdept = Window.partitionBy('subject').orderBy(col('mark'))

df6 = studf.withColumn('row',row_number().over(windowdept)).orderBy('name','row')
df6.show()

+----------+------+-----------+----+------+----------+---+
|      name|gender|    subject|mark|status|attendance|row|
+----------+------+-----------+----+------+----------+---+
|    Avanti|female|  Chemistry|  95|     P|        90|  2|
|    Avanti|female|Mathematics|  78|     P|        90|  4|
|    Avanti|female|    Physics|  77|     P|        90|  4|
|    Prachi|female|  chemistry|  80|     P|        72|  2|
|    Prachi|female|    Physics|  72|     P|        72|  3|
|    Prachi|female|Mathematics|  90|     P|        72|  5|
|   Shruthi|Female|    Physics|  60|     P|        86|  2|
|Vaishanavi|female|    Physics|  50|     P|        70|  1|
|Vaishanavi|female|Mathematics|  70|     P|        70|  3|
|Vaishanavi|female|  chemistry|  86|     P|        70|  3|
|    mahesh|  Male|  chemistry|  67|     P|        90|  1|
|    mahesh|  Male|Mathematics|  67|     P|        90|  2|
|    mahesh|  Male|    Physics|  80|     P|        90|  5|
|   shruthi|female|  Chemistry|  67|     P|        86|  

In [15]:
# get top N rows per group of name

df7 = df6.filter(col("row") <= 1)
df5.show()

+----------+------+-----------+----+------+----------+---+
|      name|gender|    subject|mark|status|attendance|row|
+----------+------+-----------+----+------+----------+---+
|    Avanti|female|  Chemistry|  95|     P|        90|  1|
|    Prachi|female|Mathematics|  90|     P|        72|  1|
|Vaishanavi|female|  chemistry|  86|     P|        70|  1|
|    mahesh|  Male|    Physics|  80|     P|        90|  1|
+----------+------+-----------+----+------+----------+---+

