In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, max, when, expr

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Aug 22 Task") \
    .getOrCreate()


In [14]:
# Load CSV file
df = spark.read.csv('complete.csv', header=True, inferSchema=True)


In [15]:
df = df.withColumn('Name of State / UT', lower(col('Name of State / UT')))


day_with_max_cases = df.groupBy('Date').agg(max('Total Confirmed cases').alias('max_cases'))
day_with_max_cases = day_with_max_cases.orderBy(col('max_cases').desc()).first()
max_day = day_with_max_cases['Date']


from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Assuming 'total_cases' column exists and represents the number of covid cases
window = Window.orderBy(col('Total Confirmed cases').desc())
ranked_states = df.withColumn('rank', row_number().over(window))
second_largest_state = ranked_states.filter(col('rank') == 2).select('Name of State / UT').first()


least_deaths_ut = df.filter(col('Name of State / UT').isin(['Delhi', 'Puducherry', 'Chandigarh', 'Ladakh', 'Jammu & Kashmir'])) \
                   .groupBy('Name of State / UT') \
                   .agg(sum('Death').alias('total_deaths')) \
                   .orderBy('total_deaths') \
                   .first()


from pyspark.sql.functions import col

ratio_df = df.withColumn('death_to_confirmed_ratio', col('total_deaths') / col('Total Confirmed cases'))
state_with_lowest_ratio = ratio_df.orderBy('death_to_confirmed_ratio').first()


from pyspark.sql.functions import month, date_format

# Assuming there's a 'new_recovered' column
df = df.withColumn('month', month('Date'))
monthly_recovered = df.groupBy('month').agg(sum('new_recovered').alias('total_recovered'))
month_with_max_recovered = monthly_recovered.orderBy(col('total_recovered').desc()).first()

# Convert month number to name
month_mapping = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
    7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
month_name = month_mapping.get(month_with_max_recovered['month'], 'Unknown')


print(f"Day with maximum Covid cases: {max_day}")
print(f"State with second-largest number of Covid cases: {second_largest_state['Name of State / UT']}")
print(f"Union Territory with the least number of deaths: {least_deaths_ut['Name of State / UT']}")
print(f"State with the lowest Death to Total Confirmed cases ratio: {state_with_lowest_ratio['Name of State / UT']}")
print(f"Month with the most new recovered cases: {month_name}")


TypeError: unsupported operand type(s) for +: 'int' and 'str'