In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Load the dataset.
# The CSV location can be overridden with the FORESTFIRES_CSV environment
# variable, so the notebook can be re-run on a machine that does not have the
# original directory layout. When the variable is unset (or empty) the
# original absolute path is used, so existing behaviour is unchanged.
forestfires_csv = os.environ.get('FORESTFIRES_CSV') or (
    r'E:\DSBDAL\DSBDALExam DataSets\DSBDALExam DataSets\forestfires\forestfires.csv'
)
forestfires_df = pd.read_csv(forestfires_csv)

# Display first few rows
forestfires_df.head()




Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
# a) Create subsets based on 'area' (amount of region affected)

# Define custom function to classify area
def classify_area(area, partial_max=10):
    """Label a single ``area`` value with one of three impact classes.

    Parameters
    ----------
    area : number or missing
        One value taken from the ``area`` column.
    partial_max : number, default 10
        Largest non-zero value that is still labelled 'PartiallyAffected'.
        The default reproduces the original fixed threshold.

    Returns
    -------
    str or None
        'NotAffected' when ``area`` equals 0, 'PartiallyAffected' when it is
        at most ``partial_max``, 'MostlyAffected' otherwise, and ``None``
        when ``area`` is missing (None / NaN / pd.NA).

    Notes
    -----
    Negative inputs are not validated: with the default threshold they
    satisfy ``area <= partial_max`` and are labelled 'PartiallyAffected'.
    """
    # A missing value compares False against every threshold, so without this
    # guard it would fall through to 'MostlyAffected' (and None would raise a
    # TypeError on the <= comparison).
    if pd.isna(area):
        return None
    if area == 0:
        return 'NotAffected'
    if area <= partial_max:
        return 'PartiallyAffected'
    return 'MostlyAffected'

# Tag every record with its impact class (adds the 'AreaClass' column to
# forestfires_df itself, so later cells see the extra column)
forestfires_df['AreaClass'] = forestfires_df['area'].apply(classify_area)

# Split the frame into one subset per class label
not_affected_df = forestfires_df.loc[forestfires_df['AreaClass'].eq('NotAffected')]
partially_affected_df = forestfires_df.loc[forestfires_df['AreaClass'].eq('PartiallyAffected')]
mostly_affected_df = forestfires_df.loc[forestfires_df['AreaClass'].eq('MostlyAffected')]

# Show how many records fall into each class
forestfires_df['AreaClass'].value_counts()


AreaClass
NotAffected          247
PartiallyAffected    175
MostlyAffected        95
Name: count, dtype: int64

In [4]:
# b) Merge two subsets (NotAffected and PartiallyAffected)
# The two frames are stacked row-wise; ignore_index=True discards their
# original row labels and renumbers the combined rows from 0.
merged_df = pd.concat(
    objs=(not_affected_df, partially_affected_df),
    axis=0,
    ignore_index=True,
)

# Report the (rows, columns) shape of the merged frame
merged_df.shape


(422, 14)

In [5]:
# c) Sort data by temp, then wind, then area (all three keys descending)
# A single ascending=False applies to every key in the list.
sorted_df = forestfires_df.sort_values(['temp', 'wind', 'area'], ascending=False)

# Preview the five leading rows, restricted to the sort keys
sorted_df.loc[:, ['temp', 'wind', 'area']].head(5)


Unnamed: 0,temp,wind,area
498,33.3,2.7,40.54
484,33.1,4.0,26.43
496,32.6,3.1,2.77
491,32.4,4.5,0.0
492,32.4,2.2,0.0


In [6]:
# d) Transpose data
# Only the first five records are transposed so the result stays readable:
# every original column becomes a row and every record becomes a column.
transposed_df = forestfires_df.iloc[:5].T

# Display transposed data
transposed_df


Unnamed: 0,0,1,2,3,4
X,7,7,7,8,8
Y,5,4,4,6,6
month,mar,oct,oct,mar,mar
day,fri,tue,sat,fri,sun
FFMC,86.2,90.6,90.6,91.7,89.3
DMC,26.2,35.4,43.7,33.3,51.3
DC,94.3,669.1,686.9,77.5,102.2
ISI,5.1,6.7,6.7,9.0,9.6
temp,8.2,18.0,14.6,8.3,11.4
RH,51,33,33,97,99


In [7]:
# e) Melt data into long format
# 'month' and 'day' are kept as identifier columns; each of the four listed
# measurement columns is unpivoted into a (variable, value) pair per record.
melted_df = forestfires_df.melt(
    id_vars=['month', 'day'],
    value_vars=['temp', 'RH', 'wind', 'rain'],
)

# Display first few rows of melted data
melted_df.head(5)


Unnamed: 0,month,day,variable,value
0,mar,fri,temp,8.2
1,oct,tue,temp,18.0
2,oct,sat,temp,14.6
3,mar,fri,temp,8.3
4,mar,sun,temp,11.4


In [8]:
# f) Cast (pivot) melted data to wide format
# Several melted rows can share the same (month, day) pair, so pivot_table
# aggregates them; the mean (pandas' default aggfunc) is spelled out here.
# The result therefore holds one row of per-variable means for each
# (month, day) pair rather than a row-for-row copy of the original records.
casted_df = pd.pivot_table(
    melted_df,
    values='value',
    index=['month', 'day'],
    columns='variable',
    aggfunc='mean',
).reset_index()

# Display casted data
casted_df.head(5)


variable,month,day,RH,rain,temp,wind
0,apr,fri,20.0,0.0,16.7,3.1
1,apr,mon,64.0,0.0,10.9,3.1
2,apr,sat,44.0,0.0,9.3,4.5
3,apr,sun,45.0,0.0,14.9,5.666667
4,apr,thu,54.0,0.0,5.8,5.8
