Question 2:
# Row and column counts, read from the lengths of the DataFrame's two axes
num_rows = len(df.index)
num_cols = len(df.columns)
print(f"The dataset has {num_rows} rows and {num_cols} columns.")





Question 3:
# To display the last 75 rows.
# pandas abbreviates any printed frame longer than `display.max_rows`
# (60 by default), so print(df.tail(75)) would show only the first and last
# few rows with "..." in between. to_string() renders every row instead.
print("Last 75 rows of the dataset:")
print(df.tail(75).to_string())


Question 4:
 Column Type	Best Method to Handle Missing Data	Why This Method Works Well
Numerical columns (e.g., Aboard, Fatalities)	Fill with the Mean or Median	If the values are roughly symmetric (for example, approximately normally distributed), using the mean makes sense. If the data has extreme values or is skewed, use the median, since it is far less sensitive to outliers. This keeps the number of records the same and leaves the column’s typical value roughly unchanged.

Categorical columns (e.g., Location, Operator)	Fill with the Mode (most frequent value)	For text-based or categorical data, it’s usually safe to assume that missing values belong to the most common category. This approach is simple, preserves all rows, and works well for nominal data (where order doesn’t matter).

Categorical or Text columns with few missing values	Replace with a constant (like “Unknown” or “Missing”)	Sometimes it’s best to mark missing entries explicitly. Using labels like “Unknown” lets you keep all data and signals that information was missing — instead of guessing what it might have been.

Any column with very high missingness (over 70%)	Drop the column entirely	When most values are missing, the column doesn’t provide meaningful insight. Trying to fill in so many gaps could distort the data and make the analysis unreliable. Removing it avoids introducing bias.


Question 5:
# Copy the four selected columns into a new, independent DataFrame
selected_columns = ['Date', 'Location', 'Aboard', 'Fatalities']
fatality_locations = df.loc[:, selected_columns].copy()

print("The new 'fatality_locations' DataFrame (first 5 rows):")
print(fatality_locations.head(5))


Question 6:

# Coerce the Fatalities column to numbers; entries that cannot be parsed become NaN
fatality_counts = pd.to_numeric(fatality_locations['Fatalities'], errors='coerce')
fatality_locations['Fatalities'] = fatality_counts

# Index label of the largest fatality count, then the full row stored at that label
peak_label = fatality_locations['Fatalities'].idxmax()
highest_fatality_record = fatality_locations.loc[peak_label]

# Date recorded on that row
date_of_highest_fatalities = highest_fatality_record['Date']

print(f"The date of the highest number of recorded fatalities is: {date_of_highest_fatalities}")
print(f"Fatalities recorded: {highest_fatality_record['Fatalities']}")


Question 7:

# Coerce the Aboard column to numbers; entries that cannot be parsed become NaN
aboard_counts = pd.to_numeric(fatality_locations['Aboard'], errors='coerce')
fatality_locations['Aboard'] = aboard_counts

# Rows where the number aboard is at least the number of fatalities
aboard_covers_fatalities = fatality_locations['Aboard'].ge(fatality_locations['Fatalities'])
comparison_result = fatality_locations[aboard_covers_fatalities]

# Rows whose fatality count is exactly zero
zero_fatality_mask = fatality_locations['Fatalities'].eq(0)
no_fatalities_crashes = fatality_locations[zero_fatality_mask]

# How many zero-fatality rows there are
count_no_fatalities = len(no_fatalities_crashes)

print(f"Total number of recorded crashes: {len(fatality_locations)}")
print(f"Number of crashes where there were no fatalities: {count_no_fatalities}")

# Pick the closing sentence from the count, then print it once
verdict = (
    "\nYes, there are recorded crashes with no fatalities."
    if count_no_fatalities > 0
    else "\nNo, there are no recorded crashes with no fatalities."
)
print(verdict)


Question 8:
# The location format is 'Region, U.S. State/Country'.
# Split on the LAST ', ' (rsplit with n=1): when a location contains more
# than one comma, 'State/Country' then holds only the final component and
# everything before it stays in 'Region'. Splitting on the first comma would
# leave values such as 'State, Country' in 'State/Country' and scatter one
# country across several groups in any later group-by.
location_parts = (
    fatality_locations['Location']
    .str.rsplit(', ', n=1, expand=True)
    # Guarantee both columns exist even if no location contains ', ';
    # rows without a comma get a missing 'State/Country'.
    .reindex(columns=[0, 1])
)
fatality_locations['Region'] = location_parts[0]
fatality_locations['State/Country'] = location_parts[1]

print("DataFrame after splitting Location column (first 5 rows):")
print(fatality_locations[['Location', 'Region', 'State/Country']].head())


Question 9:
# Rank every row from the most to the fewest fatalities
ordered_df = fatality_locations.sort_values('Fatalities', ascending=False)

# Keep the 100 highest-ranked rows
top_100_fatalities = ordered_df.iloc[:100]

print("First 5 rows of the top 100 crashes by fatalities:")
print(top_100_fatalities.head(5))


Question 10:
import matplotlib.pyplot as plt
import seaborn as sns  # While Seaborn is useful, Matplotlib is better for pie charts

# 1. Aggregate fatalities by State/Country
country_fatalities = fatality_locations.groupby('State/Country')['Fatalities'].sum().reset_index()

# 2. Get the top 25
top_25_countries = country_fatalities.sort_values(by='Fatalities', ascending=False).head(25)

# 3. Fold everything outside the top 25 into a single 'Other' slice
total_fatalities = country_fatalities['Fatalities'].sum()
top_25_sum = top_25_countries['Fatalities'].sum()
other_fatalities = total_fatalities - top_25_sum

# Create a final dataframe for charting.
# The top-25 rows still carry their pre-sort index labels, so the label
# len(chart_data) may already belong to one of them; assigning to it would
# overwrite that row instead of appending. Renumbering the index 0..k-1
# first makes len(chart_data) a label that is guaranteed to be unused.
chart_data = top_25_countries.reset_index(drop=True)
# Append 'Other' only when it is non-empty, to avoid drawing a 0% wedge
if other_fatalities > 0:
    chart_data.loc[len(chart_data)] = ['Other', other_fatalities]

# 4. Generate the pie chart
plt.figure(figsize=(12, 12))
plt.pie(
    chart_data['Fatalities'],
    labels=chart_data['State/Country'],
    autopct='%1.1f%%',  # Format percentage text
    startangle=90,      # Start the first slice at the top
    wedgeprops={'edgecolor': 'black'}  # Add black borders for clarity
)

plt.title('Distribution of Fatalities by Country/U.S. State (Top 25 + Other)', fontsize=16)
plt.axis('equal')  # Ensures the pie is circular
plt.show()
