In [1]:
import pandas as pd
import xlrd
import cleaning_functions as clean

In [2]:
# Download the GSAF5 workbook from sharkattackfile.net into a raw DataFrame.
# NOTE(review): this hits the network on every run and the remote file can
# change between runs; consider caching a local copy for reproducibility.
df = pd.read_excel(io='https://www.sharkattackfile.net/spreadsheets/GSAF5.xls')

In [3]:
# Hand the raw frame to clean.trim_dataset. Per the original author's note the
# helper drops unneeded columns, keeps only the Florida records and normalises
# the column names (its implementation lives in cleaning_functions).
df_florida = df.pipe(clean.trim_dataset)
# Preview the first rows of the trimmed frame.
df_florida.head()

Unnamed: 0,date,type,location,activity,sex,age,injury,time,species
0,2024-10-11 00:00:00,Unprovoked,Brevard County Orlando,Surfing,M,16.0,Bite to left arm,?,Bull shark 6ft
1,2024-07-08 00:00:00,Unprovoked,Ponce de Leon Inlet Volusia County,Diving into Water,M,14.0,Lower left leg injury,11hr15,4-5ft Blacktip shark
2,2024-07-05 00:00:00,Unprovoked,New Smyrna Beach,Wading,M,26.0,Minor injury to left foot,16hr15,Not specified
3,2024-07-04 00:00:00,Unprovoked,New Smyrna Beach Beach Access 14 Volusia County,Swimming,M,,Bite to left leg,1100hr,6ft shark
4,2024-06-07 00:00:00,Unprovoked,Rosemary Beach Walton County,Swimming,F,45.0,Significant injuries to pelvic and abdominal r...,13h15,Not specified Bull shark highly suspect


In [4]:
# Activity column, cleaned in one pass:
#   1. clean.standardize_string  - text normalisation (string cast / whitespace
#      strip, per the original author's note)
#   2. clean.standardize_activity - applied value by value to map each entry
#      onto an activity category
df_florida['activity'] = (
    df_florida['activity']
    .pipe(clean.standardize_string)
    .apply(clean.standardize_activity)
)
# Show the first few classified activities.
df_florida['activity'].head()

0     surfing
1      diving
2      wading
3    swimming
4    swimming
Name: activity, dtype: object

In [12]:
# Attack-type column, cleaned in one pass:
#   1. clean.standardize_string - text normalisation (string cast / whitespace
#      strip, per the original author's note)
#   2. clean.standardize_type   - applied value by value to map each entry
#      onto an attack-type category
df_florida['type'] = (
    df_florida['type']
    .pipe(clean.standardize_string)
    .apply(clean.standardize_type)
)
# Show the first few classified attack types.
df_florida['type'].head()

0    unprovoked
1    unprovoked
2    unprovoked
3    unprovoked
4    unprovoked
Name: type, dtype: object

In [16]:
# Normalise the free-text location first; the result is written back to the
# frame because the county helper below receives the whole DataFrame.
df_florida['location'] = df_florida['location'].pipe(clean.standardize_string)
# clean.get_county_from_location returns the frame that is kept from here on;
# the preview below shows two extra columns, precise_location and county.
df_florida = df_florida.pipe(clean.get_county_from_location)
df_florida.head()

Unnamed: 0,date,type,location,activity,sex,age,injury,time,species,precise_location,county
0,2024-10-11 00:00:00,unprovoked,brevard county orlando,surfing,M,16.0,Bite to left arm,?,Bull shark 6ft,other,brevard county
1,2024-07-08 00:00:00,unprovoked,ponce de leon inlet volusia county,diving,M,14.0,Lower left leg injury,11hr15,4-5ft Blacktip shark,other,volusia county
2,2024-07-05 00:00:00,unprovoked,new smyrna beach,wading,M,26.0,Minor injury to left foot,16hr15,Not specified,new smyrna beach,volusia county
3,2024-07-04 00:00:00,unprovoked,new smyrna beach beach access 14 volusia county,swimming,M,,Bite to left leg,1100hr,6ft shark,new smyrna beach,volusia county
4,2024-06-07 00:00:00,unprovoked,rosemary beach walton county,swimming,F,45.0,Significant injuries to pelvic and abdominal r...,13h15,Not specified Bull shark highly suspect,other,other


In [30]:
# Sex: run every value through clean.clear_sex.
df_florida['sex'] = df_florida['sex'].apply(clean.clear_sex)

# Age: forward-fill missing values, then strip stray whitespace from the text
# form and cast to float.
# NOTE(review): ffill copies the age of the previous record into each gap,
# i.e. it imputes one victim's age from a different incident - confirm this is
# intended before using age in any statistics.
df_florida['age'] = (
    df_florida['age']
    .ffill()
    .astype(str)
    .str.strip()
    .astype(float)
)

# Derive an age_group column by passing each age through clean.categorize_age.
df_florida['age_group'] = df_florida['age'].apply(clean.categorize_age)

# NOTE(review): the saved output of this cell does not list an age_group
# column - it looks stale; re-run the notebook top to bottom to refresh it.
df_florida.head()

Unnamed: 0,date,type,location,activity,sex,age,injury,time,species,precise_location,county
0,2024-10-11 00:00:00,unprovoked,brevard county orlando,surfing,M,16.0,Bite to left arm,?,Bull shark 6ft,other,brevard county
1,2024-07-08 00:00:00,unprovoked,ponce de leon inlet volusia county,diving,M,14.0,Lower left leg injury,11hr15,4-5ft Blacktip shark,other,volusia county
2,2024-07-05 00:00:00,unprovoked,new smyrna beach,wading,M,26.0,Minor injury to left foot,16hr15,Not specified,new smyrna beach,volusia county
3,2024-07-04 00:00:00,unprovoked,new smyrna beach beach access 14 volusia county,swimming,M,26.0,Bite to left leg,1100hr,6ft shark,new smyrna beach,volusia county
4,2024-06-07 00:00:00,unprovoked,rosemary beach walton county,swimming,F,45.0,Significant injuries to pelvic and abdominal r...,13h15,Not specified Bull shark highly suspect,other,other


In [36]:
# Date: cast every entry to text, let clean.clean_date_prefix tidy each string,
# then parse the result. format='mixed' infers the format per element,
# dayfirst=True prefers day-first for ambiguous strings, and errors='coerce'
# turns anything unparseable into NaT instead of raising.
df_florida['date'] = pd.to_datetime(
    df_florida['date'].astype(str).apply(clean.clean_date_prefix),
    format='mixed',
    dayfirst=True,
    errors='coerce',
)
df_florida.head()

Unnamed: 0,date,type,location,activity,sex,age,injury,time,species,precise_location,county
0,2024-10-11,unprovoked,brevard county orlando,surfing,M,16.0,Bite to left arm,?,Bull shark 6ft,other,brevard county
1,2024-07-08,unprovoked,ponce de leon inlet volusia county,diving,M,14.0,Lower left leg injury,11hr15,4-5ft Blacktip shark,other,volusia county
2,2024-07-05,unprovoked,new smyrna beach,wading,M,26.0,Minor injury to left foot,16hr15,Not specified,new smyrna beach,volusia county
3,2024-07-04,unprovoked,new smyrna beach beach access 14 volusia county,swimming,M,26.0,Bite to left leg,1100hr,6ft shark,new smyrna beach,volusia county
4,2024-06-07,unprovoked,rosemary beach walton county,swimming,F,45.0,Significant injuries to pelvic and abdominal r...,13h15,Not specified Bull shark highly suspect,other,other


In [38]:
# Time: clean.clean_and_standardize_time receives the frame plus the name of
# the column to process and returns the frame used from here on; the preview
# shows coarse labels such as 'morning', 'afternoon' and 'unknown'.
# NOTE(review): in the saved outputs the labels look shifted by one row
# relative to the raw times printed by the previous cell (e.g. '16hr15' is
# shown as 'morning', '1100hr' as 'afternoon'). Verify index alignment inside
# the helper, or re-run to rule out stale output.
df_florida = df_florida.pipe(clean.clean_and_standardize_time, "time")
df_florida.head()

Unnamed: 0,date,type,location,activity,sex,age,injury,time,species,precise_location,county
0,2024-10-11,unprovoked,brevard county orlando,surfing,M,16.0,Bite to left arm,unknown,Bull shark 6ft,other,brevard county
1,2024-07-08,unprovoked,ponce de leon inlet volusia county,diving,M,14.0,Lower left leg injury,unknown,4-5ft Blacktip shark,other,volusia county
2,2024-07-05,unprovoked,new smyrna beach,wading,M,26.0,Minor injury to left foot,morning,Not specified,new smyrna beach,volusia county
3,2024-07-04,unprovoked,new smyrna beach beach access 14 volusia county,swimming,M,26.0,Bite to left leg,afternoon,6ft shark,new smyrna beach,volusia county
4,2024-06-07,unprovoked,rosemary beach walton county,swimming,F,45.0,Significant injuries to pelvic and abdominal r...,morning,Not specified Bull shark highly suspect,other,other


In [40]:
# Rename the species column to 'shark size'. Reassign instead of using
# inplace=True: inplace brings no performance benefit and silently mutates a
# frame that earlier cells already displayed.
df_florida = df_florida.rename(columns={'species': 'shark size'})

# Normalise the species text with clean.standardize_string, then map every
# entry onto a size class with clean.classify_size (applied value by value).
# NOTE(review): re-running this cell would feed already-classified labels back
# through both helpers; run it once per fresh kernel.
df_florida['shark size'] = (
    df_florida['shark size']
    .pipe(clean.standardize_string)
    .apply(clean.classify_size)
)
df_florida.head()

Unnamed: 0,date,type,location,activity,sex,age,injury,time,shark size,precise_location,county
0,2024-10-11,unprovoked,brevard county orlando,surfing,M,16.0,Bite to left arm,unknown,Medium Shark Species,other,brevard county
1,2024-07-08,unprovoked,ponce de leon inlet volusia county,diving,M,14.0,Lower left leg injury,unknown,Small Shark Species,other,volusia county
2,2024-07-05,unprovoked,new smyrna beach,wading,M,26.0,Minor injury to left foot,morning,Not Specified/Not Confirmed,new smyrna beach,volusia county
3,2024-07-04,unprovoked,new smyrna beach beach access 14 volusia county,swimming,M,26.0,Bite to left leg,afternoon,Medium Shark Species,new smyrna beach,volusia county
4,2024-06-07,unprovoked,rosemary beach walton county,swimming,F,45.0,Significant injuries to pelvic and abdominal r...,morning,Not Specified/Not Confirmed,other,other
