# SHARK ATTACKS 🦈

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib as plt
from IPython import display
import matplotlib.pyplot as plt
import squarify
from wordcloud import WordCloud, STOPWORDS
sns.set_theme(style="white", palette="Spectral")

## 1. Data Cleaning

First I downloaded the dataframe and cleaned it. The data had more than 25 thousand rows and 24 columns, but around 79% of the data was missing. So I got rid of more than 20 thousand rows and more than 10 columns which didn't add anything to my analysis. I also extracted the month from the Date column, and got rid of the day, and changed the name of some columns, to make data manipulation easier later on.

In [None]:
# Load the raw shark-attack CSV into `df`, decoding it with the 'unicode_escape' codec.
# NOTE(review): the path is absolute and machine-specific, while the plots below are saved to the
# relative './images/' folder — confirm where the data file is meant to live before sharing.
df = pd.read_csv("/Users/narea/Desktop/ironhack/Project-1/data/attacks.csv", encoding = 'unicode_escape')

In [None]:
# (rows, columns) of the raw dataframe
df.shape

In [None]:
# column labels of the raw dataframe
df.columns

In [None]:
# inspecting a single record: the row at position 100
df.iloc[100]

As we can see in the following table, most of the columns are missing at least 75% of the data. That's why I got rid of missing values to get a smaller but more valuable dataframe. 

In [None]:
# Summary table: per column, how many cells are missing and what share of the rows that is.
df_nan = df.isnull().sum()
percent = (df_nan * 100) / df.shape[0]
missing_values = pd.concat(
    [df_nan, percent],
    axis=1,
    keys=['Missing values', 'Missing %'],
)
missing_values

In [None]:
# mean of each column of the summary table (average missing count and average missing %)
missing_values.mean()

In [None]:
# Keep only the rows that have a value in the Age column.
df = df.dropna(subset=['Age'])

In [None]:
# Same missing-value summary as before, recomputed on the current dataframe.
df_nan = df.isnull().sum()
percent = (df_nan * 100) / df.shape[0]
missing_values = df_nan.to_frame('Missing values').assign(**{'Missing %': percent})
missing_values

In [None]:
# Removing, in place, the columns that are not used in the rest of the analysis.
df.drop(
    columns=[
        "Unnamed: 22",
        "Unnamed: 23",
        "href",
        "Case Number.1",
        "Case Number.2",
        "pdf",
        "Case Number",
        "href formula",
        "Investigator or Source",
        "original order",
        "Type",
        "Injury",
        "Location",
        "Area",
        "Name",
    ],
    inplace=True,
)

In [None]:
# extracting the month from the date column with regex, and deleting the date column
# The pattern looks for three letters sitting between two dashes (a value such as
# "25-Jun-2018" yields "Jun"); dates without that shape give NaN.
# The capture group excludes the dashes, so no second clean-up pass is needed, and the raw
# string avoids Python's invalid-escape warning for backslashes inside a regex.
df['Month'] = df['Date'].str.extract(r'-([A-Za-z]{3})-', expand=False)
df.drop(["Date"], axis=1, inplace=True)

In [None]:
# Normalising the fatality and sex flags (trimmed, upper case), keeping only the rows with a
# clear Y/N outcome and M/F sex, and renaming 'Sex ' / 'Species ' to drop their trailing space.
df['Fatal (Y/N)'] = df['Fatal (Y/N)'].str.strip().str.upper()
df['Sex '] = df['Sex '].str.strip().str.upper()
valid_rows = df['Fatal (Y/N)'].isin(['Y', 'N']) & df['Sex '].isin(['M', 'F'])
# .copy() makes the filtered frame independent of the original, so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[valid_rows].copy()
# 'Sex ' was already stripped above, so it can be copied over as is.
df['Sex'] = df['Sex ']
df = df.drop("Sex ", axis=1)
df['Species'] = df['Species '].str.rstrip()
df = df.drop("Species ", axis=1)

In [None]:
# using regex to keep just the numeric part of the age column
# The first run of digits is taken, so single-digit ages such as "8" are kept (a two-digit-only
# pattern would turn them into NaN); values without any digit become NaN.
df['Age'] = df['Age'].astype(str).str.extract(r"(\d+)", expand=False).astype(float)

# Exploratory Data Analysis

## 1.1. Year

To explore the attacks per year, I focused on the data from 1800 to 2018 and got rid of some outliers in the dataframe. As we can see in the following graph, attacks have been increasing over the last 60 years; this may be due to the increase in ocean-related activities in recent decades. Still, even though there have been more attacks, the probability of dying from one of them has been getting lower each year: even if there are more attacks nowadays, the number of fatal victims has stayed the same.

In [None]:
# Keep only the attacks recorded from the year 1800 onwards.
df = df.loc[df['Year'] >= 1800]

In [None]:
# Store the year as an integer.
df['Year'] = df['Year'].astype(int)

In [None]:
years = df['Year'].value_counts().sort_index()
%matplotlib inline
years.plot(title="Shark Attacks per year")
plt.savefig('./images/year.png')

In [None]:
# Stacked histogram of attacks per year, split by fatality, with a density curve on top.
sns.histplot(
    data=df,
    x='Year',
    hue='Fatal (Y/N)',
    kde=True,
    multiple='stack',
)
plt.gcf().savefig('./images/year_fatal.png')

## 1.2. Age

By exploring the age of the victims, it's easy to see that young people are more affected by shark attacks, probably because people in this age range are more often in the ocean. The people most affected are those aged 18-20.

In [None]:
# Keep only the rows whose age is at most 90.
df = df.loc[df['Age'] <= 90]

In [None]:
# Count of victims per age (ordered by age), then a stacked histogram split by fatality.
age = df['Age'].value_counts().sort_index()
sns.histplot(
    data=df,
    x='Age',
    multiple='stack',
    hue='Fatal (Y/N)',
)
plt.gcf().savefig('./images/age.png')

## 1.3. Sex

Men are more likely to be attacked by a shark.

In [None]:
# Number of attacks per sex, with one bar per fatality outcome.
sns.countplot(
    data=df,
    hue="Fatal (Y/N)",
    x="Sex",
)
plt.gcf().savefig('./images/sex.png')

## 1.4. Species

And the Oscar for the shark with the most attacks goes to the one and only... white shark. Steven Spielberg knew who to cast for his Oscar-winning movie.

In [None]:
# the ten most frequent values of the Species column
df['Species'].value_counts().head(10)

In [None]:
# Horizontal bars for every species label that appears more than 36 times.
sharks = df['Species'].value_counts()
sharks.loc[sharks > 36].plot(kind='barh', color='orange')
plt.gcf().savefig('./images/species.png')

## 1.5. Countries

If you don't like thrilling, pulse-raising or breath-taking adventures, these are the places to avoid. If you are the opposite, these are the places to go. The 10 countries with the most shark attacks are shown in the following table.

In [None]:
# the ten most frequent values of the Country column
df['Country'].value_counts().head(10)

In [None]:
# Horizontal bars for every country that appears more than 26 times.
countries = df['Country'].value_counts()
countries.loc[countries > 26].plot(kind='barh', color='orange')
plt.gcf().savefig('./images/countries.png')

## 1.6. Activity

In [None]:
# the ten most frequent values of the Activity column
df['Activity'].value_counts().head(10)

In [None]:
# Horizontal bars for the ten most frequent activities.
activities = df['Activity'].value_counts()
activities.head(10).plot(kind='barh', color='orange')
plt.gcf().savefig('./images/act.png')

## H1: shark attacks are more likely to occur in the afternoon

My first hypothesis was that shark attacks are more likely to occur in the afternoon. 

In [None]:
# Pulling the first run of digits out of the time column with regex and storing it as a number.
# Entries without any digit stay missing; the out-of-range values are filtered in the next cell.
df['Time'] = (
    df['Time']
    .str.extract("([0-9]+)", expand=False)
    .dropna()
    .astype(int)
)

In [None]:
# Keep only the rows whose extracted time value is at most 24 (rows without a time drop out too).
df = df.loc[df['Time'] <= 24]

In [None]:
# Histogram of the extracted time values.
sns.histplot(x="Time", data=df)
plt.gcf().savefig('./images/time.png')

## H2: summer is the season with most shark attacks. 

In [None]:
# Listing every distinct country label, to build the hemisphere lists from them.
country = pd.unique(df['Country'])
print(country)

In [None]:
# making a list with all the countries from the northern hemisphere and the southern hemisphere
# and making a dictionary to know which season corresponds to each month in both hemispheres
# Labels are spelled exactly as they appear in the Country column (hence ' TONGA' with a leading
# space, 'Fiji' in mixed case, etc.). Each label belongs to exactly one list.
# NOTE(review): places that straddle the equator (e.g. MALDIVES, KIRIBATI, ECUADOR, KENYA,
# INDONESIA) are left where they were — confirm which side suits the analysis.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
season_north = {'Jan': 'winter', 'Feb': 'winter', 'Mar': 'spring', 'Apr': 'spring', 'May': 'spring', 'Jun': 'summer',
                'Jul': 'summer', 'Aug': 'summer', 'Sep': 'autumn', 'Oct': 'autumn', 'Nov': 'autumn', 'Dec': 'winter'}
season_south = {'Jan': 'summer', 'Feb': 'summer', 'Mar': 'autumn', 'Apr': 'autumn', 'May': 'autumn', 'Jun': 'winter',
                'Jul': 'winter', 'Aug': 'winter', 'Sep': 'spring', 'Oct': 'spring', 'Nov': 'spring', 'Dec': 'summer'}
north = [
    'USA', 'ENGLAND', 'COSTA RICA', 'BAHAMAS', 'ARUBA', 'AZORES', 'BARBADOS', 'BERMUDA', 'BELIZE', 'BRITISH ISLES',
    'DOMINICAN REPUBLIC', 'CROATIA', 'CUBA', 'CHINA', 'IRAQ', 'SCOTLAND', 'RUSSIA', 'PORTUGAL', 'PALA', 'IRAN',
    'ISRAEL', 'ITALY', 'JAPAN', 'COLUMBIA', 'CANADA', 'CENTRAL PACIFIC',
    'CARIBBEAN SEA', 'ST. MAARTIN', 'ST. MARTIN', 'TRINIDAD & TOBAGO', 'TURKS & CAICOS', 'TAIWAN', 'VIETNAM',
    'THAILAND', 'SOUTH CHINA SEA', 'SOUTH KOREA', 'UNITED KINGDOM',
    'UNITED ARAB EMIRATES (UAE)', 'UNITED ARAB EMIRATES', 'SRI LANKA', 'PUERTO RICO', 'PHILIPPINES', 'TURKEY',
    'SPAIN', 'SINGAPORE', 'PALESTINIAN TERRITORIES', 'SOMALIA', 'SIERRA LEONE',
    'PANAMA', 'SENEGAL', 'SAUDI ARABIA', 'OKINAWA', 'NICARAGUA', 'NIGERIA', 'MICRONESIA', 'MID ATLANTIC OCEAN',
    'MARSHALL ISLANDS',
    'MALAYSIA', 'JAMAICA', 'INDIA', 'HONG KONG', 'HONDURAS', 'GUINEA', 'GUAM', 'GRENADA', 'EL SALVADOR',
    'CAYMAN ISLANDS', 'GRAND CAYMAN', 'FEDERATED STATES OF MICRONESIA', 'MALTA',
    'GREECE', 'FRANCE', 'MEXICO', 'NORWAY', 'BRITISH VIRGIN ISLANDS', 'BRITISH WEST INDIES',
    # these lie north of the equator, so they belong in this list
    'ANDAMAN / NICOBAR ISLANDAS', 'CAPE VERDE', 'VENEZUELA', 'EGYPT',
]
south = [
    'AUSTRALIA', 'MALDIVES', 'SOUTH AFRICA', 'ARGENTINA', 'PAPUA NEW GUINEA', 'WESTERN SAMOA', 'URUGUAY',
    'TANZANIA', 'SOLOMON ISLANDS', 'SAMOA', 'SEYCHELLES',
    'BRAZIL', 'BRITISH NEW GUINEA', 'CHILE', 'ECUADOR', 'FIJI', 'Fiji',
    'DIEGO GARCIA', 'NEW GUINEA', 'VANUATU', 'NEW CALEDONIA', 'MOZAMBIQUE', 'NEW ZEALAND', 'KENYA', 'KIRIBATI',
    'MADAGASCAR', 'MAURITIUS', 'FRENCH POLYNESIA', 'INDONESIA',
    # these lie south of the equator, so they belong in this list
    ' TONGA', 'TONGA', 'AMERICAN SAMOA', 'ADMIRALTY ISLANDS', 'NEW BRITAIN',
]

In [None]:
# creating a function that returns either northern or southern hemisphere depending on which list the country is located
def hemisphere(country):
    """Return the hemisphere label for a country name.

    The name is looked up in the module-level ``north`` and ``south`` lists,
    first exactly as given and then trimmed and upper-cased, so labels that
    differ only by surrounding whitespace or letter case are still classified.

    Args:
        country: Country label; non-string values (e.g. NaN) are accepted.

    Returns:
        'Northern Hemisphere', 'Southern Hemisphere', or None when the value
        is not a string or is in neither list.
    """
    if not isinstance(country, str):
        return None
    # The raw value goes first so that every label already in the lists resolves as before.
    for candidate in (country, country.strip().upper()):
        if candidate in north:
            return 'Northern Hemisphere'
        if candidate in south:
            return 'Southern Hemisphere'
    return None

In [None]:
# adding a Hemisphere column by classifying each row's country
df['Hemisphere'] = df['Country'].map(hemisphere)

In [None]:
# Number of attacks per hemisphere.
sns.countplot(data=df, x="Hemisphere")
plt.gcf().savefig('./images/hemisphere.png')

In [None]:
def season(hemisphere,value):
    """Return the season of a month in the given hemisphere.

    Args:
        hemisphere: 'Southern Hemisphere' or 'Northern Hemisphere'; anything
            else (including None/NaN) yields None.
        value: Month key of ``season_south`` / ``season_north`` (e.g. 'Jun').

    Returns:
        The season name from the matching lookup table, or None when the
        hemisphere is not recognised or the month is not a key of the table
        (e.g. a missing month).
    """
    # .get() keeps a missing or unexpected month from raising KeyError part-way through a
    # column-wide computation; it simply produces None, like an unknown hemisphere does.
    if hemisphere == 'Southern Hemisphere':
        return season_south.get(value)
    if hemisphere == 'Northern Hemisphere':
        return season_north.get(value)
    return None

In [None]:
# quick check of the lookup: 'Jun' maps to 'winter' in season_south
season('Southern Hemisphere','Jun')

In [None]:
# season() handles one (hemisphere, month) pair at a time: passing the whole columns turns
# `hemisphere == '...'` into a Series, and using that in an `if` raises
# "The truth value of a Series is ambiguous".
# Walking the two columns in step calls it once per row instead; months that are not in the
# lookup tables (e.g. missing ones) give None.
pd.Series(
    [
        season(hemi, month) if month in season_north else None
        for hemi, month in zip(df['Hemisphere'], df['Month'])
    ],
    index=df.index,
)

In [None]:
# Season of each attack, from its hemisphere and month. The two columns are walked row by row:
# df.apply without axis=1 hands whole columns to the lambda, so x['Hemisphere'] fails.
# Months that are not in the lookup tables (e.g. missing ones) give None.
df['Season'] = [
    season(hemi, month) if month in season_north else None
    for hemi, month in zip(df['Hemisphere'], df['Month'])
]

In [None]:
## sns.countplot(x=df["Season"], hue= df['Hemisphere'])
## plt.savefig('./images/season.png')