In [None]:
# Import libraries
import re

import numpy as np
import pandas as pd

In [None]:
# Show every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Load the attacks dataset into a DataFrame
data = pd.read_csv('dados/attacks.csv', low_memory=False, encoding='latin-1')

### Getting information about the data

In [None]:
data.shape

In [None]:
data.info()

### Cleaning columns and rows with a large number of NULL values

In [None]:
# Inspect the two 'Unnamed' columns, which hold a huge amount of NULL values.
# Only the last expression of a cell is rendered, so each check is printed explicitly.
unnamed_columns = ['Unnamed: 22', 'Unnamed: 23']
for column in unnamed_columns:
    if column in data.columns:
        print(f'{column}: {data[column].isnull().sum()} NULL values')
        print(data[column].value_counts())

# Remove the columns without significant values.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
data = data.drop(columns=unnamed_columns, errors='ignore')

In [None]:
data

In [None]:
# Count the missing values of every row to find the rows that are mostly empty
data['count_na'] = data.isna().sum(axis = 1)
high_na = data['count_na'] > 10
# Only the last expression of a cell is rendered, so the checks are printed explicitly
print(f'Rows with more than 10 NULL values: {high_na.sum()}')
print(data.loc[high_na].head())
print(data['count_na'].describe())
# Remove rows filled with NULL: keep only rows with at least 10 non-NULL values.
# NOTE(review): the helper column 'count_na' is never NULL, so it counts towards
# this threshold and stays in the DataFrame - confirm this is intended.
data = data.dropna(thresh = 10)

In [None]:
data

### Changing column names

In [None]:
# Normalise the column names: lower-case, trim surrounding whitespace and replace
# every character that is not a letter, digit, parenthesis or slash with '_'.
# The standard-library `re` module (imported at the top of the notebook) is enough
# here, so the third-party `regex` package is no longer required.
pattern = r'[^a-zA-Z0-9()/]'
data.columns = [re.sub(pattern, '_', column.lower().strip()) for column in data.columns]
data.columns

In [None]:
data.head(10)

In [None]:
data.tail(20)

In [None]:
data.head()

## Question #1 - Are men the most common victim of shark attacks?

In [None]:
data.head()

In [None]:
data['sex'].value_counts()

In [None]:
# Trim leading/trailing whitespace from the values of the 'sex' column
sex_stripped = data['sex'].str.strip()
data = data.assign(sex=sex_stripped)

In [None]:
data['sex'].value_counts()

In [None]:
# Create a new DataFrame with the male victims only.
# .copy() makes it an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
men_mask = data['sex'] == 'M'
data_men = data[men_mask].copy()

## Answer #1 - Men are the most common victim with 5096 cases

## Question #2 - Are men under 30 years the most common victim?

In [None]:
# Check the inputs in age collumn
data_men['age'].unique()
data_men['age'].isna().sum()

### This column needs to be cleaned

In [None]:
# Clean the age column: turn each raw entry into a list of integer ages.
# A list is needed because a single record can describe an attack on several people;
# the lists are exploded later so that every victim is counted.

def age_into_list(age):
    """Parse a raw age entry into a list of integer ages.

    Parameters
    ----------
    age : object
        Raw value from the age column (text, number or missing value).

    Returns
    -------
    list of int or float
        The ages derived from the entry, or ``np.nan`` when no age can be derived.
    """
    # Handle missing values explicitly instead of parsing their text form ('nan').
    if age is None or (isinstance(age, float) and np.isnan(age)):
        return np.nan

    age = str(age).strip()

    if len(age) == 0:
        return np.nan

    # isdecimal() (unlike isdigit()) rejects characters such as superscript
    # digits, which int() cannot convert.
    if age.isdecimal():
        return [int(age)]

    # Vague descriptions are mapped to one representative age.
    if re.search(r'teen[s]*|young|month[s]*', age, re.I):
        return [10]

    if re.search(r'.*adult.*|.*elder.*|.*middle.*', age, re.I):
        return [35]

    # Every number in the text, as int so that all branches return the same type.
    numbers = [int(number) for number in re.findall(r'\d+', age)]

    # Alternatives such as '18 or 20': keep the first number only.
    # Text that merely contains 'or' without any number (e.g. 'a minor') gives NaN
    # instead of raising an IndexError.
    if re.search(r'.?or.?', age, re.I):
        return numbers[:1] if numbers else np.nan

    # Several victims such as '23 & 20': keep every number.
    if re.search(r'.?&.?', age, re.I):
        return numbers if numbers else np.nan

    # NOTE(review): this pattern requires a literal '?' after the quote/'s', so
    # entries like '20s' or "60's" presumably never reach this branch - confirm
    # the intended pattern before changing it, as that would shift the counts.
    if re.search(r".\d{2}['s]\?", age, re.I):
        return [int(number) for number in re.findall(r'\d{2}', age)]

    return np.nan

In [None]:
# Execute some tests at the new collumns
data_men['age'].apply(age_into_list).isnull().sum()
data_men['age'].apply(age_into_list).notna().sum()

In [None]:
# Create a new column with the cleaned age (a list of ages per record).
# assign() returns a new DataFrame, so nothing is written into a possible slice
# of `data` and pandas raises no SettingWithCopyWarning.
data_men = data_men.assign(list_age=data_men['age'].apply(age_into_list))

In [None]:
# Compare how many entries carry an age before and after the cleaning
new_total_entries = data_men['list_age'].count()
original_total_entries = data_men['age'].count()
data_loss = original_total_entries - new_total_entries
print(f'Total entries with cleaned age: {new_total_entries}\nEntries lost: {data_loss}\n')
# The number of lost entries is acceptable

In [None]:
# Explode the lists of ages so that every victim gets a row of their own
data_men = data_men.explode('list_age', ignore_index=False)
# Only the last expression of a cell is rendered, so the shape is printed explicitly
print(data_men.shape)
# Create a new column with the age as a number.
# to_numeric converts the whole column at once and leaves missing ages as NaN,
# which avoids chained indexing and a .loc assignment that depends on the index
# labels duplicated by explode().
data_men['age_int'] = pd.to_numeric(data_men['list_age'])
data_men.dtypes

In [None]:
# Compare how often younger and older men are attacked (rows without a known age are ignored)
age_notna_mask = data_men['age_int'].notna()
under_30_mask = data_men['age_int'].le(30)
count_man_under30 = (under_30_mask & age_notna_mask).sum()
count_man_over30 = (~under_30_mask & age_notna_mask).sum()
print(f'Number of attacs in men under 30: {count_man_under30}\nNumber o attacks in men over 30: {count_man_over30}')

In [None]:
# New DataFrame holding only the men with a known age of 30 or less, re-indexed from 0
young_men_mask = under_30_mask & age_notna_mask
data_men_30 = data_men[young_men_mask].reset_index(drop=True)

## Answer #2 - Men under 30 are attacked about twice as often as older men
### Under 30: 1995
### Over 30: 965

## Question #3 - Do sharks focus on hunting Americans?

6

In [None]:
# Check if there are many NULLs in country collumn
data_men_30['country'].isnull().sum()

In [310]:
# Number of attacks on men under 30 in each country
data_men_30['country'].value_counts()

# Boolean mask selecting the attacks that happened in the USA
us_mask = data_men_30['country'].eq('USA')

# Share of the attacks that happened in the USA (a fraction between 0 and 1)
us_attacks = us_mask.sum()
total_attacks = len(data_men_30)
us_percentage = us_attacks / total_attacks
print(f'Us attack percentagem: {us_percentage}')

Us attack percentagem: 0.41604010025062654


In [305]:
# New DataFrame with the attacks that happened in the USA, re-indexed from 0
us_mask = data_men_30['country'].eq('USA')
data_men_30_us = data_men_30.loc[us_mask].reset_index(drop=True)

## Question - Do the majority of attacks occur in summer?

In [None]:
# Check whether the column that contains the date of the attack is in a date format.
# info must be called: the bare attribute `data['date'].info` does nothing.
data['date'].info()
list(data['date'].unique())


In [None]:
# Rows whose date text contains a 'dd-Mon-yyyy' pattern.
# The raw string avoids the invalid-escape warning for '\d' and '\w'; na=False
# treats missing dates as non-matching, so the mask stays purely boolean and can
# be used with .loc.
mask_contains_date = data['date'].str.contains(r'\d{2}-\w{3}-\d{4}', regex=True, na=False)
data.loc[mask_contains_date, 'date'].tail(30)



In [None]:
def replace_date(line):
    """Extract the first 'dd-Mon-yyyy' date found in a text, in lower case.

    Parameters
    ----------
    line : object
        Raw value of the date column; anything that is not a string is treated
        as containing no date.

    Returns
    -------
    str or None
        The lower-cased date (e.g. '10-jan-1995'), or None when no date is found.
    """
    if not isinstance(line, str):
        return None
    # The match itself is returned: a regex pattern cannot be used as the
    # replacement text of re.sub ('\d' raises "bad escape" there).
    found = re.search(r'\d{2}-\w{3}-\d{4}', line)
    return found.group(0).lower() if found else None

In [None]:
replace_date('reported 10-Jan-1995')

In [None]:
data['case_number'].tail(100)