In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Create a new DataFrame
data = pd.read_csv('dados/attacks.csv', encoding='latin-1', low_memory=False)
pd.set_option('display.max_columns', None)

### Getting information about the data

In [None]:
data.shape

In [None]:
data.info()

### Cleaning collumns and lines with huge amount of NULL

In [None]:
# Verify values in collumns with huge amount of NULL
data['Unnamed: 22'].value_counts()
data['Unnamed: 22'].isnull().sum()
data['Unnamed: 23'].value_counts()
data['Unnamed: 23'].isnull().sum()
# Remove collumns without significant values
data = data.drop(['Unnamed: 22', 'Unnamed: 23'], axis = 1)

In [None]:
data

In [None]:
# Verify lines with huge amount of NULL
data['count_na'] = data.isna().sum(axis = 1)
high_na = data['count_na'] > 10
data.loc[high_na]
data['count_na'].describe()
# Remove lines filled with NULL
data = data.dropna(thresh = 10)

In [None]:
data

### Changing collumns names

In [None]:
# Change columns name
import regex as re
pattern = r'[^a-zA-Z0-9()/]'
data.columns = [re.sub(pattern, '_', column.lower().strip()) for column in data.columns]
data.columns

In [None]:
data.head(10)

In [None]:
data.tail(20)

In [None]:
data.head()

## Question #1 - Are men the most common victim of shark attacks?

In [175]:
data.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,count_na,list_age
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,0.0,[57]
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,1.0,[11]
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,1.0,[48]
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,2.0,
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,2.0,


In [176]:
data['sex'].value_counts()

M      5096
F       637
N         2
lli       1
.         1
Name: sex, dtype: int64

In [177]:
# Remove blank spaces from the collumn 'sex'
data['sex'] = data['sex'].str.strip()
#data['sex'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sex'] = data['sex'].str.strip()


In [178]:
data['sex'].value_counts()

M      5096
F       637
N         2
lli       1
.         1
Name: sex, dtype: int64

In [181]:
# Create new DataFrame man only
men_mask = data['sex'] == 'M'
data_men = data[men_mask]

## Answer 1 - Men are the most common victim with 5096 cases

## Question #2 - Are men under 30 years the most common victim?

In [182]:
# Check the inputs in age collumn
data_men['age'].unique()
data_men['age'].isna().sum()

2127

### Need to clean this collumn

In [None]:
# Define a function to clean age collumns - transform each input in a list of numbers
# Transforming into a list is needed because there are attack in multiple people at once
# Later the list of numbers will be exploded so we can count the multiple attacks

def age_into_list(age):

    age = str(age).strip()
    import re
    
    if len(age) == 0:
        return np.nan

    elif age.isdigit():
        return [int(age)]

    elif re.search('teen[s]*|young|month[s]*', age, re.I):
        return [10]

    elif re.search('.*adult.*|.*elder.*|.*middle.*', age, re.I):
        return [35]
    
    elif re.search('.?or.?', age, re.I):
        aux = re.findall('\d+', age, re.I)
        return [aux[0]]
    
    elif re.search('.?&.?', age, re.I):
        aux = re.findall('\d+', age, re.I)
        return aux

    elif re.search('.\d{2}[\'s]\?', age, re.I):
        aux = re.findall('\d{2}', age, re.I)
        return aux

    else:
        return np.nan

In [183]:
# Execute some tests at the new collumns
data_men['age'].apply(age_into_list).isnull().sum()
data_men['age'].apply(age_into_list).notna().sum()

2941

In [184]:
# Create a new collumn wiht the age clean
data_men['list_age'] = data_men['age'].apply(age_into_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_men['list_age'] = data_men['age'].apply(age_into_list)


In [185]:
# Check if we lost any data
new_total_entries = data_men['list_age'].notna().sum()
data_loss = data_men['age'].notna().sum() - new_total_entries
print(f'Total entries with cleaned age: {new_total_entries}\nEntries lost: {data_loss}\n')
# It's an acceptable number

Total entries with cleaned age: 2941
Entries lost: 28



In [203]:
# Explode lists of ages
data_men = data_men.explode('list_age', ignore_index=False)
data_men.shape
data_men['list_age'].unique()

array([48, nan, 18, 52, 15, 12, 32, 10, 21, 30, 60, 33, 29, 54, 34, 41,
       37, 56, 19, 25, 69, 38, 35, 45, 14, 28, 20, 24, 26, 22, 7, 31, 17,
       13, 42, 40, 50, 46, 82, 73, 68, 16, 39, 58, 55, 57, 47, 61, 65, 36,
       43, 9, 49, 72, 59, 11, 27, 64, 23, 71, 44, 62, 6, 63, 70, 53, 8,
       51, 66, 77, 74, 3, '28', '26', 5, 86, '12', '46', '34', 84, '30',
       '21', 75, '33', '23', '20', '7', '31', '32', 87, 67, '19', 1, '37',
       '25', '67', '35', '27', '24', '17', '13', 81, 78, '9', '36', '14',
       '10'], dtype=object)

In [198]:
# Check 
under_30_mask = data_men['list_age'] <= 30


TypeError: '<=' not supported between instances of 'str' and 'int'

## Question - Do the majority number os attacks occur in summer?

In [None]:
# Check if the collumn that contaits the date of the attack are in date format
data['date'].info
list(data['date'].unique())
#list(data['case_number'].unique())
#pd.to_datetime(data['date'], format ='%d-%b-%Y')


In [None]:
mask_contains_date = data['date'].str.contains('\d{2}-\w{3}-\d{4}', regex=True)
data.loc[mask_contains_date, 'date'].tail(30)
#data.loc[mask_contains_date, 'new_date'] = data['date'].str.replace('.*\d{2}-\w{3}-\d{4}.*','\d{2}-\w{3}-\d{4}', regex=True)



In [None]:
def replace_date(line):
    import re
    look_for = '.*\d{2}-\w{3}-\d{4}.*'
    replace = '\d{2}-\w{3}-\d{4}'
    if re.search(look_for, line):
        return re.sub(look_for, replace, line.lower())

In [None]:
replace_date('reported 10-Jan-1995')