In [113]:
# Import libraries
import pandas as pd
import numpy as np

In [114]:
# Create a new DataFrame
data = pd.read_csv('dados/attacks.csv', encoding='latin-1', low_memory=False)
pd.set_option('display.max_columns', None)

### Getting information about the data

In [115]:
data.shape

(25723, 24)

In [116]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

### Cleaning collumns and lines with huge amount of NULL

In [117]:
# Verify values in collumns with huge amount of NULL
data['Unnamed: 22'].value_counts()
data['Unnamed: 22'].isnull().sum()
data['Unnamed: 23'].value_counts()
data['Unnamed: 23'].isnull().sum()
# Remove collumns without significant values
data = data.drop(['Unnamed: 22', 'Unnamed: 23'], axis = 1)

In [118]:
data

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,Age,Injury,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25718,,,,,,,,,,,,,,,,,,,,,,
25719,,,,,,,,,,,,,,,,,,,,,,
25720,,,,,,,,,,,,,,,,,,,,,,
25721,,,,,,,,,,,,,,,,,,,,,,


In [119]:
# Verify lines with huge amount of NULL
data['count_na'] = data.isna().sum(axis = 1)
high_na = data['count_na'] > 10
data.loc[high_na]
data['count_na'].describe()
# Remove lines filled with NULL
data = data.dropna(thresh = 10)

In [120]:
data.shape

(6302, 23)

### Changing collumns names

In [121]:
# Change columns name
import regex as re
pattern = r'[^a-zA-Z0-9()/]'
data.columns = [re.sub(pattern, '_', column.lower().strip()) for column in data.columns]
data.columns

Index(['case_number', 'date', 'year', 'type', 'country', 'area', 'location',
       'activity', 'name', 'sex', 'age', 'injury', 'fatal_(y/n)', 'time',
       'species', 'investigator_or_source', 'pdf', 'href_formula', 'href',
       'case_number_1', 'case_number_2', 'original_order', 'count_na'],
      dtype='object')

In [122]:
data.head(10)

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,count_na
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,1
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,1
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,2
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,2
5,2018.06.03.b,03-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,"Flat Rock, Ballina",Kite surfing,Chris,M,,"No injury, board bitten",N,,,"Daily Telegraph, 6/4/2018",2018.06.03.b-FlatRock.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.b,2018.06.03.b,6298.0,3
6,2018.06.03.a,03-Jun-2018,2018.0,Unprovoked,BRAZIL,Pernambuco,"Piedade Beach, Recife",Swimming,Jose Ernesto da Silva,M,18.0,FATAL,Y,Late afternoon,Tiger shark,"Diario de Pernambuco, 6/4/2018",2018.06.03.a-daSilva.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.03.a,2018.06.03.a,6297.0,0
7,2018.05.27,27-May-2018,2018.0,Unprovoked,USA,Florida,"Lighhouse Point Park, Ponce Inlet, Volusia County",Fishing,male,M,52.0,Minor injury to foot. PROVOKED INCIDENT,N,,"Lemon shark, 3'","K. McMurray, TrackingSharks.com",2018.05.27-Ponce.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.27,2018.05.27,6296.0,1
8,2018.05.26.b,26-May-2018,2018.0,Unprovoked,USA,Florida,"Cocoa Beach, Brevard County",Walking,Cody High,M,15.0,Lower left leg bitten,N,17h00,"Bull shark, 6'","K.McMurray, TrackingSharks.com",2018.05.26.b-High.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.b,2018.05.26.b,6295.0,0
9,2018.05.26.a,26-May-2018,2018.0,Unprovoked,USA,Florida,"Daytona Beach, Volusia County",Standing,male,M,12.0,Minor injury to foot,N,14h00,,"K. McMurray, Tracking Sharks.com",2018.05.26.a-DaytonaBeach.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.05.26.a,2018.05.26.a,6294.0,1


In [123]:
data.tail(20)

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,count_na
6282,ND.0020,1920 -1923,0.0,Unprovoked,AUSTRALIA,Queensland,Great Barrier Reef,,3 Japanese divers,M,,FATAL,Y,,,"V.M. Coppleson (1958), p.241",ND-0020-3JapaneseDivers.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0020,ND.0020,21.0,4
6283,ND.0019,Before 1921,0.0,Unprovoked,USA,Florida,"Gadsden Point, Tampa Bay",Fishing,James Kelley,M,,2-inch lacerations,N,,,"T. Helm, p.219",ND-0019-Kelley.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0019,ND.0019,20.0,3
6284,ND.0018,Before 1911,0.0,Unprovoked,VIETNAM,Ba Ria-Vung Tau Province,V?ng Tàu,Swimming around anchored ship,crewman,M,,Foot bitten,N,,,"Daily Kennebec Journal, 3/27/1911",ND-0018-Vietnam.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0018,ND.0018,19.0,3
6285,ND.0017,Before 1921,0.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Crew swimming alongside their anchored ship,male,M,,FATAL,Y,,,"Captain A. Anderson, Natal Mercury, 12/31/192...",ND-0017-alongside-ship.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0017,ND.0017,18.0,3
6286,ND.0016,Before 1921,0.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,4 men were bathing,male,M,,FATAL,Y,,,"Captain A. Anderson, Natal Mercury, 12/31/192...",ND-0016- Durban-PostOffice.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0016,ND.0016,17.0,3
6287,ND.0015,Before 1917,0.0,Unprovoked,FIJI,Moala Island,,Wreck of large double sailing canoe,20 Fijians,,,"FATAL, 18 people were killed by sharks, 2 sur...",Y,,,"Fijian Society papers presented April 17, 1918...",ND-0015-FijianCanoe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0015,ND.0015,16.0,5
6288,ND.0014,Before 17-Jul-1916,0.0,Unprovoked,USA,North Carolina,Somewhere between Hatteras and Beaufort,Swimming,"""youthful male""",M,,"""Lost leg""",N,,,"C. Creswell, GSAF; Wilmington Star, 7/17/1916",ND-0014-pre1916-NorthCarolina.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0014,ND.0014,15.0,3
6289,ND.0013,No date (3 days after preceding incident) & pr...,0.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Fishing,a native fisherman,M,,"FATAL, body not recovered but shark was caught...",Y,,,"Rural New Yorker, 7/19/1913",ND-0013-Durban-native-fisherman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0013,ND.0013,14.0,3
6290,ND.0012,Before 19-Jul-1913,0.0,Unprovoked,SOUTH AFRICA,KwaZulu-Natal,Durban,Wading,a young Scotsman,M,,"FATAL, leg stripped of flesh",Y,,,"Rural New Yorker, 7/19/1913",ND-0012-Durban-Scotsman.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0012,ND.0012,13.0,3
6291,ND.0011,Before 1911,0.0,Unprovoked,ASIA?,,,Swimming,Mr. Masury,M,,Foot severed,N,,,"Ref. J. T. Dubois in N.Y. Sun, 3/19/1911",ND-0011-Masury.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0011,ND.0011,12.0,5


In [124]:
data.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,count_na
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,1
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,1
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,2
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,2


## Question #1 - Are men most likely to be attacked than women?

In [125]:
data.head()

Unnamed: 0,case_number,date,year,type,country,area,location,activity,name,sex,age,injury,fatal_(y/n),time,species,investigator_or_source,pdf,href_formula,href,case_number_1,case_number_2,original_order,count_na
0,2018.06.25,25-Jun-2018,2018.0,Boating,USA,California,"Oceanside, San Diego County",Paddling,Julie Wolfe,F,57.0,"No injury to occupant, outrigger canoe and pad...",N,18h00,White shark,"R. Collier, GSAF",2018.06.25-Wolfe.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.25,2018.06.25,6303.0,0
1,2018.06.18,18-Jun-2018,2018.0,Unprovoked,USA,Georgia,"St. Simon Island, Glynn County",Standing,Adyson McNeely,F,11.0,Minor injury to left thigh,N,14h00 -15h00,,"K.McMurray, TrackingSharks.com",2018.06.18-McNeely.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.18,2018.06.18,6302.0,1
2,2018.06.09,09-Jun-2018,2018.0,Invalid,USA,Hawaii,"Habush, Oahu",Surfing,John Denges,M,48.0,Injury to left lower leg from surfboard skeg,N,07h45,,"K.McMurray, TrackingSharks.com",2018.06.09-Denges.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.09,2018.06.09,6301.0,1
3,2018.06.08,08-Jun-2018,2018.0,Unprovoked,AUSTRALIA,New South Wales,Arrawarra Headland,Surfing,male,M,,Minor injury to lower leg,N,,2 m shark,"B. Myatt, GSAF",2018.06.08-Arrawarra.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.08,2018.06.08,6300.0,2
4,2018.06.04,04-Jun-2018,2018.0,Provoked,MEXICO,Colima,La Ticla,Free diving,Gustavo Ramos,M,,Lacerations to leg & hand shark PROVOKED INCIDENT,N,,"Tiger shark, 3m",A .Kipper,2018.06.04-Ramos.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2018.06.04,2018.06.04,6299.0,2


In [126]:
data['sex'].value_counts()

M      5094
F       637
M         2
N         2
lli       1
.         1
Name: sex, dtype: int64

In [127]:
# Remove blank spaces from the collumn 'sex'
data['sex'] = data['sex'].str.strip()
#data['sex'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sex'] = data['sex'].str.strip()


In [128]:
data['sex'].value_counts()

M      5096
F       637
N         2
lli       1
.         1
Name: sex, dtype: int64

In [129]:
# Create new DataFrame man only
men_mask = data['sex'] == 'M'
data_men = data[men_mask]

# Verify the number of attacks per genre
data[data['sex']=='M'].shape[0]
data[data['sex']=='F'].shape[0]
print(data[data['sex']=='M'].shape[0] / (data[data['sex']=='F'].shape[0] + data[data['sex']=='M'].shape[0]))
data_men.shape
data.shape

0.8888888888888888


(6302, 23)

## Answer #1 - Men are the most common victim with 5096 out of 5733 cases (89%).

## Question #2 - Do the sharks preffer to atack younger people?

In [130]:
# Check the inputs in age collumn
data_men['age'].unique()
data_men['age'].isna().sum()

2127

### Need to clean this collumn

In [131]:
# Define a function to clean age collumns - transform each input in a list of numbers
# Transforming into a list is needed because there are attack in multiple people at once
# Later the list of numbers will be exploded so we can count the multiple attacks

def age_into_list(age):

    age = str(age).strip()
    import re
    
    if len(age) == 0:
        return np.nan

    elif age.isdigit():
        return [int(age)]

    elif re.search('teen[s]*|young|month[s]*', age, re.I):
        return [10]

    elif re.search('.*adult.*|.*elder.*|.*middle.*', age, re.I):
        return [35]
    
    elif re.search('.?or.?', age, re.I):
        aux = re.findall('\d+', age, re.I)
        return [aux[0]]
    
    elif re.search('.?&.?', age, re.I):
        aux = re.findall('\d+', age, re.I)
        return aux

    elif re.search('.\d{2}[\'s]\?', age, re.I):
        aux = re.findall('\d{2}', age, re.I)
        return aux
   
    else:
        return np.nan

In [132]:
# Execute some tests at the new collumns
data_men['age'].apply(age_into_list).isnull().sum()
data_men['age'].apply(age_into_list).notna().sum()

2941

In [133]:
# Create a new collumn wiht the age clean
data_men['list_age'] = data_men['age'].apply(age_into_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_men['list_age'] = data_men['age'].apply(age_into_list)


In [134]:
# Check if we lost any data
new_total_entries = data_men['list_age'].notna().sum()
data_loss = data_men['age'].notna().sum() - new_total_entries
print(f'Total entries with cleaned age: {new_total_entries}\nEntries lost: {data_loss}\n')
# It's an acceptable number

Total entries with cleaned age: 2941
Entries lost: 28



In [135]:
# Explode lists of ages
data_men = data_men.explode('list_age', ignore_index=False)
data_men.shape
# Create new collumn age as int
data_men.loc[data_men['list_age'].notna(), 'age_int'] = data_men[data_men['list_age'].notna()]['list_age'].astype('int16')
data_men.dtypes

case_number                object
date                       object
year                      float64
type                       object
country                    object
area                       object
location                   object
activity                   object
name                       object
sex                        object
age                        object
injury                     object
fatal_(y/n)                object
time                       object
species                    object
investigator_or_source     object
pdf                        object
href_formula               object
href                       object
case_number_1              object
case_number_2              object
original_order            float64
count_na                    int64
list_age                   object
age_int                   float64
dtype: object

In [136]:
# Check if the young are attacked more often as the elder
age_notna_mask = data_men['age_int'].notna()
under_30_mask = (data_men['age_int'] <= 30)
count_man_under30 = data_men.loc[under_30_mask & age_notna_mask, 'age_int'].count()
count_man_over30 = data_men.loc[~under_30_mask & age_notna_mask, 'age_int'].count()
print(f'Number of attacs in men under 30: {count_man_under30}\nNumber o attacks in men over 30: {count_man_over30}')

Number of attacs in men under 30: 1995
Number o attacks in men over 30: 965


In [137]:
# Create new DataFrame with men under 30
data_men_30 = data_men.loc[under_30_mask & age_notna_mask].reset_index(drop=True)

## Answer #2 - Youger men are attacked twice as often as elder ones (48%).
### Under 30: 1995
### Over 30: 965

## Question #3 - Do the sharks focus on hutting american?

In [138]:
# Check if there are many NULLs in country collumn
data_men_30['country'].isnull().sum()

6

In [139]:
# Check number of attacks in men under 30 for each country
data_men_30['country'].value_counts()

#Create mask to filter attacks in USA
us_mask = data_men_30['country'] == 'USA'

# % of attakcs on americans
us_percentage = (data_men_30.loc[us_mask, 'country'].count() / data_men_30.shape[0])
print(data_men_30.loc[us_mask, 'country'].count())
print(data_men_30.shape[0])
print(f'Us attack percentagem: {us_percentage}')

830
1995
Us attack percentagem: 0.41604010025062654


In [140]:
# Create DataFrame with attacks at USA
us_mask = data_men_30['country'] == 'USA'
data_men_30_us = data_men_30[us_mask].reset_index(drop=True)

## Answer #3 - 830 of 1995 attacks happened in US. This number represents 41% of total attacks.

## Question #4 - Do the majority number os attacks occur in summer?

In [141]:
# Get the month number
data_men_30_us['month_number'] = data_men_30_us['case_number'].str.findall(r'\d{4}\.(\d{2})\.').explode()

In [142]:
# Get rid of invalid values
data_men_30_us['month_number'].unique()
data_men_30_us['month_number'] = data_men_30_us['month_number'].fillna('00')
season_mask = data_men_30_us['month_number'] == '00'
data_men_30_us_season = data_men_30_us[~season_mask]

In [143]:
mask_summer = data_men_30_us_season['month_number'].str.contains('06|07|08')
mask_winter = data_men_30_us_season['month_number'].str.contains('12|01|02')
mask_spring = data_men_30_us_season['month_number'].str.contains('03|04|05')
mask_autumn = data_men_30_us_season['month_number'].str.contains('09|10|11')
data_men_30_us_season.loc[mask_summer, 'season'] = 'summer'
data_men_30_us_season.loc[mask_winter, 'season'] = 'winter'
data_men_30_us_season.loc[mask_spring, 'season'] = 'spring'
data_men_30_us_season.loc[mask_autumn, 'season'] = 'autumn'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_men_30_us_season.loc[mask_summer, 'season'] = 'summer'


In [144]:
data_men_30_us_season.groupby(by='season')['case_number'].count()

season
autumn    251
spring    160
summer    353
winter     58
Name: case_number, dtype: int64

In [145]:
data_men_30_us_season.shape

(822, 27)

## Answer 4 - 353 out of 822 attacks occurred on summer (43%)
### Winter: 58
### Spring: 160
### Summer: 353
### Autumn: 251