In [4]:
import pandas as pd

# Columns holding labels or dates rather than quantities. They are cast to
# pandas' 'string' dtype so that the cells below can compare them against
# string literals such as 'M' or '0'.
text_columns = [
    'gender', 'citizenship', 'event_location_region', 'date_of_event',
    'date_of_death', 'type_of_injury', 'killed_by', 'year_of_death',
]

df = pd.read_csv('./data/clean_dataset.csv')
df[text_columns] = df[text_columns].astype('string')

In [10]:
# Spot check: number of records whose age is exactly 21.
# A single .loc call selects rows and column together instead of chaining [].
df.loc[df['age'] == 21, 'age'].value_counts()

21.0    610
Name: age, dtype: int64

In [5]:
# age
# Rows with age 0 are excluded from these statistics (0 appears to be the
# dataset's placeholder for an unknown age -- confirm against the cleaning step).
ages = df[df['age'] != 0]
print("Average age of fatalities: {:.2f} years old".format(ages['age'].mean()))
# The age column is stored as float; cast to int so the youngest age is
# printed the same way as the oldest ("1" rather than "1.0").
print("Youngest fatality: {} year(s) old".format(int(ages['age'].min())))
print("Oldest fatality: {} years old".format(int(ages['age'].max())))

# Tally fatalities per age bracket into `age_groups` (label -> count).
#
# Brackets are half-open intervals [lower, upper) evaluated on the whole
# column at once. Compared with testing `age in range(a, b)` row by row, this
# also places non-integer ages (e.g. 17.5) in the right bracket instead of
# letting them fall through to "unidentified", and avoids a slow per-row
# df.iloc lookup. Counts for whole-number ages are unchanged.
age = df['age']
inner_brackets = [
    ("18-24", 18, 25),
    ("25-34", 25, 35),
    ("35-44", 35, 45),
    ("45-54", 45, 55),
    ("55-64", 55, 65),
]

# Age 0 is excluded from "<18": it ends up in "unidentified" below.
age_groups = {"<18": int(((age > 0) & (age < 18)).sum())}
for label, lower, upper in inner_brackets:
    age_groups[label] = int(((age >= lower) & (age < upper)).sum())
age_groups["65+"] = int((age >= 65).sum())
# Everything not matched above: age 0, missing values and negative ages.
age_groups["unidentified"] = len(df) - sum(age_groups.values())

print("Fatalities by age group:")
for group, total in age_groups.items():
    print("{} : {}".format(group, total))

# gender
# Number of rows per gender code; rows coded '0' are reported as "Unidentified".
gender_col = df['gender']
print("\nFatalities by gender")
print("Male: {}".format((gender_col == 'M').sum()))
print("Female: {}".format((gender_col == 'F').sum()))
print("Unidentified: {}".format((gender_col == '0').sum()))

# citizenship
# One line per distinct citizenship, listed in order of first appearance in df.
citizenship_col = df['citizenship']
print("\nFatalities by citizenship")
citizenship = list(citizenship_col.unique())
for value in citizenship:
    matches = (citizenship_col == value).sum()
    print("{}: {}".format(value, matches))

# type of injury
# One line per distinct injury type, listed in order of first appearance in df.
# Values are printed exactly as stored, including the '0' placeholder.
injury_col = df['type_of_injury']
print("\nTypes of injuries/cause of death")
injury = list(injury_col.unique())
for value in injury:
    matches = (injury_col == value).sum()
    print("{}: {}".format(value, matches))

# year of death
# One line per distinct year, listed in order of first appearance in df.
# The `year` list is kept at module level because a later cell reads it.
year_col = df['year_of_death']
print("\nFatalities by year")
year = list(year_col.unique())
for value in year:
    matches = (year_col == value).sum()
    print("{}: {}".format(value, matches))

# killed by
# One line per distinct `killed_by` value, in order of first appearance in df.
killer_col = df['killed_by']
print("\nFatalities according to killer")
killer = list(killer_col.unique())
for value in killer:
    matches = (killer_col == value).sum()
    print("{}: {}".format(value, matches))

# event location region
# One line per distinct region, listed in order of first appearance in df.
region_col = df['event_location_region']
print("\nFatalities by region")
location = list(region_col.unique())
for value in location:
    matches = (region_col == value).sum()
    print("{}: {}".format(value, matches))

Average age of fatalities: 26.75 years old
Youngest fatality: 1.0 year(s) old
Oldest fatality: 112 years old
Fatalities by age group:
<18 : 2239
18-24 : 3818
25-34 : 2723
35-44 : 1002
45-54 : 616
55-64 : 323
65+ : 274
unidentified : 129

Fatalities by gender
Male: 9681
Female: 1423
Unidentified: 20

Fatalities by citizenship
Palestinian: 10092
Israeli: 1029
Jordanian: 2
American: 1

Types of injuries/cause of death
gunfire: 9849
stabbing: 48
hit by a vehicle: 18
explosion: 555
physical assault: 1
shelling: 311
being bludgeoned with an axe: 4
physically assaulted: 2
beating: 9
stones throwing: 6
Strangulation: 1
0: 291
fire: 4
house demolition: 25

Fatalities by year
2023: 250
2022: 205
2021: 324
2020: 31
2019: 146
2018: 301
2017: 80
2016: 116
2015: 179
2014: 2326
2013: 42
2012: 262
2011: 130
2010: 89
2009: 1059
2008: 900
2007: 400
2006: 681
2005: 240
2004: 922
2003: 732
2002: 1326
2001: 348
2000: 35

Fatalities according to killer
Israeli security forces: 10000
Palestinian civilians: 1

In [None]:
# Build a one-row-per-year summary of df and save it to by_year_dataset.csv.
# Columns: year, death_count, avg_age, top_gender, top_citizenship,
# top_injury, top_killer ("top" = most frequent value within that year).

# Take the years straight from df rather than from the `year` list built in an
# earlier cell. The previous loop rebound `year` to a single value, so this
# cell only worked once per kernel session.
unique_years = df['year_of_death'].dropna().drop_duplicates().sort_values()

# The age statistics earlier in this notebook leave out rows with age 0; mask
# them here too so the placeholder does not drag the yearly average down.
known_age = df['age'].where(df['age'] != 0)

summary_columns = ['year', 'death_count', 'avg_age', 'top_gender',
                   'top_citizenship', 'top_injury', 'top_killer']

records = []
for year_value in unique_years:
    in_year = df['year_of_death'] == year_value
    records.append({
        'year': year_value,
        'death_count': in_year.sum(),
        'avg_age': known_age.loc[in_year].mean(),
        # mode() returns values sorted ascending; [0] picks the first on ties.
        'top_gender': df.loc[in_year, 'gender'].mode()[0],
        'top_citizenship': df.loc[in_year, 'citizenship'].mode()[0],
        'top_injury': df.loc[in_year, 'type_of_injury'].mode()[0],
        'top_killer': df.loc[in_year, 'killed_by'].mode()[0],
    })

# Build the frame once from the collected records; `columns` fixes the order
# and keeps the header even if df has no rows.
df_year = pd.DataFrame(records, columns=summary_columns)
df_year.to_csv('by_year_dataset.csv', index=False)