In [36]:
import os
import pandas as pd
import numpy as np


In [37]:
# Resolve the raw dataset relative to the notebook's current working directory.
notebook_directory = os.getcwd()

# Build the relative path from individual components so the separator is
# chosen by the operating system. A literal backslash path only resolves on
# Windows; on POSIX systems backslashes are ordinary filename characters.
relative_path = os.path.join('..', 'data', 'raw', 'Palestine Body Count.csv')

# Absolute, normalised path to the raw CSV (the '..' segment is collapsed).
dataset_path = os.path.normpath(os.path.join(notebook_directory, relative_path))

In [38]:
# Load the raw CSV found at dataset_path into the working frame `df`.
df = pd.read_csv(dataset_path)
# Preview the first rows (up to 24) to eyeball the raw layout.
df.head(24)

Unnamed: 0,Year,Month,Palestinians Injuries,Israelis Injuries,Palestinians Killed,Israelis Killed
0,2000.0,DECEMBER,781.0,,51,8
1,2000.0,NOVEMBER,3838.0,,112,22
2,2000.0,OCTOBER,5984.0,,104,10
3,2000.0,SEPTEMBER,,,16,1
4,2001.0,DECEMBER,304.0,,67,36
5,2001.0,NOVEMBER,160.0,,39,14
6,2001.0,OCTOBER,407.0,,89,14
7,2001.0,SEPTEMBER,657.0,,59,13
8,2001.0,AUGUST,502.0,,37,26
9,2001.0,JULY,394.0,,32,10


In [39]:
# Normalise the column labels to snake_case: lower-case them and replace
# spaces with underscores. The result is rebound rather than mutated with
# inplace=True, so the cell stays chainable and safe to re-run.
df = df.rename(columns=lambda col: col.lower().replace(" ", "_"))

In [40]:
# Print the dtype and non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   249 non-null    float64
 1   month                  249 non-null    object 
 2   palestinians_injuries  196 non-null    object 
 3   israelis_injuries      133 non-null    object 
 4   palestinians_killed    250 non-null    object 
 5   israelis_killed        250 non-null    object 
dtypes: float64(1), object(5)
memory usage: 11.9+ KB


In [41]:
# List the column labels.
df.columns

Index(['year', 'month', 'palestinians_injuries', 'israelis_injuries',
       'palestinians_killed', 'israelis_killed'],
      dtype='object')

In [42]:
# Count the missing values in each column.
df.isnull().sum()

year                       2
month                      2
palestinians_injuries     55
israelis_injuries        118
palestinians_killed        1
israelis_killed            1
dtype: int64

In [43]:
# Count columns whose text values may contain commas that must be removed
# before the values can be converted to integers.
numerical_cols = ["palestinians_injuries", "israelis_injuries", "palestinians_killed", "israelis_killed"]


def format(cols, frame=None):
    """Remove every "," from the string values of the given columns, in place.

    NOTE(review): this name shadows the builtin ``format`` for the rest of the
    notebook; it is kept only because existing cells call it by this name.

    Parameters
    ----------
    cols : iterable of str
        Labels of the columns to clean.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    target = df if frame is None else frame
    for col in cols:
        # Only str values are rewritten. The `.str` accessor would turn any
        # non-string value (e.g. an int already present in the column) into
        # NaN and would raise on a fully numeric column, which makes the cell
        # unsafe to re-run after later cleaning steps.
        target[col] = target[col].map(
            lambda value: value.replace(",", "") if isinstance(value, str) else value
        )

# Remove the commas from the text values of every column in numerical_cols.
format(cols=numerical_cols)

In [44]:
# Inspect the distinct values in palestinians_injuries to spot non-numeric entries.
df["palestinians_injuries"].unique()

array(['781', '3838', '5984', nan, '304', '160', '407', '657', '502',
       '394', '319', '932', '715', '927', '598', '471', '185', '264',
       '388', '353', '186', '374', '299', '181', '523', '870', '429',
       '330', '322', '106', '289', '226', '191', '34', '367', '239',
       '303', '379', '244', '292', '161', '98', '343', '579', '287',
       '251', '377', '545', '346', '417', '437', '168', '166', '99', '81',
       '90', '130', '165', '116', '183', '68', '42', '164', '73', '491',
       '180', '196', '266', '799', '198', '257', '254', '203', '194',
       '88', '13720', '127', '152', '135', '154', '67', '162', '281',
       '115', '153', '256', '104', '195', '30', '80', '36', '26', '89',
       '82', '136', '87', '65', '5557', '105', '147', '148', '46', '85',
       '118', '119', '402', '92', '86', '210', '63', '110', '149', '156',
       '159', '397', '193', '204', '143', '2252', '169', '202', '231',
       '703', '404', '60', '320', '137', '138', '124', '656', '493',
     

In [45]:
# Trim leading and trailing whitespace from the month labels.
df["month"] = df["month"].str.strip()

In [46]:
# Inspect the distinct month labels.
df["month"].unique()

array(['DECEMBER', 'NOVEMBER', 'OCTOBER', 'SEPTEMBER', 'AUGUST', 'JULY',
       'JUNE', 'MAY', 'APRIL', 'MARCH', 'FEBRUARY', 'JANUARY',
       'MAY & JUNE', nan], dtype=object)

In [47]:
# Show the row(s) whose month label is the combined 'MAY & JUNE'.
df[df["month"] == 'MAY & JUNE']

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
203,2017.0,MAY & JUNE,(incl. Jun),(incl. Jun),6,0


In [48]:
# Relabel the combined 'MAY & JUNE' entry as plain 'MAY'.
df["month"] = df["month"].replace("MAY & JUNE", "MAY")

In [49]:
def mod_injuries(value):
    """Map the "(incl. Jun)" / "(incl. Aug)" placeholders to 0.

    Any other value is handed back unchanged.
    """
    placeholders = ("(incl. Jun)", "(incl. Aug)")
    return 0 if value in placeholders else value
# Replace the "(incl. ...)" placeholders with 0 in both injuries columns.
df["palestinians_injuries"] = df["palestinians_injuries"].apply(mod_injuries)
df["israelis_injuries"] = df["israelis_injuries"].apply(mod_injuries)

In [50]:
# List the column labels again.
df.columns

Index(['year', 'month', 'palestinians_injuries', 'israelis_injuries',
       'palestinians_killed', 'israelis_killed'],
      dtype='object')

In [51]:
# Count the missing values per column before they are filled.
df.isnull().sum()

year                       2
month                      2
palestinians_injuries     55
israelis_injuries        118
palestinians_killed        1
israelis_killed            1
dtype: int64

In [52]:
# Replace every missing value with 0 (rebinding instead of inplace=True).
# NOTE(review): this also fills `year` and `month`, so rows that had no
# year/month now carry 0 there; later cells select those rows via month == 0.
# It also makes an unreported injury count indistinguishable from a real zero.
df = df.fillna(0)

In [53]:
# List the rows whose month is 0, i.e. rows whose month was missing before the fill.
df[df["month"] == 0]

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
249,0.0,0,0,0,0,0
250,0.0,0,111475,5160,10000,1275


In [54]:
# Remove the row labelled 249, which holds only zeros after the fill.
# errors="ignore" keeps the cell re-runnable: once the row is gone, a plain
# drop(249) raises KeyError on the second execution.
df = df.drop(index=249, errors="ignore")

In [55]:
# Re-check which rows still have month == 0 after the drop.
df[df["month"] == 0]

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
250,0.0,0,111475,5160,10000,1275


In [56]:
# Columns that must end up as 64-bit integers.
intColumns = ["year","palestinians_injuries", "israelis_injuries", "palestinians_killed", "israelis_killed"]


def intype(columns, frame=None):
    """Cast the given columns to ``int64``, in place.

    Parameters
    ----------
    columns : iterable of str
        Labels of the columns to convert.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        Propagated from ``astype`` when a value cannot be converted to an
        integer (for example a string that still contains a comma).
    """
    target = df if frame is None else frame
    for col in columns:
        target[col] = target[col].astype("int64")
# Cast year and the four count columns to int64.
intype(columns=intColumns)
# Store year as text afterwards (going through int first drops the ".0" of the float).
df["year"] = df["year"].astype(str)

In [57]:
# Display the frame after the type conversion.
df

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
0,2000,DECEMBER,781,0,51,8
1,2000,NOVEMBER,3838,0,112,22
2,2000,OCTOBER,5984,0,104,10
3,2000,SEPTEMBER,0,0,16,1
4,2001,DECEMBER,304,0,67,36
...,...,...,...,...,...,...
245,2021,FEBRUARY,0,0,1,0
246,2021,MARCH,0,0,4,0
247,2021,APRIL,0,0,1,0
248,2021,MAY,0,0,26,3


In [58]:
# Show the rows from position 243 to the end of the frame.
df.iloc[243: , :]

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
243,2020,JANUARY,0,0,6,0
244,2021,JANUARY,0,0,4,0
245,2021,FEBRUARY,0,0,1,0
246,2021,MARCH,0,0,4,0
247,2021,APRIL,0,0,1,0
248,2021,MAY,0,0,26,3
250,0,0,111475,5160,10000,1275


In [59]:
# Confirm that no missing values remain.
df.isnull().sum()

year                     0
month                    0
palestinians_injuries    0
israelis_injuries        0
palestinians_killed      0
israelis_killed          0
dtype: int64

In [60]:
# Inspect the distinct year labels.
df["year"].unique()

array(['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021', '0'], dtype=object)

In [61]:
# Count the fully duplicated rows.
df.duplicated().sum()

0

In [62]:
# Preview the first five rows.
df.head()

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
0,2000,DECEMBER,781,0,51,8
1,2000,NOVEMBER,3838,0,112,22
2,2000,OCTOBER,5984,0,104,10
3,2000,SEPTEMBER,0,0,16,1
4,2001,DECEMBER,304,0,67,36


In [63]:
# Print the dtype and non-null count of every column after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 0 to 250
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   year                   250 non-null    object
 1   month                  250 non-null    object
 2   palestinians_injuries  250 non-null    int64 
 3   israelis_injuries      250 non-null    int64 
 4   palestinians_killed    250 non-null    int64 
 5   israelis_killed        250 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 13.7+ KB


In [64]:
# Show the last 20 rows.
df.tail(20)

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
230,2019,FEBRUARY,0,0,8,1
231,2019,JANUARY,0,0,9,0
232,2020,DECEMBER,0,0,2,1
233,2020,NOVEMBER,0,0,2,0
234,2020,OCTOBER,0,0,2,0
235,2020,SEPTEMBER,0,0,1,0
236,2020,AUGUST,0,0,3,1
237,2020,JULY,0,0,1,0
238,2020,JUNE,0,0,1,0
239,2020,MAY,0,0,4,1


In [65]:
# Remove the trailing row that has no month (month == 0 after the fill).
# Its counts are far larger than any monthly row, so it looks like a totals
# line - TODO confirm against the source data.
# Filtering on the sentinel instead of slicing off "the last row" keeps the
# cell idempotent: re-running df.iloc[:-1] would silently delete one more
# real month on every execution.
df = df[df["month"] != 0]

In [66]:
# Preview the first ten rows of the cleaned frame.
df.head(10)

Unnamed: 0,year,month,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
0,2000,DECEMBER,781,0,51,8
1,2000,NOVEMBER,3838,0,112,22
2,2000,OCTOBER,5984,0,104,10
3,2000,SEPTEMBER,0,0,16,1
4,2001,DECEMBER,304,0,67,36
5,2001,NOVEMBER,160,0,39,14
6,2001,OCTOBER,407,0,89,14
7,2001,SEPTEMBER,657,0,59,13
8,2001,AUGUST,502,0,37,26
9,2001,JULY,394,0,32,10


In [67]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,palestinians_injuries,israelis_injuries,palestinians_killed,israelis_killed
count,249.0,249.0,249.0,249.0
mean,447.690763,20.722892,40.160643,5.120482
std,1466.40772,149.791924,129.148851,11.653323
min,0.0,0.0,0.0,0.0
25%,46.0,0.0,4.0,0.0
50%,160.0,2.0,12.0,1.0
75%,302.0,15.0,37.0,5.0
max,13735.0,2347.0,1590.0,122.0


In [68]:
# Resolve the output location relative to the notebook's current working directory.
notebook_directory = os.getcwd()

# Build the relative path from individual components so the separator is
# chosen by the operating system; a literal backslash path only resolves on
# Windows.
relative_path = os.path.join('..', 'data', 'processed', 'Cleaned_Data.csv')

# Absolute, normalised path of the cleaned CSV.
# NOTE(review): this reuses the name `dataset_path`, which earlier pointed at
# the raw input file.
dataset_path = os.path.normpath(os.path.join(notebook_directory, relative_path))

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)

# Save data into CSV file (the frame's index is written as the first column).
df.to_csv(dataset_path)

In [69]:
# Resolve the output location relative to the notebook's current working directory.
notebook_directory = os.getcwd()

# Build the relative path from individual components so the separator is
# chosen by the operating system; a literal backslash path only resolves on
# Windows.
# NOTE(review): this writes the *cleaned* data into the raw-data folder, next
# to the original input. That looks unintended - confirm whether this second
# copy is needed before keeping the cell.
relative_path = os.path.join('..', 'data', 'raw', 'Cleaned_Data.csv')

# Absolute, normalised path of the second copy of the cleaned CSV.
dataset_path = os.path.normpath(os.path.join(notebook_directory, relative_path))

# Save data into CSV file (the frame's index is written as the first column).
df.to_csv(dataset_path)