# Cleaning Global Terrorism Dataset

In [1]:
import pandas as pd
import yaml

In [2]:
# Load the project settings from config.yaml. Loading is best-effort: a
# failure is reported but does not stop the notebook.
config = {}  # fallback so `config` is always defined, even when loading fails
try:
    with open("./../config.yaml", 'r') as file:
        # safe_load returns None for an empty document; keep the empty dict then.
        config = yaml.safe_load(file) or {}
except Exception as e:
    # Report the underlying error so the cause (missing file, bad YAML, ...) is visible.
    print(f'Error reading the config file: {e}')

Error reading the config file


## Reading the Data

In [3]:
# Load the raw spreadsheet into a DataFrame, using openpyxl as the Excel engine.
data = pd.read_excel(io='../data/raw/data.xlsx', engine='openpyxl')

In [4]:
# Preview the first five rows to sanity-check the load.
display(data.head(n=5))

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,NaT,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,NaT,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,NaT,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,NaT,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,NaT,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


## Dropping Data and Replacing Missing Values

In [5]:
# Keep only the columns that are relevant for the analysis.
columns_to_keep = [
    'eventid', 'iyear', 'country_txt', 'city', 'region_txt',
    'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt',
    'gname', 'nkill', 'nwound',
]
# Take an explicit copy: later cells modify this frame, and working on a bare
# selection of the raw frame can trigger SettingWithCopyWarning.
data = data[columns_to_keep].copy()

In [6]:
# Count the missing values in each column
data.isna().sum()

eventid                0
iyear                  0
country_txt            0
city                 427
region_txt             0
attacktype1_txt        0
targtype1_txt          0
weaptype1_txt          0
gname                  0
nkill              12527
nwound             19936
dtype: int64

In [7]:
# Replace NaN with 0 for nkill and nwound: a missing count is treated as no
# recorded deaths or injuries.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection; that chained in-place form is deprecated in pandas 2.x
# and does not update the DataFrame when Copy-on-Write is enabled.
data['nkill'] = data['nkill'].fillna(0)
data['nwound'] = data['nwound'].fillna(0)

In [12]:
# Replace NaN in the city column with 'Unknown'.
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which does not reliably update the DataFrame (deprecated chained
# in-place modification; a no-op under Copy-on-Write).
data['city'] = data['city'].fillna('Unknown')

In [13]:
# Re-count missing values per column; every count should now be zero
data.isna().sum()

eventid            0
iyear              0
country_txt        0
city               0
region_txt         0
attacktype1_txt    0
targtype1_txt      0
weaptype1_txt      0
gname              0
nkill              0
nwound             0
dtype: int64

## Checking Data Types

In [14]:
# Show each column's dtype to confirm it matches the kind of values stored.
# At this point nkill and nwound are still float64 (see the output below).
display(data.dtypes)

eventid              int64
iyear                int64
country_txt         object
city                object
region_txt          object
attacktype1_txt     object
targtype1_txt       object
weaptype1_txt       object
gname               object
nkill              float64
nwound             float64
dtype: object

In [15]:
# Cast the casualty counts from float to integer in a single astype call
data = data.astype({'nkill': int, 'nwound': int})

## Changing Column Names 

In [20]:
# Swap the dataset's abbreviated column names for readable snake_case names.
data = data.rename(
    columns={
        'eventid': 'event_id',
        'iyear': 'year',
        'country_txt': 'country',
        'region_txt': 'region',
        'attacktype1_txt': 'attack_type',
        'targtype1_txt': 'target_type',
        'weaptype1_txt': 'weapon_type',
        'gname': 'group_name',
        'nwound': 'number_wounded',
        'nkill': 'number_killed',
    }
)

## Saving Cleaned Data

In [21]:
# Write the cleaned frame to CSV, leaving out the row index.
data.to_csv(path_or_buf="../data/cleaned/cleaned_data.csv", index=False)