# Previsão de Ataques de Civis e Ocorrência de Sub-eventos durante o conflito entre Ucrânia e Rússia 
### Autores: Amarilda Chihepe, Ludmila Mucavele, Daniel Muaquiua 

### Pacotes

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Introdução

### Importação dos Datasets

In [2]:
# Load the two raw event tables from the local ./Datasets folder.
# df1: file named for 2014-2021; its first row is a tag row (e.g. '#date+year'), not an event.
# df2: file named for 2018-2023.
df1 = pd.read_csv('./Datasets/conflict_data_ukr-14-21.csv')
df2 = pd.read_csv('./Datasets/UkraineConflict-18-23.csv')

In [3]:
# Preview the first five rows of df1.
df1.head()

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
0,,,#date+year,,,,,,,,...,#date+end,,,,,#affected+killed,,,,
1,147997.0,UKR-2014-1-14085-0,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,5.0,0.0,0.0,5,5.0,5.0,369.0,
2,149221.0,UKR-2014-1-14085-1,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,1.0,1,1.0,1.0,369.0,
3,149248.0,UKR-2014-1-14085-3,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,10.0,4.0,0.0,12.0,26,26.0,26.0,369.0,
4,149249.0,UKR-2014-1-14085-4,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,2.0,2,2.0,2.0,369.0,


In [4]:
# Preview the first five rows of df2.
df2.head()

Unnamed: 0,EVENT_ID_CNTY,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,...,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,SOURCE_SCALE,NOTES,FATALITIES,TAGS,TIMESTAMP
0,ROU448,20-May-19,2019,1,Political violence,Violence against civilians,Attack,Police Forces of Romania (2016-2019) Coast Guard,,1,...,Coast of Constanta,44.156,28.948,2,Deschide; Hurriyet Daily; News.ro; CNN; TRT Haber,National-International,"On 20 May 2019, the Coast Guard of Romania fir...",0,,1649875498
1,ROU1885,28-Mar-22,2022,1,Strategic developments,Strategic developments,Disrupted weapons use,Military Forces of Romania (2021-),,1,...,Coast of Constanta,44.156,28.948,1,Adevarul; G4media,National,"Defusal: On 28 March 2022, Romanian minesweepe...",0,,1649184809
2,ROU1940,28-Jul-22,2022,1,Demonstrations,Protests,Peaceful protest,Protesters (Romania),Greenpeace,6,...,Coast of Constanta,44.156,28.948,1,News.ro,National,"On 28 July 2022, Greenpeace activists proteste...",0,crowd size=no report,1659462993
3,ROU1945,31-Jul-22,2022,1,Strategic developments,Strategic developments,Disrupted weapons use,Military Forces of Romania (2021-),,1,...,Coast of Constanta,44.156,28.948,1,Digi24,National,"Defusal: On 31 July 2022, Romanian Naval Force...",0,,1660055880
4,ROU1947,4-Aug-22,2022,1,Demonstrations,Protests,Peaceful protest,Protesters (Romania),Greenpeace,6,...,Coast of Constanta,44.156,28.948,1,News.ro,National,"On 4 August 2022, Greenpeace activists protest...",0,crowd size=no report,1660055882


## Data Assessing

* **Dataset 1 (Conflito de 2014 a 2021)**

In [5]:
# Dimensions (rows, columns) of df1, then per-column dtype and non-null counts.
print(df1.shape)
df1.info()

(2990, 50)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2990 entries, 0 to 2989
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2989 non-null   float64
 1   relid              2989 non-null   object 
 2   year               2990 non-null   object 
 3   active_year        2989 non-null   float64
 4   code_status        2989 non-null   object 
 5   type_of_violence   2989 non-null   float64
 6   conflict_dset_id   2989 non-null   float64
 7   conflict_new_id    2989 non-null   float64
 8   conflict_name      2989 non-null   object 
 9   dyad_dset_id       2989 non-null   float64
 10  dyad_new_id        2989 non-null   float64
 11  dyad_name          2989 non-null   object 
 12  side_a_dset_id     2989 non-null   float64
 13  side_a_new_id      2989 non-null   float64
 14  side_a             2990 non-null   object 
 15  side_b_dset_id     2989 non-null   float64
 16  side_b_new_id

In [6]:
# Count rows of df1 that are exact duplicates of an earlier row.
df1.duplicated().sum()

0

In [7]:
# Count missing values per column in df1.
df1.isnull().sum()

id                      1
relid                   1
year                    0
active_year             1
code_status             1
type_of_violence        1
conflict_dset_id        1
conflict_new_id         1
conflict_name           1
dyad_dset_id            1
dyad_new_id             1
dyad_name               1
side_a_dset_id          1
side_a_new_id           1
side_a                  0
side_b_dset_id          1
side_b_new_id           1
side_b                  0
number_of_sources       1
source_article          0
source_office           1
source_date             1
source_headline         0
source_original        19
where_prec              1
where_coordinates       0
where_description     311
adm_1                 228
adm_2                 707
latitude                0
longitude               0
geom_wkt                1
priogrid_gid            1
country                 0
iso3                    0
country_id              1
region                  0
event_clarity           1
date_prec   

In [8]:
# Number of unique values per feature in df1.
df1.nunique()

id                   2989
relid                2989
year                    9
active_year             2
code_status             1
type_of_violence        2
conflict_dset_id        5
conflict_new_id         5
conflict_name           5
dyad_dset_id            7
dyad_new_id             7
dyad_name               5
side_a_dset_id          2
side_a_new_id           2
side_a                  3
side_b_dset_id          7
side_b_new_id           7
side_b                  6
number_of_sources      14
source_article       2323
source_office         279
source_date          1508
source_headline      1817
source_original       812
where_prec              7
where_coordinates     402
where_description    1569
adm_1                   6
adm_2                  65
latitude              422
longitude             424
geom_wkt              424
priogrid_gid           32
country                 2
iso3                    2
country_id              1
region                  2
event_clarity           2
date_prec   

* **Dataset 2 (Conflito de 2018 a 2023)**

In [9]:
# Dimensions (rows, columns) of df2.
print(df2.shape)

# General information about df2: per-column dtype and non-null counts.
df2.info()

(96082, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96082 entries, 0 to 96081
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EVENT_ID_CNTY       96082 non-null  object 
 1   EVENT_DATE          96082 non-null  object 
 2   YEAR                96082 non-null  int64  
 3   TIME_PRECISION      96082 non-null  int64  
 4   DISORDER_TYPE       96082 non-null  object 
 5   EVENT_TYPE          96082 non-null  object 
 6   SUB_EVENT_TYPE      96082 non-null  object 
 7   ACTOR1              96082 non-null  object 
 8   ASSOC_ACTOR_1       6488 non-null   object 
 9   INTER1              96082 non-null  int64  
 10  ACTOR2              51829 non-null  object 
 11  ASSOC_ACTOR_2       14678 non-null  object 
 12  INTER2              96082 non-null  int64  
 13  INTERACTION         96082 non-null  int64  
 14  CIVILIAN_TARGETING  4188 non-null   object 
 15  ISO                 96082 non-null  int64

In [10]:
# Select the ACTOR2 entries of df2 that are missing (NaN); the Series length is the count of such rows.
df2[df2['ACTOR2'].isna()].ACTOR2

2        NaN
4        NaN
14       NaN
21       NaN
23       NaN
        ... 
96070    NaN
96071    NaN
96072    NaN
96073    NaN
96074    NaN
Name: ACTOR2, Length: 44253, dtype: object

## Data Cleaning

### Observations

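* The first row of df1 (index 0) holds tag metadata (e.g. `#date+year`) rather than an event
* `year` in df1 is stored as object type instead of integer
* Missing values in the `ACTOR2` feature of df2
* Datapoints from 2018-2021 not needed in df1
* `deaths_a`, `deaths_b`, `deaths_civilians`, `deaths_unknown` features values can be combined into a single `fatalities` feature
* All df2 feature names are in uppercase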
* Features to delete from df1:
    * id
    * relid
    * active_year
    * code_status
    * type_of_violence
    * conflict_dset_id
    * conflict_new_id
    * dyad_dset_id
    * dyad_new_id
    * dyad_name
    * side_a_dset_id
    * side_a_new_id
    * side_b_dset_id
    * side_b_new_id
    * number_of_sources
    * source_article
    * source_office
    * source_date
    * source_headline
    * source_original
    * where_prec
    * where_description
    * geom_wkt
    * priogrid_gid
    * iso3
    * country_id
    * region
    * event_clarity
    * date_prec
    * date_start
    * date_end
    * deaths_a
    * deaths_b
    * deaths_civilians
    * deaths_unknown
    * best
    * high
    * low
    * gwnoa
    * gwnob
* Features to delete from df2:
    * EVENT_ID_CNTY
    * EVENT_DATE
    * TIME_PRECISION
    * SOURCE
    * SOURCE_SCALE
    * NOTES
    * TAGS
    * TIMESTAMP
    * REGION
    * ISO
    * GEO_PRECISION
* Merge datasets

## Data Cleaning

* Drop the first row (index 0) of df1, which holds tag metadata rather than an event

#### Code

In [11]:
# The first row of df1 is not an event: it carries hashtag-style column tags
# (e.g. '#date+year' in `year`). Select it by content rather than by position
# so that re-running this cell cannot discard a genuine event row.
tag_rows = df1.index[df1['year'].astype(str).str.startswith('#')]
df1.drop(index=tag_rows, inplace=True)

# Renumber the rows from 0 after the removal.
df1.reset_index(drop=True, inplace=True)

#### Test

In [12]:
# Check: the first rows should now all be events, with no tag row at index 0.
df1.head()

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
0,147997.0,UKR-2014-1-14085-0,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,5.0,0.0,0.0,5,5.0,5.0,369.0,
1,149221.0,UKR-2014-1-14085-1,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,1.0,1,1.0,1.0,369.0,
2,149248.0,UKR-2014-1-14085-3,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,10.0,4.0,0.0,12.0,26,26.0,26.0,369.0,
3,149249.0,UKR-2014-1-14085-4,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,2.0,2,2.0,2.0,369.0,
4,149251.0,UKR-2014-1-14085-6,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,6.0,6,6.0,6.0,369.0,


* Change `year` of df1 to integer type

#### Code

In [13]:
# `year` was read as text because of the tag row; convert it to integers.
df1['year'] = df1['year'].astype(int)

#### Test

In [14]:
# Check: `year` should now report an integer dtype.
df1.year.dtype

dtype('int32')

* Impute missing values in the `ACTOR2` feature of df2 with the string 'None'

#### Code

In [15]:
# Replace missing ACTOR2 entries with the placeholder string 'None'.
df2['ACTOR2'] = df2['ACTOR2'].fillna('None')

#### Test

In [16]:
# Check: rows whose ACTOR2 is the 'None' placeholder; the row count should equal the earlier NaN count.
df2[df2['ACTOR2'] == 'None'].shape

(44253, 31)

* Delete datapoints from 2018-2021 in df1

#### Code

In [17]:
# Remove every df1 event dated 2018 or later, then show the remaining shape.
# Row labels are not renumbered here, so the index keeps gaps.
rows_from_2018_on = df1.index[df1['year'] >= 2018]
df1.drop(index=rows_from_2018_on, inplace=True)
df1.shape

(2325, 50)

#### Test

In [18]:
# Check: this selection should be empty (no events from 2018 onwards remain).
df1[df1.year >= 2018]

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob


* Join `deaths_a`, `deaths_b`, `deaths_civilians`, `deaths_unknown` features values as `fatalities`

#### Code

In [19]:
# Total fatalities per event: element-wise sum of the four per-category death counts.
death_columns = ['deaths_a', 'deaths_b', 'deaths_civilians', 'deaths_unknown']
fatalities = df1[death_columns[0]].values
for column in death_columns[1:]:
    fatalities = fatalities + df1[column].values
df1['fatalities'] = fatalities

# The per-category columns are no longer needed once the total exists.
df1.drop(columns=death_columns, inplace=True)

#### Test

In [20]:
# Check: four columns were removed and one added, and `fatalities` holds the summed counts.
print(df1.shape)
df1['fatalities']

(2325, 47)


0        5.0
1        1.0
2       26.0
3        2.0
4        6.0
        ... 
2828     1.0
2829     4.0
2830     1.0
2831     1.0
2832     1.0
Name: fatalities, Length: 2325, dtype: float64

* Delete features from df1:
    * id
    * relid
    * active_year
    * code_status
    * type_of_violence
    * conflict_dset_id
    * conflict_new_id
    * dyad_dset_id
    * dyad_new_id
    * dyad_name
    * side_a_dset_id
    * side_a_new_id
    * side_b_dset_id
    * side_b_new_id
    * number_of_sources
    * source_article
    * source_office
    * source_date
    * source_headline
    * source_original
    * where_prec
    * where_description
    * geom_wkt
    * priogrid_gid
    * iso3
    * country_id
    * region
    * event_clarity
    * date_prec
    * date_start
    * date_end
    * deaths_a
    * deaths_b
    * deaths_civilians
    * deaths_unknown
    * best
    * high
    * low
    * gwnoa
    * gwnob

#### Code

In [21]:
# Columns of df1 that are retained; every other column is dropped.
# Selecting by name instead of by position keeps this cell independent of the
# column order and safe to re-run (a second run finds nothing left to drop).
kept_columns = [
    'year', 'conflict_name', 'side_a', 'side_b', 'where_coordinates',
    'adm_1', 'adm_2', 'latitude', 'longitude', 'country', 'fatalities',
]
index = df1.columns.difference(kept_columns).tolist()
df1.drop(columns=index, inplace=True)

#### Test

In [22]:
# Check: list the columns that remain in df1.
df1.columns

Index(['year', 'conflict_name', 'side_a', 'side_b', 'where_coordinates',
       'adm_1', 'adm_2', 'latitude', 'longitude', 'country', 'fatalities'],
      dtype='object')

* Delete features from df2:
    * EVENT_ID_CNTY
    * EVENT_DATE
    * TIME_PRECISION
    * SOURCE
    * SOURCE_SCALE
    * NOTES
    * TAGS
    * TIMESTAMP
    * REGION
    * ISO
    * GEO_PRECISION

#### Code

In [23]:
# Remove the df2 columns that are not used further (identifiers, dates,
# source/notes metadata, region/ISO codes and geo precision).
df2.drop(
    columns=[
        'EVENT_ID_CNTY',
        'EVENT_DATE',
        'TIME_PRECISION',
        'SOURCE',
        'SOURCE_SCALE',
        'NOTES',
        'TAGS',
        'TIMESTAMP',
        'REGION',
        'ISO',
        'GEO_PRECISION',
    ],
    inplace=True,
)

#### Test

In [24]:
# Check: list the columns that remain in df2.
df2.columns

Index(['YEAR', 'DISORDER_TYPE', 'EVENT_TYPE', 'SUB_EVENT_TYPE', 'ACTOR1',
       'ASSOC_ACTOR_1', 'INTER1', 'ACTOR2', 'ASSOC_ACTOR_2', 'INTER2',
       'INTERACTION', 'CIVILIAN_TARGETING', 'COUNTRY', 'ADMIN1', 'ADMIN2',
       'ADMIN3', 'LOCATION', 'LATITUDE', 'LONGITUDE', 'FATALITIES'],
      dtype='object')

* Lowercase all df2 feature names

#### Code

In [25]:
# Convert every df2 column name to lowercase.
df2.rename(columns=str.lower, inplace=True)

#### Test

In [26]:
# Check: all df2 column names should now be lowercase.
df2.columns

Index(['year', 'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'country', 'admin1', 'admin2',
       'admin3', 'location', 'latitude', 'longitude', 'fatalities'],
      dtype='object')

* Merge datasets