# Previsão de Ataques de Civis e Ocorrência de Sub-eventos durante o conflito entre Ucrânia e Rússia 
### Autores: Amarilda Chihepe, Ludmila Mucavele, Daniel Muaquiua 

### Pacotes

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Introdução


### Importação dos Datasets

In [2]:
df1 = pd.read_csv('./Datasets/conflict_data_ukr-14-21.csv')
df2 = pd.read_csv('./Datasets/UkraineConflict-18-23.csv')

In [3]:
df1.head()

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
0,,,#date+year,,,,,,,,...,#date+end,,,,,#affected+killed,,,,
1,147997.0,UKR-2014-1-14085-0,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,5.0,0.0,0.0,5,5.0,5.0,369.0,
2,149221.0,UKR-2014-1-14085-1,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,1.0,1,1.0,1.0,369.0,
3,149248.0,UKR-2014-1-14085-3,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,10.0,4.0,0.0,12.0,26,26.0,26.0,369.0,
4,149249.0,UKR-2014-1-14085-4,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,2.0,2,2.0,2.0,369.0,


In [4]:
df2.head()

Unnamed: 0,EVENT_ID_CNTY,EVENT_DATE,YEAR,TIME_PRECISION,DISORDER_TYPE,EVENT_TYPE,SUB_EVENT_TYPE,ACTOR1,ASSOC_ACTOR_1,INTER1,...,LOCATION,LATITUDE,LONGITUDE,GEO_PRECISION,SOURCE,SOURCE_SCALE,NOTES,FATALITIES,TAGS,TIMESTAMP
0,ROU448,20-May-19,2019,1,Political violence,Violence against civilians,Attack,Police Forces of Romania (2016-2019) Coast Guard,,1,...,Coast of Constanta,44.156,28.948,2,Deschide; Hurriyet Daily; News.ro; CNN; TRT Haber,National-International,"On 20 May 2019, the Coast Guard of Romania fir...",0,,1649875498
1,ROU1885,28-Mar-22,2022,1,Strategic developments,Strategic developments,Disrupted weapons use,Military Forces of Romania (2021-),,1,...,Coast of Constanta,44.156,28.948,1,Adevarul; G4media,National,"Defusal: On 28 March 2022, Romanian minesweepe...",0,,1649184809
2,ROU1940,28-Jul-22,2022,1,Demonstrations,Protests,Peaceful protest,Protesters (Romania),Greenpeace,6,...,Coast of Constanta,44.156,28.948,1,News.ro,National,"On 28 July 2022, Greenpeace activists proteste...",0,crowd size=no report,1659462993
3,ROU1945,31-Jul-22,2022,1,Strategic developments,Strategic developments,Disrupted weapons use,Military Forces of Romania (2021-),,1,...,Coast of Constanta,44.156,28.948,1,Digi24,National,"Defusal: On 31 July 2022, Romanian Naval Force...",0,,1660055880
4,ROU1947,4-Aug-22,2022,1,Demonstrations,Protests,Peaceful protest,Protesters (Romania),Greenpeace,6,...,Coast of Constanta,44.156,28.948,1,News.ro,National,"On 4 August 2022, Greenpeace activists protest...",0,crowd size=no report,1660055882


## Data Assessing

* **Dataset 1 (Conflicto de 2014 a 2021)**

In [5]:
print(df1.shape)
df1.info()

(2990, 50)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2990 entries, 0 to 2989
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 2989 non-null   float64
 1   relid              2989 non-null   object 
 2   year               2990 non-null   object 
 3   active_year        2989 non-null   float64
 4   code_status        2989 non-null   object 
 5   type_of_violence   2989 non-null   float64
 6   conflict_dset_id   2989 non-null   float64
 7   conflict_new_id    2989 non-null   float64
 8   conflict_name      2989 non-null   object 
 9   dyad_dset_id       2989 non-null   float64
 10  dyad_new_id        2989 non-null   float64
 11  dyad_name          2989 non-null   object 
 12  side_a_dset_id     2989 non-null   float64
 13  side_a_new_id      2989 non-null   float64
 14  side_a             2990 non-null   object 
 15  side_b_dset_id     2989 non-null   float64
 16  side_b_new_id

In [6]:
df1.duplicated().sum()

0

In [7]:
df1.isnull().sum()

id                      1
relid                   1
year                    0
active_year             1
code_status             1
type_of_violence        1
conflict_dset_id        1
conflict_new_id         1
conflict_name           1
dyad_dset_id            1
dyad_new_id             1
dyad_name               1
side_a_dset_id          1
side_a_new_id           1
side_a                  0
side_b_dset_id          1
side_b_new_id           1
side_b                  0
number_of_sources       1
source_article          0
source_office           1
source_date             1
source_headline         0
source_original        19
where_prec              1
where_coordinates       0
where_description     311
adm_1                 228
adm_2                 707
latitude                0
longitude               0
geom_wkt                1
priogrid_gid            1
country                 0
iso3                    0
country_id              1
region                  0
event_clarity           1
date_prec   

In [8]:
# looking for numer of uninque values per feature
df1.nunique()

id                   2989
relid                2989
year                    9
active_year             2
code_status             1
type_of_violence        2
conflict_dset_id        5
conflict_new_id         5
conflict_name           5
dyad_dset_id            7
dyad_new_id             7
dyad_name               5
side_a_dset_id          2
side_a_new_id           2
side_a                  3
side_b_dset_id          7
side_b_new_id           7
side_b                  6
number_of_sources      14
source_article       2323
source_office         279
source_date          1508
source_headline      1817
source_original       812
where_prec              7
where_coordinates     402
where_description    1569
adm_1                   6
adm_2                  65
latitude              422
longitude             424
geom_wkt              424
priogrid_gid           32
country                 2
iso3                    2
country_id              1
region                  2
event_clarity           2
date_prec   

* **Dataset 2 (Conflicto de 2018 a 2023)**

In [9]:
print(df2.shape)

# general information of df2
df2.info()

(96082, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96082 entries, 0 to 96081
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EVENT_ID_CNTY       96082 non-null  object 
 1   EVENT_DATE          96082 non-null  object 
 2   YEAR                96082 non-null  int64  
 3   TIME_PRECISION      96082 non-null  int64  
 4   DISORDER_TYPE       96082 non-null  object 
 5   EVENT_TYPE          96082 non-null  object 
 6   SUB_EVENT_TYPE      96082 non-null  object 
 7   ACTOR1              96082 non-null  object 
 8   ASSOC_ACTOR_1       6488 non-null   object 
 9   INTER1              96082 non-null  int64  
 10  ACTOR2              51829 non-null  object 
 11  ASSOC_ACTOR_2       14678 non-null  object 
 12  INTER2              96082 non-null  int64  
 13  INTERACTION         96082 non-null  int64  
 14  CIVILIAN_TARGETING  4188 non-null   object 
 15  ISO                 96082 non-null  int64

In [10]:
# looking for null values from ACTOR2 in df2
df2['ACTOR2'].isna().sum()

44253

In [11]:
# looking for disorder type values 
df1.conflict_name.value_counts()

Ukraine: Novorossiya                                                              2032
Ukraine: Donetsk                                                                   656
Ukraine: Lugansk                                                                   294
Ukraine: Government                                                                  6
Supporters of independence for Eastern Ukraine - Supporters of Ukrainian unity       1
Name: conflict_name, dtype: int64

In [12]:
df1.side_a.value_counts()

Government of Ukraine                             2988
#group+name+first                                    1
Supporters of independence for Eastern Ukraine       1
Name: side_a, dtype: int64

In [13]:
# looking for disorder type values 
df2.DISORDER_TYPE.value_counts()

Political violence                    87884
Demonstrations                         5799
Strategic developments                 2381
Political violence; Demonstrations       18
Name: DISORDER_TYPE, dtype: int64

In [14]:
# looking for associated actors
df2.ASSOC_ACTOR_2.value_counts()

Donbass People's Militia                                                                                         9155
Luhansk People's Militia                                                                                         2825
Donbass People's Militia; Civilians (Ukraine)                                                                     638
Civilians (Ukraine)                                                                                               272
Luhansk People's Militia; Civilians (Ukraine)                                                                     161
                                                                                                                 ... 
Lawyers (Ukraine); Former Government of Ukraine (2019-)                                                             1
European Solidarity; Government of Ukraine (2019-)                                                                  1
Refugees/IDPs (Afghanistan); Civilians (Syria); Refugees

### Observations

* First index of df1
* Year of df1 as object type
* Missing values in Actor 2 feature from df2
* Datapoints from 2018-2021 not needed in df1
* `deaths_a`, `deaths_b`, `deaths_civilians`, `deaths_unknown` features values
* All df2 features in Uppercase
* Features to delete from df1:
    * id
    * relid
    * active_year
    * code_status
    * code_status
    * code_status
    * conflict_new_id
    * conflict_name
    * dyad_dset_id
    * dyad_new_id
    * dyad_name
    * side_a_dset_id
    * side_a_new_id
    * side_b_dset_id
    * side_b_dset_id
    * number_of_sources
    * source_article
    * source_article
    * source_date
    * source_headline
    * source_original
    * where_prec
    * where_description
    * geom_wkt
    * priogrid_gid
    * country_id
    * event_clarity
    * date_prec
    * date_start
    * date_end
    * date_end
    * deaths_b
    * deaths_civilians
    * deaths_civilians
    * best
    * high
    * low
    * gwnoa
    * gwnob
* Features to delete from df2:
    * EVENT_ID_CNTY
    * EVENT_DATE
    * TIME_PRECISION
    * SOURCE
    * SOURCE_SCALE
    * NOTES
    * TAGS
    * TIMESTAMP
* Unconcatenated datasets
* Too many associated actors in a column.
* `civilian_targeting` non-binary type

## Data Cleaning

* Drop first index of df1 

#### Code

In [15]:
df1.drop(0, axis = 0, inplace = True)
df1.reset_index(drop = True, inplace = True)

#### Test

In [16]:
df1.head()

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
0,147997.0,UKR-2014-1-14085-0,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,5.0,0.0,0.0,5,5.0,5.0,369.0,
1,149221.0,UKR-2014-1-14085-1,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,1.0,1,1.0,1.0,369.0,
2,149248.0,UKR-2014-1-14085-3,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,10.0,4.0,0.0,12.0,26,26.0,26.0,369.0,
3,149249.0,UKR-2014-1-14085-4,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,2.0,2,2.0,2.0,369.0,
4,149251.0,UKR-2014-1-14085-6,2014,1.0,Clear,1.0,13219.0,13219.0,Ukraine: Government,14085.0,...,00:00.0,0.0,0.0,0.0,6.0,6,6.0,6.0,369.0,


* Change `year` of df1 to integer type

#### Code

In [17]:
df1.year = df1.year.astype('int64')

#### Test

In [18]:
df1.year.dtype

dtype('int64')

* Imputing missing values in `Actor2` feature from df2 with 'None'

#### Code

In [19]:
df2.loc[df2['ACTOR2'].isna(), 'ACTOR2'] = 'None'

#### Test

In [20]:
df2[df2['ACTOR2'] == 'None'].shape

(44253, 31)

* Delete datapoints from 2018-2021 in df1

#### Code

In [21]:
index = df1[df1.year >= 2018].index.values
df1.drop(index, axis = 0, inplace = True)
df1.shape

(2325, 50)

#### Test

In [22]:
df1[df1.year >= 2018]

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob


* Join `deaths_a`, `deaths_b`, `deaths_civilians`, `deaths_unknown` features values as `fatalities`

#### Code

In [23]:
# joining columns values into one new column
a = df1.deaths_a.values
b = df1.deaths_b.values
c = df1.deaths_civilians.values
d = df1.deaths_unknown.values
df1['fatalities'] = a + b + c + d

# dropping the columns
df1.drop(columns = ['deaths_a', 'deaths_b', 'deaths_civilians', 'deaths_unknown'], inplace = True)

#### Test

In [24]:
print(df1.shape)
df1['fatalities']

(2325, 47)


0        5.0
1        1.0
2       26.0
3        2.0
4        6.0
        ... 
2828     1.0
2829     4.0
2830     1.0
2831     1.0
2832     1.0
Name: fatalities, Length: 2325, dtype: float64

*  Delete features from df1
    * id
    * relid
    * active_year
    * code_status 
    * conflict_new_id
    * conflict_name
    * dyad_dset_id
    * dyad_new_id
    * dyad_name
    * side_a_dset_id
    * side_a_new_id
    * side_b_dset_id
    * side_b_dset_id
    * number_of_sources
    * source_article
    * source_article
    * source_date
    * source_headline
    * source_original
    * where_prec
    * where_description
    * geom_wkt
    * priogrid_gid
    * country_id
    * event_clarity
    * date_prec
    * date_start
    * date_end
    * date_end
    * deaths_b
    * deaths_civilians
    * deaths_civilians
    * best
    * high
    * low
    * gwnoa
    * gwnob

#### Code

In [25]:
index = list(df1.columns[[0, 1]]) + df1.columns[3:8].tolist() + df1.columns[9:14].tolist() + list(df1.columns[[15, 16]]) + df1.columns[18:25].tolist() + [df1.columns[26]] + [df1.columns[31]] + [df1.columns[32]] + df1.columns[34:46].tolist()
df1.drop(columns=index, inplace=True)

#### Test

In [26]:
df1.columns

Index(['year', 'conflict_name', 'side_a', 'side_b', 'where_coordinates',
       'adm_1', 'adm_2', 'latitude', 'longitude', 'country', 'fatalities'],
      dtype='object')

*  Delete features from df2:
    * EVENT_ID_CNTY
    * EVENT_DATE
    * TIME_PRECISION
    * SOURCE
    * SOURCE_SCALE
    * NOTES
    * TAGS
    * TIMESTAMP

#### Code

In [27]:
columns = ['EVENT_ID_CNTY', 'EVENT_DATE', 'TIME_PRECISION', 'SOURCE', 'SOURCE_SCALE', 'NOTES', 'TAGS', 'TIMESTAMP', 'REGION', 'ISO', 'GEO_PRECISION']
df2.drop(columns = columns, inplace=True)

#### Test

In [28]:
df2.columns

Index(['YEAR', 'DISORDER_TYPE', 'EVENT_TYPE', 'SUB_EVENT_TYPE', 'ACTOR1',
       'ASSOC_ACTOR_1', 'INTER1', 'ACTOR2', 'ASSOC_ACTOR_2', 'INTER2',
       'INTERACTION', 'CIVILIAN_TARGETING', 'COUNTRY', 'ADMIN1', 'ADMIN2',
       'ADMIN3', 'LOCATION', 'LATITUDE', 'LONGITUDE', 'FATALITIES'],
      dtype='object')

* Lowcase all df2 features

#### Code

In [29]:
df2.columns = df2.columns.str.lower()

#### Test

In [30]:
df2.columns

Index(['year', 'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'country', 'admin1', 'admin2',
       'admin3', 'location', 'latitude', 'longitude', 'fatalities'],
      dtype='object')

* Concatenate datasets

#### Code

In [31]:
df1.columns

Index(['year', 'conflict_name', 'side_a', 'side_b', 'where_coordinates',
       'adm_1', 'adm_2', 'latitude', 'longitude', 'country', 'fatalities'],
      dtype='object')

In [32]:
df2.columns

Index(['year', 'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'country', 'admin1', 'admin2',
       'admin3', 'location', 'latitude', 'longitude', 'fatalities'],
      dtype='object')

In [33]:
df1.rename(columns = {'side_a' : 'actor1', 'side_b' : 'actor2', 'where_coordinates' : 'location', 'adm_1': 'admin1',
                     'adm_2': 'admin2', 'conflict_name': 'disorder_type'}, inplace = True)

In [34]:
# replacing actor terms  Supporters
df1.disorder_type = df1.disorder_type.apply(lambda x: 'Demonstrations' if 'Supporters' in x else 'Political violence')
df1.disorder_type.value_counts()

Political violence    2324
Demonstrations           1
Name: disorder_type, dtype: int64

In [35]:
# replacing actor terms
df1.actor1 = df1.actor1.apply(lambda x: 'Military Forces of Ukraine (2014-2019)' if 'Government of Ukraine' in x else x)
df1.actor2 = df1.actor2.apply(lambda x: 'Military Forces of Ukraine (2014-2019)' if 'Government of Ukraine' in x else x)
df1.actor1.value_counts()

Military Forces of Ukraine (2014-2019)            2324
Supporters of independence for Eastern Ukraine       1
Name: actor1, dtype: int64

In [36]:
# replacing actor terms
df1.actor1 = df1.actor1.apply(lambda x: 'NAF: United Armed Forces of Novorossiya' if 'United Armed Forces of Novorossiya' in x else x)
df1.actor2 = df1.actor2.apply(lambda x: 'NAF: United Armed Forces of Novorossiya' if 'United Armed Forces of Novorossiya' in x else x)
df1.actor2.value_counts()

DPR                                        1066
NAF: United Armed Forces of Novorossiya     870
LPR                                         382
Maidan                                        6
Supporters of Ukrainian unity                 1
Name: actor2, dtype: int64

In [37]:
# Spilt the string and keeping only the first associate actor
df2.assoc_actor_1 = df2.assoc_actor_1.apply(lambda x: str(x).split(';')[0])
df2.assoc_actor_2 = df2.assoc_actor_2.apply(lambda x: str(x).split(';')[0])
df2.assoc_actor_1.value_counts()

nan                                                                 89594
Military Forces of Russia (2000-)                                    1064
National Corps Party                                                  865
Labor Group (Ukraine)                                                 614
Military Forces of Russia (2000-) Air Force                           361
                                                                    ...  
Police Forces of Ukraine (2014-2019) Special Tasks Patrol Police        1
Police Forces of Russia (2000-) Centre for Combating Extremism          1
Journalists (International)                                             1
KPU: Communist Party of Ukraine                                         1
Civil Control                                                           1
Name: assoc_actor_1, Length: 175, dtype: int64

In [38]:
# Imputing 'nan' value with 'None' 
df2.assoc_actor_1 = df2.assoc_actor_1.apply(lambda x: 'None' if 'nan' in x else x)
df2.assoc_actor_2 = df2.assoc_actor_2.apply(lambda x: 'None' if 'nan' in x else x)
df2.assoc_actor_1.value_counts()

None                                                                89594
Military Forces of Russia (2000-)                                    1064
National Corps Party                                                  865
Labor Group (Ukraine)                                                 614
Military Forces of Russia (2000-) Air Force                           361
                                                                    ...  
Police Forces of Ukraine (2014-2019) Special Tasks Patrol Police        1
Police Forces of Russia (2000-) Centre for Combating Extremism          1
Journalists (International)                                             1
KPU: Communist Party of Ukraine                                         1
Civil Control                                                           1
Name: assoc_actor_1, Length: 175, dtype: int64

In [39]:
# Concatenating the two dataframes
df = pd.concat([df2, df1], axis = 0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98407 entries, 0 to 2832
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   year                98407 non-null  int64  
 1   disorder_type       98407 non-null  object 
 2   event_type          96082 non-null  object 
 3   sub_event_type      96082 non-null  object 
 4   actor1              98407 non-null  object 
 5   assoc_actor_1       96082 non-null  object 
 6   inter1              96082 non-null  float64
 7   actor2              98407 non-null  object 
 8   assoc_actor_2       96082 non-null  object 
 9   inter2              96082 non-null  float64
 10  interaction         96082 non-null  float64
 11  civilian_targeting  4188 non-null   object 
 12  country             98407 non-null  object 
 13  admin1              98295 non-null  object 
 14  admin2              97978 non-null  object 
 15  admin3              93680 non-null  object 
 16  locat

#### Test

In [40]:
print(df.shape)
df.columns

(98407, 20)


Index(['year', 'disorder_type', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'civilian_targeting', 'country', 'admin1', 'admin2',
       'admin3', 'location', 'latitude', 'longitude', 'fatalities'],
      dtype='object')

* Impute `None` or `0` to `NaN` values

#### Code

In [41]:
def isNaNtoNone(column):
    column = column.apply(lambda x: 'None' if pd.isna(x) else x)
    return column
def isNaNto0(column):
    column = column.apply(lambda x: int(0) if pd.isna(x) else x)
    return column

In [42]:
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = isNaNtoNone(df[i])
    else:
        df[i] = isNaNto0(df[i])

#### Test

In [43]:
df.head()

Unnamed: 0,year,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,actor2,assoc_actor_2,inter2,interaction,civilian_targeting,country,admin1,admin2,admin3,location,latitude,longitude,fatalities
0,2019,Political violence,Violence against civilians,Attack,Police Forces of Romania (2016-2019) Coast Guard,,1.0,Civilians (Turkey),Fishers (Turkey),7.0,17.0,Civilian targeting,Romania,Constanta,,,Coast of Constanta,44.156,28.948,0.0
1,2022,Strategic developments,Strategic developments,Disrupted weapons use,Military Forces of Romania (2021-),,1.0,Unidentified Military Forces,,8.0,18.0,,Romania,Constanta,,,Coast of Constanta,44.156,28.948,0.0
2,2022,Demonstrations,Protests,Peaceful protest,Protesters (Romania),Greenpeace,6.0,,,0.0,60.0,,Romania,Constanta,,,Coast of Constanta,44.156,28.948,0.0
3,2022,Strategic developments,Strategic developments,Disrupted weapons use,Military Forces of Romania (2021-),,1.0,Unidentified Armed Group (International),,3.0,13.0,,Romania,Constanta,,,Coast of Constanta,44.156,28.948,0.0
4,2022,Demonstrations,Protests,Peaceful protest,Protesters (Romania),Greenpeace,6.0,,,0.0,60.0,,Romania,Constanta,,,Coast of Constanta,44.156,28.948,0.0


* Transform `civilian_targeting` to binary type and transform `latitude` and `longitude` to float type

#### Code

In [44]:
# transform civilian_targeting to binary type
df.civilian_targeting = df.civilian_targeting.apply(lambda x: '1' if 'Civilian targeting' in x else '0')
df.civilian_targeting = df.civilian_targeting.astype(int)

# transform latitude and longitude to float type
df.latitude = df.latitude.astype(float)
df.longitude = df.longitude.astype(float)

#### Test

In [45]:
print(df.latitude.dtype)
df.civilian_targeting.value_counts()

float64


0    94219
1     4188
Name: civilian_targeting, dtype: int64

### Saving Cleaned Dataset

In [46]:
df.to_csv('Ukraine_Conflict_cleaned-14-23.csv', index = False)

## Pre-Processing and Normalization

In [47]:
# listing categorical features
categorical_features = ['disorder_type', 'event_type', 'sub_event_type', 'actor1', 'assoc_actor_1', 'actor2', 'assoc_actor_2',
                        'country', 'admin1', 'admin2', 'admin3', 'location']
numeric_features = ['year', 'civilian_targeting', 'inter1', 'inter2', 'interaction', 'latitude', 'longitude', 'fatalities']
# 2 e -4/16

* Remove Puntuaction

In [48]:
import re

def remove_punct(text):
    pattern = r'\(|\)|-|/'
    return re.sub(pattern, ' ', text)

In [49]:
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] =  df[i].apply(lambda x: remove_punct(x))

* Convert text to lowercase

In [50]:
# Convert text to lowercase
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove extra spaces
    text = ' '.join(text.split())
    
    return text

In [51]:
# Apply normalization to all string columns
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = df[i].apply(normalize_text)

* Tokenizing and Padding sequences

In [52]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()

# Fit the Tokenizer on the text data
for i in categorical_features:
    tokenizer.fit_on_texts(df[i]) #creating vocabulary

# Tokenize each text feature in the DataFrame
sequences = {} # Dictionary to store sequences for each feature
for i in categorical_features:
    sequences[i] = tokenizer.texts_to_sequences(df[i]) # converting to sequences
    

In [69]:
# Pad sequences to match the same length

#data_dict = df.to_dict(orient='dict')
#data = []
max_length = 20
x_categorical_tensors, y_categorical_tensors = [], []
x_numerical_tensors, y_numerical_tensors = [], []

for feature, seq in sequences.items():
    padded_seq = pad_sequences(seq, maxlen = max_length, padding = 'post')
    tensor_sequences = tf.constant(padded_seq, dtype = tf.float32)
    mask = tf.cast(tf.math.not_equal(padded_seq, 0), dtype = tf.float32)  # Create a mask tensor based on non-zero values
    if feature != 'sub_event_type':
        x_categorical_tensors.append(tensor_sequences)
    else:
        y_categorical_tensors.append(tensor_sequences)
        
for feature in numeric_features:
    tensor = tf.constant(df[feature].values, dtype = tf.float32)
    if feature != 'civilian_targeting':
        x_numerical_tensors.append(tf.reshape(tensor, (-1, 1)))
    else:
        y_numerical_tensors.append(tf.reshape(tensor, (-1, 1)))
    
# Concatenate tensors along the appropriate axis
x_categorical_tensor = tf.concat(x_categorical_tensors, axis = 1)
y_categorical_tensor = tf.concat(y_categorical_tensors, axis = 1 )

import tensorflow as tf

x_numerical_tensor = tf.concat(x_numerical_tensors, axis= 1)
y_numerical_tensor = tf.concat(y_numerical_tensors, axis= 1)


In [55]:
x_categorical_tensor

<tf.Tensor: shape=(98407, 220), dtype=float32, numpy=
array([[ 7.,  3.,  0., ...,  0.,  0.,  0.],
       [49., 50.,  0., ...,  0.,  0.,  0.],
       [39.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 7.,  3.,  0., ...,  0.,  0.,  0.],
       [ 7.,  3.,  0., ...,  0.,  0.,  0.],
       [ 7.,  3.,  0., ...,  0.,  0.,  0.]], dtype=float32)>

In [95]:
# Determine the maximum sequence length for each feature
max_lenghts = {}
max_length = 20

# Determine the input_dim and input_length
input_dim = len(tokenizer.word_index) + 1
input_length = max_length

output_dim = 200
num_numerical_input = 7 #numeric features
num_sequence_features = 11 #sequence features

print("Input Dimension:", input_dim)
print("Input Length:", input_length)

Input Dimension: 3973
Input Length: 20


### Split Data for Training and Testing

In [None]:
x_features = ['year', 'disorder_type', 'event_type', 'actor1', 'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2', 
              'interaction', 'country', 'admin1', 'admin2', 'admin3', 'location', 'latitude', 'longitude', 'fatalities']
y_features = ['sub_event_type', 'civilian_targeting']

In [80]:
from sklearn.model_selection import train_test_split

x_categorical_tensor = np.array(x_categorical_tensor)
y_categorical_tensor = np.array(y_categorical_tensor)
x_numerical_tensor = np.array(x_numerical_tensor)
y_numerical_tensor = np.array(y_numerical_tensor)

# Split categorical tensors
x_categorical_train, x_categorical_test, y_categorical_train, y_categorical_test = train_test_split(
    x_categorical_tensor, y_categorical_tensor, train_size=0.8, random_state=42)

# Split numerical tensors
x_numerical_train, x_numerical_test, y_numerical_train, y_numerical_test = train_test_split(
    x_numerical_tensor, y_numerical_tensor, train_size = 0.8, random_state =42 )

# Split masks
#x_mask_train, x_mask_test, y_mask_train, y_mask_test = train_test_split(
#x_mask, y_mask, train_size = 0.8, random_state = 42)


# Reshape to fit the model
x_categorical_train = np.reshape(x_categorical_train, (-1, 11, 20))
x_categorical_test = np.reshape(x_categorical_test, (-1, 11, 20))

y_categorical_train = np.reshape(y_categorical_train, (-1, 20))
y_categorical_test = np.reshape(y_categorical_test, (-1, 20))


In [94]:
y_categorical_train.shape

(78725, 20)

### Recurrent Neural Network

Using RNN (Recurrent Neural Network), which can handle categorical variables without explicit one-hot encoding. Its architecture can often process categorical variables as integer or ordinal values directly.

In [121]:
from keras.models import Model, Sequential
from keras.layers import Embedding, LSTM, Dense, Input, concatenate, Reshape, Masking, RepeatVector, GlobalAveragePooling1D

# Sequential input for text sequences
input_text = Input(shape=(num_sequence_features, input_length,))
embedding = Embedding(input_dim, output_dim)(input_text)
reshaped_embedding = Reshape((input_length, num_sequence_features * output_dim))(embedding)
lstm = LSTM(units=128, return_sequences=True)(reshaped_embedding)
lstm = LSTM(units=128, return_sequences=True)(lstm)  # Another LSTM layer

# Apply masking to the LSTM output
masked_lstm = Masking(mask_value=0.0)(lstm)

# Add GlobalAveragePooling1D layer
pooled_lstm = GlobalAveragePooling1D()(masked_lstm)

# Numerical input
input_numerical = Input(shape=(num_numerical_input,))

# Expand the dimensions of the numerical input
#expanded_numerical = Dense(units=output_dim)(input_numerical)
#expanded_numerical = RepeatVector(input_length)(expanded_numerical)

# Concatenate text LSTM output and numerical input
concatenated = concatenate([pooled_lstm, input_numerical])

# Additional Dense layers for classification
dense1 = Dense(units=64, activation='relu')(concatenated)
dense1 = Dense(units=64, activation='relu')(dense1)  # Another dense layer

# Output for subevent_type prediction
subevent_output = Dense(units=20, activation='softmax',  name='subevent_output')(dense1)

# Output for civilian_targeting prediction
civilian_output = Dense(units=1, activation='sigmoid', name='civilian_output')(dense1)

# Create the model
model = Model(inputs=[input_text, input_numerical], outputs=[subevent_output, civilian_output])

# Print the model summary
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_17 (InputLayer)          [(None, 11, 20)]     0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, 11, 20, 200)  794600      ['input_17[0][0]']               
                                                                                                  
 reshape_8 (Reshape)            (None, 20, 2200)     0           ['embedding_8[0][0]']            
                                                                                                  
 lstm_16 (LSTM)                 (None, 20, 128)      1192448     ['reshape_8[0][0]']              
                                                                                            



What the model structure means:

The Embedding layer is added as the first layer of the model. It takes three main parameters:

* input_dim: The total number of unique categories in the categorical feature. This value determines the size of the input space for the Embedding layer.
* output_dim: The dimensionality of the dense embedding vectors. It determines the size of the output space for the Embedding layer, i.e., the size of the learned embedding vectors.
* input_length: The length of the input sequences. This should match the length of your encoded categorical features.

During model training, the Embedding layer learns to map each integer-encoded category to a dense vector of size output_dim. These learned embeddings are updated based on the model's optimization process and are used as input to the subsequent layers (e.g., LSTM layer).

In [92]:
[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in model.layers]

(None, 11, 20) <dtype: 'float32'>
(None, 7) <dtype: 'float32'>
(None, 20) <dtype: 'float32'>
(None, 1) <dtype: 'float32'>
input_5 [(None, 11, 20)] float32
embedding_2 (None, 11, 20) float32
reshape_2 (None, 11, 20, 100) float32
lstm_4 (None, 20, 1100) float32
lstm_5 (None, 20, 128) float32
masking_2 (None, 20, 128) float32
global_average_pooling1d_2 (None, 20, 128) float32
input_6 [(None, 7)] float32
concatenate_2 [(None, 128), (None, 7)] float32
dense_4 (None, 135) float32
dense_5 (None, 128) float32
subevent_output (None, 128) float32
civilian_output (None, 128) float32


[None, None, None, None, None, None, None, None, None, None, None, None, None]

#### Compilation and Optimization of the Model

In [124]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

model.compile(loss={'subevent_output': SparseCategoricalCrossentropy(), 'civilian_output': 'binary_crossentropy'},
              loss_weights={'subevent_output': 1.0, 'civilian_output': 1.0},
              optimizer= Adam(0.0001),
              metrics=['accuracy'])

#### Model Training

In [125]:
# Train the model
#model.fit(x_train, y_train, batch_size=64, epochs=32, validation_data=(x_test, y_test))
# Fit the model
model.fit([x_categorical_train, x_numerical_train], [y_categorical_train, y_numerical_train],
                    validation_data=([x_categorical_test, x_numerical_test], [y_categorical_test, y_numerical_test]),
                    epochs=10, batch_size=32)


Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "C:\Anaconda\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Anaconda\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Anaconda\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Anaconda\lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
      app.start()
    File "C:\Anaconda\lib\site-packages\ipykernel\kernelapp.py", line 711, in start
      self.io_loop.start()
    File "C:\Anaconda\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\Anaconda\lib\asyncio\base_events.py", line 601, in run_forever
      self._run_once()
    File "C:\Anaconda\lib\asyncio\base_events.py", line 1905, in _run_once
      handle._run()
    File "C:\Anaconda\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Anaconda\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "C:\Anaconda\lib\site-packages\ipykernel\ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "C:\Anaconda\lib\site-packages\ipykernel\zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "C:\Anaconda\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Anaconda\lib\site-packages\IPython\core\interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\ludmi\AppData\Local\Temp\ipykernel_13428\3781772110.py", line 4, in <module>
      model.fit([x_categorical_train, x_numerical_train], [y_categorical_train, y_numerical_train],
    File "C:\Anaconda\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 994, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Anaconda\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
      return self.compiled_loss(
    File "C:\Anaconda\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Anaconda\lib\site-packages\keras\losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\Anaconda\lib\site-packages\keras\losses.py", line 272, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Anaconda\lib\site-packages\keras\losses.py", line 2084, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "C:\Anaconda\lib\site-packages\keras\backend.py", line 5630, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
logits and labels must have the same first dimension, got logits shape [32,20] and labels shape [640]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_194493]

In [None]:
# Evaluate the model
#loss, accuracy = model.evaluate(x_test, y_test)
loss, subevent_loss, civilian_loss, subevent_accuracy, civilian_accuracy = model.evaluate([x_categorical_test, x_numerical_test],
                                                                                          [y_categorical_test, y_numerical_test],
                                                                                          verbose=0)
print("Loss:", loss)
print("Accuracy:", accuracy)


In [None]:
predictions = model.predict(x_test)