# To Identify Terrorist Events using Event Triggers

## Imports

### Import Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import Pre-defined Libraries/Packages/Modules/Methods

In [0]:
import pandas as pd
import numpy as np
import spacy as sp
import pickle
import os
from collections import OrderedDict
import string
import re
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import wordnet
# from nltk.stem import WordNetLemmatizer
# from nltk.stem import LancasterStemmer
from nltk.stem import PorterStemmer

### Installations/Loading

In [3]:
!python -m nltk.downloader wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
!python -m spacy download en_core_web_lg
!pip3 install spacy-wordnet


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


### Import Custom Modules

## Global Objects


In [0]:
nlp = sp.load('en_core_web_lg')

In [0]:
HOME_PATH = './drive/My Drive/Colab Notebooks/201916007_PROJECT_IT550'
DATABASES_PATH = HOME_PATH + '/DATABASES'
GTD = DATABASES_PATH + '/globalterrorismdb_0718dist.csv'
RDWTI = DATABASES_PATH + '/RAND_Database_of_Worldwide_Terrorism_Incidents.csv'
INTERMEDIATE_FILES_PATH = HOME_PATH + '/INTERMEDIATE_FILES'
FIRE_DATABASE_FILES_PATH = HOME_PATH + '/FIRE_DATABASE_FILES'
FIRE_DATA_DICT = FIRE_DATABASE_FILES_PATH + '/file_data_dict.txt'
TRIGGERS_PATH = HOME_PATH + '/TRIGGERS'
TERRORIST_EVENTS_UPDATED1_PATH = TRIGGERS_PATH+'/TERRORIST_EVENT_TRIGGERS_UPDATED1.txt'
ENCODING = 'ISO-8859-1'
ENGINE = 'python'
# LEM = WordNetLemmatizer()
# LANCASTER = LancasterStemmer()
PORTER = PorterStemmer()

## Exploratory Data Analysis: GTD, RDWTI, FIRE-DATABASE

### GTD (Global Terrorism Database)

#### Load database

In [0]:
gtd = pd.read_csv(GTD, engine=ENGINE, encoding=ENCODING)

#### EDA

In [8]:
print(gtd.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Columns: 135 entries, eventid to related
dtypes: float64(55), int64(22), object(58)
memory usage: 187.1+ MB
None


In [9]:
print(gtd.head())

        eventid  iyear  imonth  iday  ... INT_IDEO  INT_MISC INT_ANY  related
0  197000000001   1970       7     2  ...        0         0       0      NaN
1  197000000002   1970       0     0  ...        1         1       1      NaN
2  197001000001   1970       1     0  ...       -9         1       1      NaN
3  197001000002   1970       1     0  ...       -9         1       1      NaN
4  197001000003   1970       1     0  ...       -9         1       1      NaN

[5 rows x 135 columns]


In [10]:
print(gtd.columns.values)

['eventid' 'iyear' 'imonth' 'iday' 'approxdate' 'extended' 'resolution'
 'country' 'country_txt' 'region' 'region_txt' 'provstate' 'city'
 'latitude' 'longitude' 'specificity' 'vicinity' 'location' 'summary'
 'crit1' 'crit2' 'crit3' 'doubtterr' 'alternative' 'alternative_txt'
 'multiple' 'success' 'suicide' 'attacktype1' 'attacktype1_txt'
 'attacktype2' 'attacktype2_txt' 'attacktype3' 'attacktype3_txt'
 'targtype1' 'targtype1_txt' 'targsubtype1' 'targsubtype1_txt' 'corp1'
 'target1' 'natlty1' 'natlty1_txt' 'targtype2' 'targtype2_txt'
 'targsubtype2' 'targsubtype2_txt' 'corp2' 'target2' 'natlty2'
 'natlty2_txt' 'targtype3' 'targtype3_txt' 'targsubtype3'
 'targsubtype3_txt' 'corp3' 'target3' 'natlty3' 'natlty3_txt' 'gname'
 'gsubname' 'gname2' 'gsubname2' 'gname3' 'gsubname3' 'motive'
 'guncertain1' 'guncertain2' 'guncertain3' 'individual' 'nperps'
 'nperpcap' 'claimed' 'claimmode' 'claimmode_txt' 'claim2' 'claimmode2'
 'claimmode2_txt' 'claim3' 'claimmode3' 'claimmode3_txt' 'compclaim'


In [11]:
i,j = 0,2
while j <= 135:
  print(gtd.iloc[-5:, i:j])
  i += 2
  j += 2

             eventid  iyear
181686  201712310022   2017
181687  201712310029   2017
181688  201712310030   2017
181689  201712310031   2017
181690  201712310032   2017
        imonth  iday
181686      12    31
181687      12    31
181688      12    31
181689      12    31
181690      12    31
       approxdate  extended
181686        NaN         0
181687        NaN         0
181688        NaN         0
181689        NaN         0
181690        NaN         0
       resolution  country
181686        NaN      182
181687        NaN      200
181688        NaN      160
181689        NaN       92
181690        NaN      160
        country_txt  region
181686      Somalia      11
181687        Syria      10
181688  Philippines       5
181689        India       6
181690  Philippines       5
                        region_txt        provstate
181686          Sub-Saharan Africa  Middle Shebelle
181687  Middle East & North Africa         Lattakia
181688              Southeast Asia      Maguindanao


In [12]:
i,j = 0,2
while j <= 135:
  print(gtd.iloc[:5, i:j])
  i += 2
  j += 2

        eventid  iyear
0  197000000001   1970
1  197000000002   1970
2  197001000001   1970
3  197001000002   1970
4  197001000003   1970
   imonth  iday
0       7     2
1       0     0
2       1     0
3       1     0
4       1     0
  approxdate  extended
0        NaN         0
1        NaN         0
2        NaN         0
3        NaN         0
4        NaN         0
  resolution  country
0        NaN       58
1        NaN      130
2        NaN      160
3        NaN       78
4        NaN      101
          country_txt  region
0  Dominican Republic       2
1              Mexico       1
2         Philippines       5
3              Greece       8
4               Japan       4
                    region_txt provstate
0  Central America & Caribbean       NaN
1                North America   Federal
2               Southeast Asia    Tarlac
3               Western Europe    Attica
4                    East Asia   Fukouka
            city   latitude
0  Santo Domingo  18.456792
1    Mexico ci

In [13]:
print(gtd['suicide'].value_counts())

0    175058
1      6633
Name: suicide, dtype: int64


#### Feature Extraction

In [0]:
GTD_FEATURE_NAMES = ['alternative_txt','attacktype1_txt','attacktype2_txt',
                     'attacktype3_txt','gname','gname2','gname3','gsubname',
                     'gsubname2','gsubname3','scite1','scite2','scite3',
                     'summary','target1','targtype1_txt','targsubtype1_txt',
                     'target2','targtype2_txt','targsubtype2_txt','target3',
                     'targtype3_txt','targsubtype3_txt','weapdetail',
                     'weaptype1_txt','weapsubtype1_txt','weaptype2_txt',
                     'weapsubtype2_txt','weaptype3_txt','weapsubtype3_txt',
                     'weaptype4_txt','weapsubtype4_txt']

In [0]:
gtd_view1 = gtd[GTD_FEATURE_NAMES]

In [0]:
# EXTRACT COLUMNS WHICH HAVE ATTACK TYPES
ATTACK_TYPE_COLS = ['attacktype1_txt','attacktype2_txt','attacktype3_txt']
gtd_view_attacks = gtd_view1.dropna(
    how='all',
    subset=ATTACK_TYPE_COLS)

In [0]:
# EXTRACT UNIQUE ATTACK TYPES
ATTACK_TYPES = [attacktype for attacktype in
                set().union(*[set(gtd_view_attacks[attacktype_col])
                for attacktype_col in ATTACK_TYPE_COLS])
                if attacktype==attacktype]  # We know that, nan != nan

In [18]:
ATTACK_TYPES

['Unknown',
 'Hijacking',
 'Hostage Taking (Kidnapping)',
 'Facility/Infrastructure Attack',
 'Hostage Taking (Barricade Incident)',
 'Armed Assault',
 'Bombing/Explosion',
 'Assassination',
 'Unarmed Assault']

In [0]:
# EXTRACT COLUMNS WHICH HAVE WEAPON TYPES ## 'weapdetail', 
WEAPON_TYPE_COLS = ['weaptype1_txt','weapsubtype1_txt',
                    'weaptype2_txt','weapsubtype2_txt','weaptype3_txt',
                    'weapsubtype3_txt','weaptype4_txt','weapsubtype4_txt']

gtd_view_weapons = gtd_view1.dropna(
    how='all',
    subset=WEAPON_TYPE_COLS)

In [0]:
# EXTRACT UNIQUE WEAPON TYPES
WEAPON_TYPES = [weapontype for weapontype in
                set().union(*[set(gtd_view_weapons[weapontype_col])
                for weapontype_col in WEAPON_TYPE_COLS])
                if weapontype==weapontype] # We know that, nan != nan

In [21]:
WEAPON_TYPES

['Other Gun Type',
 'Firearms',
 'Other',
 'Remote Trigger',
 'Grenade',
 'Melee',
 'Unknown Weapon Type',
 'Unknown',
 'Molotov Cocktail/Petrol Bomb',
 'Suicide (carried bodily by human being)',
 'Chemical',
 'Pressure Trigger',
 'Poisoning',
 'Incendiary',
 'Letter Bomb',
 'Projectile (rockets, mortars, RPGs, etc.)',
 'Automatic or Semi-Automatic Rifle',
 'Rifle/Shotgun (non-automatic)',
 'Explosive',
 'Time Fuse',
 'Rope or Other Strangling Device',
 'Unknown Explosive Type',
 'Sticky Bomb',
 'Explosives',
 'Landmine',
 'Biological',
 'Handgun',
 'Sabotage Equipment',
 'Other Explosive Type',
 'Knife or Other Sharp Object',
 'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)',
 'Pipe Bomb',
 'Hands, Feet, Fists',
 'Gasoline or Alcohol',
 'Blunt Object',
 'Radiological',
 'Vehicle',
 'Arson/Fire',
 'Dynamite/TNT',
 'Suffocation',
 'Unknown Gun Type',
 'Fake Weapons']

### RDWTI (RAND Database of Wordwide Terrorism Incidents)

#### Load database

In [0]:
rdwti = pd.read_csv(RDWTI, engine=ENGINE, encoding=ENCODING)

#### EDA

In [23]:
print(rdwti.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40129 entries, 0 to 40128
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         40129 non-null  object
 1   City         35155 non-null  object
 2   Country      40129 non-null  object
 3   Perpetrator  40125 non-null  object
 4   Weapon       40126 non-null  object
 5   Injuries     40129 non-null  int64 
 6   Fatalities   40129 non-null  int64 
 7   Description  40128 non-null  object
dtypes: int64(2), object(6)
memory usage: 2.4+ MB
None


In [24]:
print(rdwti.head())

        Date  ...                                        Description
0   9-Feb-68  ...  ARGENTINA.  The second floor of the U.S. embas...
1  12-Feb-68  ...  DOMINICAN REPUBLIC.  A homemade bomb was found...
2  13-Feb-68  ...  URUGUAY.  A Molotov cocktail was thrown outsid...
3  20-Feb-68  ...  CHILE.  An explosion from a single stick of dy...
4  21-Feb-68  ...  UNITED STATES.  The Soviet embassy was bombed ...

[5 rows x 8 columns]


In [25]:
print(rdwti['Description'][0])

ARGENTINA.  The second floor of the U.S. embassy residence was struck by fifteen 9-mm machine-gun slugs fired from a passing automobile during the early morning hours.


In [26]:
print(rdwti['Perpetrator'].value_counts())

Unknown                                          26190
Other                                             2057
Taliban                                           1000
Revolutionary Armed Forces of Colombia (FARC)      616
Hamas (Islamic Resistance Movement)                576
                                                 ...  
al-Fahd al-Aswad Forces                              1
South Londonderry Volunteers                         1
Ogaden National Liberation Front (ONLF)              1
Jordanian Islamic Resistance                         1
June 2  Movement (Bewegung 2 Juni)                   1
Name: Perpetrator, Length: 1059, dtype: int64


In [27]:
print(rdwti['Weapon'].value_counts())

Explosives                    20523
Firearms                      11222
Unknown                        3213
Fire or Firebomb               2778
Remote-detonated explosive     1593
Knives & sharp objects          418
Other                           304
Chemical Agent                   59
Biological Agent                 13
Telecommunication                 1
Bombing                           1
Business                          1
Name: Weapon, dtype: int64


In [28]:
print(rdwti['Injuries'].value_counts())

0       27571
1        3137
2        2092
3        1384
4        1032
        ...  
625         1
178         1
1042        1
84          1
111         1
Name: Injuries, Length: 176, dtype: int64


In [29]:
print(rdwti['Fatalities'].value_counts())

0      24597
1       7965
2       2697
3       1325
4        830
       ...  
115        1
500        1
85         1
117        1
79         1
Name: Fatalities, Length: 121, dtype: int64


In [30]:
print(rdwti['Country'].value_counts())

Iraq                   10763
West Bank/Gaza          2038
Afghanistan             2025
Thailand                2009
Colombia                1913
                       ...  
Iceland                    1
Trucial Oman States        1
Hong Kong                  1
Falkland Islands           1
Taiwan                     1
Name: Country, Length: 195, dtype: int64


#### Feature Extraction

In [0]:
RDWTI_FEATURE_NAMES = ['Description','Weapon','Perpetrator']

In [0]:
rdwti_view1 = rdwti[RDWTI_FEATURE_NAMES]

In [0]:
# EXTRACT UNIQUE WEAPONS
RDWTI_WEAPONS = [weapontype for weapontype in
                 set(rdwti_view1['Weapon'].values)
                 if weapontype == weapontype]

In [34]:
RDWTI_WEAPONS

['Unknown',
 'Chemical Agent',
 'Remote-detonated explosive',
 'Business',
 'Knives & sharp objects',
 'Firearms',
 'Other',
 'Bombing',
 'Explosives',
 'Telecommunication',
 'Biological Agent',
 'Fire or Firebomb']

### FIRE Dataset

#### Load Dataset

In [0]:
firedataset = pickle.load(open(FIRE_DATA_DICT, 'rb'))
firedataset_filedata_list = list(zip(*firedataset.values()))[0]
firedataset_filename_list = list(zip(*firedataset.values()))[1]

In [36]:
firedataset_filename_list[:5]

('1041207_atleisure_index.utf8',
 '1041208_atleisure_index.utf8',
 '1041210_atleisure_index.utf8',
 '1041211_atleisure_index.utf8',
 '1041213_atleisure_index.utf8')

## Load Trigger Dictionary

In [0]:
TERRORIST_EVENTS_UPDATED = pickle.load(open(TERRORIST_EVENTS_UPDATED1_PATH, 'rb'))

In [38]:
TERRORIST_EVENTS_UPDATED.keys()

odict_keys(['Assassination', 'Armed Assault', 'Facility/Infrastructure Attack', 'Unarmed Assault', 'Hijacking', 'Bombing/Explosion', 'Cyberattacks', 'Hostage'])

In [0]:
def classify_file(doc, trigger_dict):
  global PORTER
  all_string = string.ascii_letters+string.digits+\
  string.punctuation+string.whitespace
  for event, trigger_list in trigger_dict.items():
    if trigger_list != []:
      for trigger in trigger_list:
        if trigger:
          if trigger in doc:
            return event
          else:
            trigger_words_list = trigger.split()
            pattern = r'[a-zA-Z0-9 !"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]*'
            new_pattern = pattern.join([PORTER.stem(word) for word in (trigger_words_list)])
            if re.search(new_pattern, doc):
              return event
  return 'Unknown Event'

In [0]:
# test_summary_list = list(gtd[gtd['attacktype1_txt'] == 'Assassination']['summary'])
# attack_type = 'Assassination'

TP_FP_FN_gtd_dict = OrderedDict()
for key in TERRORIST_EVENTS_UPDATED.keys():
  TP_FP_FN_gtd_dict[key] = {'TP':0, 'FP':0, 'FN':0}

In [0]:
def test_summary(attacktype_col, attack_type, test_column):
  global TP_FP_FN_gtd_dict

  if attack_type == 'Hostage':
    gtd1 = gtd[gtd[attacktype_col] == 'Hostage Taking (Barricade Incident)']
    # gtd1 = gtd[gtd[attacktype_col] == 'Hostage Taking (Kidnapping)']
  else:
    gtd1 = gtd[gtd[attacktype_col] == attack_type]

  errors = 0 # correct classification count
  correct = 0 # wrong classification count
  correct_as_attack_type = 0
  misclassified_as = list()
  classified_as = list()

  for idx, row in gtd1.iterrows():
    if row[test_column] == row[test_column]:
      result = classify_file(row[test_column], TERRORIST_EVENTS_UPDATED)
      if result != 'Unknown Event':
        if (result != attack_type) and \
        (result != row['attacktype2_txt']) and \
        (result != row['attacktype3_txt']) :
            errors += 1
            if result not in misclassified_as:
              misclassified_as.append(result)
            TP_FP_FN_gtd_dict[result]['FP'] += 1
            TP_FP_FN_gtd_dict[attack_type]['FN'] += 1
        else:
          if result == attack_type:
            TP_FP_FN_gtd_dict[result]['TP'] += 1
            correct_as_attack_type += 1
          else:
            TP_FP_FN_gtd_dict[attack_type]['FN'] += 1
            if result not in classified_as:
              classified_as.append(result)
          correct += 1
      # else:
        # print(row[test_column], end='\n\n')
        # print('\n\nNot detected\n\n')
  
  sep_len = len('Facility/Infrastructure Attack')
  sep_str = " "*sep_len

  print(f'Attack type                         {sep_str}   : {attack_type}')
  print(f'Correct classification as           {attack_type:30}   : {correct_as_attack_type}')
  print(f'Correct classification              {sep_str}   : {correct}')
  print(f'Other correct classification into   {sep_str}   : {classified_as}')
  print(f'Incorrect classification            {sep_str}   : {errors}')
  print(f'Misclassification into              {sep_str}   : {misclassified_as}')
  
  print('='*100)
  return correct, errors 

In [0]:
### NEW based on all attack type cols of that 
def test(test_column, attacktype_col):
  global TERRORIST_EVENTS_UPDATED
  for attacktype in TERRORIST_EVENTS_UPDATED.keys():
    test_summary(attacktype_col, attacktype, test_column)

In [76]:
attacktype_col = 'attacktype1_txt'
test_column = 'summary'
test(test_column, attacktype_col)

Attack type                                                          : Assassination
Correct classification as           Assassination                    : 405
Correct classification                                               : 409
Other correct classification into                                    : ['Bombing/Explosion', 'Armed Assault']
Incorrect classification                                             : 4269
Misclassification into                                               : ['Armed Assault', 'Bombing/Explosion', 'Hijacking', 'Facility/Infrastructure Attack', 'Hostage', 'Unarmed Assault']
Attack type                                                          : Armed Assault
Correct classification as           Armed Assault                    : 13465
Correct classification                                               : 13635
Other correct classification into                                    : ['Bombing/Explosion', 'Facility/Infrastructure Attack', 'Assassination', 'Hijackin

In [50]:
for key, value in TP_FP_FN_gtd_dict.items():
  try:
    print(key, value)
    precision = value['TP']/(value['TP']+value['FP'])
    recall = value['TP']/(value['TP']+value['FN'])
    f1_score = 2 * (recall * precision) / (recall + precision)
    print('Attack type               :', key)
    print('Precision                 :', precision)
    print('Recall                    :', recall)
    print('F1 Score                  :', f1_score)
  except:
    pass
  finally:
    print('='*100)

Assassination {'TP': 810, 'FP': 330, 'FN': 4277}
Attack type               : Assassination
Precision                 : 0.7105263157894737
Recall                    : 0.1592294082956556
F1 Score                  : 0.26015737915529147
Armed Assault {'TP': 26930, 'FP': 7890, 'FN': 1682}
Attack type               : Armed Assault
Precision                 : 0.773406088454911
Recall                    : 0.9412134768628547
F1 Score                  : 0.8490982469416067
Facility/Infrastructure Attack {'TP': 3746, 'FP': 15158, 'FN': 721}
Attack type               : Facility/Infrastructure Attack
Precision                 : 0.19815911976301312
Recall                    : 0.8385941347660623
F1 Score                  : 0.3205682255787087
Unarmed Assault {'TP': 66, 'FP': 964, 'FN': 96}
Attack type               : Unarmed Assault
Precision                 : 0.06407766990291262
Recall                    : 0.4074074074074074
F1 Score                  : 0.11073825503355705
Hijacking {'TP': 326, 'FP': 1

In [0]:
def trigger_dict_add_triggers(event, trigger):
  global TERRORIST_EVENTS_UPDATED
  global TERRORIST_EVENTS_UPDATED1_PATH
  TERRORIST_EVENTS_UPDATED[event].append(trigger)
  TERRORIST_EVENTS_UPDATED[event] = list(set(TERRORIST_EVENTS_UPDATED[event]))
  TERRORIST_EVENTS_UPDATED[event].sort()
  pickle.dump(TERRORIST_EVENTS_UPDATED, open(TERRORIST_EVENTS_UPDATED1_PATH, 'wb'))
  TERRORIST_EVENTS_UPDATED = pickle.load(open(TERRORIST_EVENTS_UPDATED1_PATH, 'rb'))

In [0]:
event = 'Bombing/Explosion'
trigger = 'detonated explosives'
trigger_dict_add_triggers(event, trigger)
TERRORIST_EVENTS_UPDATED[event]
## TERRORIST_EVENTS_UPDATED[event].remove(trigger)

In [0]:
events_firedataset = list()
for data in firedataset_filedata_list[:]:
  events_firedataset.append(classify_file(data, TERRORIST_EVENTS_UPDATED))

In [88]:
for i in range(1000):
  print(f'{events_firedataset[i]:50} {firedataset_filename_list[i]}\n')

Unknown Event                                      1041207_atleisure_index.utf8

Unknown Event                                      1041208_atleisure_index.utf8

Unknown Event                                      1041210_atleisure_index.utf8

Unknown Event                                      1041211_atleisure_index.utf8

Unknown Event                                      1041213_atleisure_index.utf8

Unknown Event                                      1041213_atleisure_story_4109250.utf8

Unknown Event                                      1041213_atleisure_story_4121739.utf8

Unknown Event                                      1041213_atleisure_story_4121740.utf8

Unknown Event                                      1041213_atleisure_story_4121741.utf8

Unknown Event                                      1041213_atleisure_story_4122608.utf8

Unknown Event                                      1041214_atleisure_index.utf8

Unknown Event                                      1041214_atleisure_

In [0]:
for i in range(len(events_firedataset)):
  with open (INTERMEDIATE_FILES_PATH+'/firedataset_results.txt', 'a') as file_i:
    file_i.write(f'{events_firedataset[i]:50} {firedataset_filename_list[i]}\n')