In [3]:
# Coding challenge for fellowship.AI

# Molly Gibson - September 12, 2017

# data from http://www.start.umd.edu/gtd/contact/ - full GTD dataset
# data downloaded in .xlsx format, so I converted to familiar csv:
# $ pip install csvkit
# $ in2csv datasets/globalterrorismdb_0617dist.xlsx > globalterrorism_db.csv

# Goal of the challenge: use attack type, weapons used, description of the attack, etc. to build a model that
# can predict what group may have been responsible for an incident

# first, install basic dependencies: 
import pandas as pd
import numpy as np

# then upload the GTD data
df = pd.read_csv('globalterrorism_db.csv', encoding='ISO-8859-1', usecols=[1,9,19,20,21,22,25,26,27,28,30,
                                                                                  32,34,40,58,69,71,80,81,83,98,109])

print(df.shape, '\n')

# rename a few columns for readability:
df = df.rename(columns={'iyear':'year', 'gname':'groupname','compclaim':'competingclaims', 'ishostkid':'hostages'})


# get the names of each column along w index so we can pick which features to use
# for idx, col in enumerate(df.columns):
#     print(idx, col)
# don't need after we add usecols=[] to pd.read_csv, but might want to go back to use diff features

# print datatypes of each column
#print(df.dtypes)
print('Features:\n',list(df.columns))

# choosing features: notes to self
# for target types and nationalities, only going to use the first col
# only taking first col of claimed, but also using compclaim (boolean, competing claims from two groups)
# weaptype and weapsubtype, only using first col


# now we need to deal with NA values, which exist in the float64 dtype columns
# 9:unknown for attack types
df.attacktype2 = df.attacktype2.fillna(9).astype(int)
df.attacktype3 = df.attacktype3.fillna(9).astype(int)

df.natlty1 = df.natlty1.fillna(-9).astype(int)

# changing NaN to -9 
df.nperps = df.nperps.fillna(-99).astype(int)
df.claimed, df.competingclaims = df.claimed.fillna(0).astype(int), df.competingclaims.fillna(-9).astype(int)
# 13:unknown for weapon types
df.weaptype1, df.weapsubtype1 = df.weaptype1.fillna(13).astype(int), df.weapsubtype1.fillna(-9).astype(int)
df.nkill, df.hostages = df.nkill.fillna(-9).astype(int), df.hostages.fillna(-9).astype(int)


(170350, 22) 

Features:
 ['year', 'region', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple', 'success', 'suicide', 'attacktype1', 'attacktype2', 'attacktype3', 'targtype1', 'natlty1', 'groupname', 'nperps', 'claimed', 'competingclaims', 'weaptype1', 'weapsubtype1', 'nkill', 'hostages']


In [4]:
print('This dataset contains terrorist attacks from', df.year.min(), 'to', df.year.max()) 

This dataset contains terrorist attacks from 1970 to 2016


In [5]:
# Now let's count the number of recorded attacks by each group
# we will probably only want to consider groups with > n attacks
counts = df[['region','groupname']].groupby(['region','groupname']).size().reset_index(name='count') \
                            .sort_values(['count'],ascending=False)

print('number of terrorist groups in the database: ', len(counts.groupname.unique()))

# only include groups that are responsibile for 5+ attacks
counts = counts[counts['count'] > 4]

print('number of groups we are considering: ', len(counts.groupname.unique()))

# then alter dataframe to only include rows in which groupname appears 5+ times
df = df[df.groupname.isin(counts.groupname)]
print(df.shape)

# dict of each region name corresponding with its number
regdict = {'1':'North America', '2':'Central America & Caribbean', '3':'South America', '4':'East Asia',
          '5':'Southeast Asia', '6':'South Asia', '7':'Central Asia', '8':'Western Europe', '9':'Eastern Europe',
          '10':'Middle East & North Africa', '11':'Sub-Saharan Africa', '12':'Australia & Oceania'}

# separate each region into its own dataframe, stored in dictionary
# DFs - main dataframe divided into regions
# countsbyreg - frequency of groups in each region
DFs = {}
countsbyreg = {}
for reg in counts.region:
    name = regdict[str(reg)]
    DFs[name] = df[df.region==reg]
    countsbyreg[name] = counts[counts.region==reg]
   

number of terrorist groups in the database:  3454
number of groups we are considering:  870
(166204, 22)


In [6]:
# to get an idea of the top 5 frequently occuring groupnames in each region
# and the number of groups with over 5 attacks in each region 
for key in countsbyreg:
    print(key,'\n\n',len(countsbyreg[key]),' groups with 5+ attacks', '\n', 
          len(DFs[key]),' incidents', '\n\n', 
          round(float(countsbyreg[key]['count'][countsbyreg[key].groupname=='Unknown'])/float(len(DFs[key]))*100, 2),
          '% unknown groupnames', '\n\n', countsbyreg[key].head(),'\n')
    print('-'*100)

Middle East & North Africa 

 170  groups with 5+ attacks 
 45766  incidents 

 60.2 % unknown groupnames 

       region                                    groupname  count
3484      10                                      Unknown  27550
3102      10  Islamic State of Iraq and the Levant (ISIL)   4260
3184      10               Kurdistan Workers' Party (PKK)   1980
3306      10                                 Palestinians   1104
2854      10     Al-Qaida in the Arabian Peninsula (AQAP)    972 

----------------------------------------------------------------------------------------------------
South Asia 

 195  groups with 5+ attacks 
 40927  incidents 

 50.11 % unknown groupnames 

       region                                       groupname  count
1719       6                                         Unknown  20508
1642       6                                         Taliban   6574
1292       6  Communist Party of India - Maoist (CPI-Maoist)   1766
1459       6         Liberation 

In [12]:
# 'Unknown' is the most common groupname in every region
print("Total percentage of attacks by unknown terrorist groups: ", 
      round(float(df.groupname[df.groupname=='Unknown'].count() / df.groupname.count())*100,3))

# first off, let's see if we can classify unknown vs. known attacks
# in order to do so, add a new boolean column to the dataframe
df['is_unknown'] = np.where(df.groupname == 'Unknown', True, False)

# separate the unlabeled attacks from the labeled ones; can use known attacks to check accuracy of our models but 
# ultimately the goal is to predict which groups are responsible for the unknown attacks
unknown_df = df[df.groupname == 'Unknown']

known_df = df[df.groupname != 'Unknown']

print(known_df.shape)

Total percentage of attacks by unknown terrorist groups:  47.114
(87898, 23)


In [36]:
# now onto sklearn!
from time import time
from operator import itemgetter
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# suppress warnings
import warnings
warnings.filterwarnings("ignore") #, category=DeprecationWarning)




# and create a df for accuracy of our algorithms in each region
results = pd.DataFrame(columns=['Region', 'Accuracy'])

# and a dictionary to compare different algorithms, holding a df of accuracies for each one   
classifier_accuracies = {}
classifier_accuracies_lbld = {}


# let's divide the dataset into regions 
training_X = {}
training_y = {}
testing_X = {}
testing_y = {}

for key in DFs:
    training_X[key], testing_X[key], training_y[key], testing_y[key] = train_test_split(DFs[key].drop('groupname', axis=1),
                                                                                       DFs[key].groupname,
                                                                                       test_size=0.3, random_state=15)



In [14]:
# trying out a few different classification algorithms to see which performs the best
names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5), 
    RandomForestClassifier(n_estimators=100, max_features=20),
    MLPClassifier(alpha=1),
    AdaBoostClassifier()]


# for name, clf in zip(names, classifiers):
#     for key, n in zip(training_y, np.arange(len(training_y))):
#         clf.fit(training_X[key], training_y[key])
#         results.loc[n, 'Region'] = key
#         results.loc[n, 'Accuracy'] = round(metrics.accuracy_score(testing_y[key], clf.predict(testing_X[key])), 4)
#     classifier_accuracies[name] = results.copy()

# %store classifier_accuracies

%store -r classifier_accuracies

for alg in classifier_accuracies:
    print(alg, '\n', classifier_accuracies[alg].sort_values('Accuracy', ascending=False), '\n')

Nearest Neighbors 
                          Region Accuracy
10                 Central Asia   0.8667
11          Australia & Oceania   0.7857
9                     East Asia   0.7593
6   Central America & Caribbean   0.7428
0    Middle East & North Africa   0.7168
7                Eastern Europe   0.7073
2                Southeast Asia   0.6763
3            Sub-Saharan Africa   0.6627
1                    South Asia   0.6532
4                 South America   0.6513
5                Western Europe   0.6409
8                 North America   0.5464 

Decision Tree 
                          Region Accuracy
10                 Central Asia   0.8933
11          Australia & Oceania      0.8
9                     East Asia   0.7685
6   Central America & Caribbean    0.757
7                Eastern Europe   0.7189
0    Middle East & North Africa   0.7106
1                    South Asia   0.6549
2                Southeast Asia   0.6405
4                 South America   0.6113
5                We

In [23]:
# from now on let's only use attacks with labeled groupnames, excluding 'Unknown' groups

DFs_knowngnames = {}
for reg in counts.region:
    name = regdict[str(reg)]
    DFs_knowngnames[name] = known_df[known_df.region==reg]
    
for key in DFs_knowngnames:
    training_X[key], testing_X[key], training_y[key], testing_y[key] = train_test_split(DFs_knowngnames[key].drop('groupname', axis=1),
                                                                                       DFs_knowngnames[key].groupname,
                                                                                       test_size=0.3, random_state=15)

# for name, clf in zip(names, classifiers):
#     print(name)
#     for key, n in zip(training_y, np.arange(len(training_y))):
#         clf.fit(training_X[key], training_y[key])
#         results.loc[n, 'Region'] = key
#         results.loc[n, 'Accuracy'] = round(metrics.accuracy_score(testing_y[key], clf.predict(testing_X[key])), 4)
#     print(results.sort_values('Accuracy', ascending=False))
#     classifier_accuracies_lbld[name] = results.copy()

#%store classifier_accuracies_lbld

%store -r classifier_accuracies_lbld

for alg in classifier_accuracies_lbld:
    print(alg, '\n', classifier_accuracies_lbld[alg].sort_values('Accuracy', ascending=False), '\n')

Nearest Neighbors 
                          Region Accuracy
6   Central America & Caribbean   0.7774
3            Sub-Saharan Africa   0.7627
9                     East Asia   0.7051
4                 South America   0.6888
1                    South Asia   0.6763
0    Middle East & North Africa    0.671
7                Eastern Europe   0.6667
2                Southeast Asia   0.6484
11          Australia & Oceania   0.6316
5                Western Europe   0.6229
8                 North America     0.47
10                 Central Asia   0.3889 

Decision Tree 
                          Region Accuracy
6   Central America & Caribbean   0.8266
9                     East Asia   0.7051
7                Eastern Europe   0.6972
3            Sub-Saharan Africa   0.6955
4                 South America   0.6585
1                    South Asia   0.6435
11          Australia & Oceania   0.6316
0    Middle East & North Africa    0.606
2                Southeast Asia   0.5925
5                We

In [80]:
# create a list of most common groups in each region
# so that we only are choosing between the 5 main groups, or else 'other'

def top_or_other(gname, toplist):
    if gname in toplist:
        return gname
    else:
        return 'Other'

# setting with copy warning; don't know how to fix that quickly
top_groups = {}
for key in countsbyreg:
    top_groups[key] = list(countsbyreg[key]['groupname'][1:6])
    DFs_knowngnames[key].loc[:, 'gname'] = DFs_knowngnames[key].loc[:,'groupname'] \
                .apply(lambda name: top_or_other(name, top_groups[key]))
    

    
for key in DFs_knowngnames:
    training_X[key], testing_X[key], training_y[key], testing_y[key] = train_test_split(DFs_knowngnames[key]. \
                                                                                    drop(['groupname','gname','region'], axis=1),
                                                                                    DFs_knowngnames[key].gname,
                                                                                    test_size=0.3, random_state=7)

clf = RandomForestClassifier(n_estimators=100, max_features=20, min_samples_split=10, random_state=666)

for key, n in zip(training_y, np.arange(len(training_y))):
    clf.fit(training_X[key], training_y[key])
    #print(training_y[key].unique())
    results.loc[n, 'Region'] = key
    results.loc[n, 'Accuracy'] = round(metrics.accuracy_score(testing_y[key], clf.predict(testing_X[key])), 4)


print(results.sort_values('Accuracy', ascending=False))

print('\nMean Accuracy for all regions (exluding Central Asia): ', round(results.sort_values('Accuracy', ascending=False).Accuracy[:-1].mean(), 4))

                         Region Accuracy
11          Australia & Oceania   0.9474
3            Sub-Saharan Africa   0.9317
6   Central America & Caribbean   0.9079
0    Middle East & North Africa   0.9061
1                    South Asia   0.8689
8                 North America   0.8525
9                     East Asia   0.8205
5                Western Europe   0.8095
4                 South America   0.7961
2                Southeast Asia   0.7945
7                Eastern Europe   0.7582
10                 Central Asia      0.5

Mean Accuracy for all regions (exluding Central Asia):  0.8539


In [25]:
# EDIT: ADDED LATER
# to test out the different algorithms after reducing the dataset to only 6 features
# (['year', 'attacktype1', 'weaptype1', 'natlty1', 'targtype1', 'region'])
names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "Neural Net"]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5), 
    RandomForestClassifier(n_estimators=100, max_features=6),
    MLPClassifier(alpha=1)]
# needed to change max_features for RF

for name, clf in zip(names, classifiers):
    print('\n',name)
    for key, n in zip(training_y, np.arange(len(training_y))):
        clf.fit(training_X[key][['year', 'attacktype1', 'weaptype1', 'natlty1', 'targtype1', 'region']], training_y[key])
        results.loc[n, 'Region'] = key
        results.loc[n, 'Accuracy'] = round(metrics.accuracy_score(testing_y[key], clf.predict(testing_X[key][['year', 'attacktype1', 'weaptype1', 'natlty1', 'targtype1', 'region']])), 4)
    print(results.sort_values('Accuracy', ascending=False).copy())

# random forest still performs the best; accuracy improved for every alg, though!


 Nearest Neighbors
                         Region Accuracy
6   Central America & Caribbean   0.8121
3            Sub-Saharan Africa   0.7732
4                 South America   0.6896
0    Middle East & North Africa   0.6706
1                    South Asia   0.6577
7                Eastern Europe   0.6492
11          Australia & Oceania   0.6316
2                Southeast Asia   0.6292
5                Western Europe   0.6165
9                     East Asia   0.6154
8                 North America   0.4654
10                 Central Asia   0.3333

 Decision Tree
                         Region Accuracy
6   Central America & Caribbean   0.8209
7                Eastern Europe   0.7015
3            Sub-Saharan Africa   0.6951
9                     East Asia   0.6923
11          Australia & Oceania   0.6842
4                 South America   0.6524
1                    South Asia   0.6443
0    Middle East & North Africa   0.6086
2                Southeast Asia   0.6082
5                West

In [20]:
# random forest seems to give the best results, so let's explore it further!
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=78, max_features=5, min_samples_split=10)


# let's do a gridsearch to try and optimize our parameters; first make a dict of params to try
param_grid = {"max_depth": [3, None],
              "max_features": [1, 5, 20],
              "min_samples_split": [3, 10],
              "min_samples_leaf": [1, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# from sklearn docs:
def report(grid_scores, n_top=5):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print(" ")

# split up training and testing sets: (just classifying known vs unknown here)
X_train, X_test, y_train_bool, y_test_bool = train_test_split(df.drop(['groupname', 'is_unknown'], axis=1), df.is_unknown, 
                                                    test_size=0.3, random_state=7)
# these training/testing dataframes are a hodgepodge of all the regions; later we'll check difference 
# between each region

# commenting this out bc it takes a really long time
# grid_search = GridSearchCV(clf, param_grid=param_grid)
# start = time()

# grid_search.fit(X_train, y_train_bool)

# print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
#       % (time() - start, len(grid_search.grid_scores_)))

# grid_scores = grid_search.grid_scores_

# %store grid_scores

%store -r grid_scores

report(grid_scores)

# 0.847 is the max. accuracy we can get with random forest to classify labeled vs unlabeled attacks

Model with rank: 1
Mean validation score: 0.847 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 2
Mean validation score: 0.846 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 3
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 20, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 4
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 5
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_

In [21]:
clf.fit(X_train, y_train_bool)

preds = clf.predict(X_test)

print('Random Forest accuracy when classifying known vs. unknown: ', round(metrics.accuracy_score(y_test_bool, preds), 4))
# this is higher than the last cell bc of increased n_estimators

scores = cross_val_score(clf, X_train, y_train_bool)
print('Cross Val Score: ', round(scores.mean(),4))

Random Forest accuracy when classifying known vs. unknown:  0.8526
Cross Val Score:  0.8465


In [62]:
from collections import defaultdict

weights = defaultdict(list)

# now let's run some algorithms on only the labeled points - i.e. only predict groups where we can test our accuracy
# our goal is to predict which groups are responsible for the unlabeled attacks
X_train_labld, X_test_labld, y_train_labld, y_test_labld = train_test_split(known_df.drop('groupname', axis=1), known_df.groupname, 
                                                    test_size=0.3, random_state=7)

print(X_train_labld.shape)

newclf.fit(X_train_labld, y_train_labld)

labld_preds = newclf.predict(X_test_labld)

print('Random Forest accuracy when predicting groupname, only known groups: ', round(metrics.accuracy_score(y_test_labld, labld_preds), 4))

# let's see how significant each feature is (going to need to get rid of some)
for featr, wght in zip(list(X_train_labld.columns), newclf.feature_importances_):
    weights[featr].append(wght)

for feat in weights:
    weights[feat] = sum(weights[feat])/float(len(weights))
    
print('\nNumber of features in full dataset: ', len(X_train_labld.columns))
weights = pd.DataFrame(list(weights.items()), columns=['feature', 'weight'])
print('\n',weights.sort_values('weight', ascending=False))

(61528, 22)
Random Forest accuracy when predicting groupname, only known groups:  0.7523

Number of features in full dataset:  22

             feature    weight
13          natlty1  0.017975
0              year  0.008357
1            region  0.007099
12        targtype1  0.002636
18     weapsubtype1  0.001911
19            nkill  0.001700
9       attacktype1  0.001088
5         doubtterr  0.000847
14           nperps  0.000793
17        weaptype1  0.000761
6          multiple  0.000679
15          claimed  0.000546
4             crit3  0.000269
20         hostages  0.000224
7           success  0.000197
10      attacktype2  0.000109
8           suicide  0.000105
16  competingclaims  0.000083
3             crit2  0.000038
2             crit1  0.000027
11      attacktype3  0.000010
21       is_unknown  0.000000


In [26]:
# need to trim down features, obviously
# so let's use SelectFromModel to identify which features are most important
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(newclf)

sfm.fit(X_train_labld, y_train_labld)

trimmedcols = list(X_train_labld.columns[sfm.get_support(indices=True)])

trimmed_X = pd.DataFrame(sfm.transform(X_train_labld), columns=trimmedcols)
trimmed_test_X = pd.DataFrame(sfm.transform(X_test_labld), columns=trimmedcols)

# now making a new clf to fit smaller dataframes
rf_clf = RandomForestClassifier(n_estimators=100, bootstrap=True, min_samples_leaf=3, min_samples_split=3,
                          random_state=10, n_jobs=-1)

rf_clf.fit(trimmed_X, y_train_labld)

print('Selected features for trimmed dataset: ', trimmedcols)

for featr in zip(list(trimmed_X.columns), rf_clf.feature_importances_):
    print(featr)
    
# nationality of victims has importance of almost 50% ! 

Selected features for trimmed dataset:  ['year', 'region', 'targtype1', 'natlty1']
('year', 0.23043031705376724)
('region', 0.18925527085238247)
('targtype1', 0.096534940035296946)
('natlty1', 0.48377947205855348)


In [27]:
# let's see how the model performs on the trimmed data
newpreds = rf_clf.predict(trimmed_test_X)

print('Random Forest accuracy with trimmed features: ', round(metrics.accuracy_score(y_test_labld, newpreds), 4))

# the accuracy only decreased by ~4%, but using 4 features instead of 22! 

Random Forest accuracy with trimmed features:  0.7115


In [65]:
from sklearn.feature_selection import SelectFromModel

# now let's select important features for each region
clf_regions = RandomForestClassifier(n_estimators=200, random_state=15)

newsfm = SelectFromModel(clf_regions)

results_byregion = pd.DataFrame(columns=['Region', 'Accuracy', 'Num Features'])

for key, n in zip(training_y, np.arange(len(training_y))):
    newsfm.fit(training_X[key], training_y[key])
    cols = list(training_X[key].columns[newsfm.get_support(indices=True)])
    clf = RandomForestClassifier(n_estimators=50, random_state=15, n_jobs=-1)
    Xtrain = pd.DataFrame(newsfm.transform(training_X[key]), columns=cols)
    Xtest = pd.DataFrame(newsfm.transform(testing_X[key]), columns=cols)
    clf.fit(Xtrain, training_y[key])
    results_byregion.loc[n, 'Region'] = key
    results_byregion.loc[n, 'Accuracy'] = round(metrics.accuracy_score(testing_y[key], clf.predict(Xtest)), 4)
    results_byregion.loc[n, 'Num Features'] = int(len(cols))

print(results_byregion.sort_values('Accuracy', ascending=False))

                         Region Accuracy Num Features
3            Sub-Saharan Africa   0.9188            5
11          Australia & Oceania   0.8947            6
0    Middle East & North Africa   0.8906            5
6   Central America & Caribbean   0.8887            4
1                    South Asia   0.8474            5
8                 North America   0.8264            5
9                     East Asia   0.8205            9
5                Western Europe    0.794            4
2                Southeast Asia   0.7644            6
4                 South America   0.7567            4
7                Eastern Europe   0.7473            5
10                 Central Asia   0.2778            9


In [29]:
# how could we improve this further? what are the limitations?

# it was relatively arbitrary that I decided to only consider groups with 5+ attacks; could try altering this 
# to see effects on accuracy
# could try: keep all incidents in the data, but groups with < n attacks go into an 'Other' label

# why is the accuracy in some regions so low? for central/east Asia and Australia, much less data 
# and a very high % of unknown attacks

# the goal is to predict 'Unknown' attacks, but can't check accuracy on the unlabeled attacks 