In [1]:
# Coding challenge for fellowship.AI

# Molly Gibson - September 12, 2017

# data from http://www.start.umd.edu/gtd/contact/ - full GTD dataset
# data downloaded in .xlsx format, so I converted to familiar csv:
# $ pip install csvkit
# $ in2csv datasets/globalterrorismdb_0617dist.xlsx > globalterrorism_db.csv

# Goal of the challenge: use attack type, weapons used, description of the attack, etc. to build a model that
# can predict what group may have been responsible for an incident

# first, install basic dependencies: 
import pandas as pd
import numpy as np

# then upload the GTD data
# Positional indices of the GTD csv columns to load; every other column is skipped.
selected_column_positions = [1, 9, 19, 20, 21, 22, 25, 26, 27, 28, 30,
                             32, 34, 40, 58, 69, 71, 80, 81, 83, 98, 109]

df = pd.read_csv('datasets/globalterrorism_db.csv',
                 encoding='ISO-8859-1',
                 usecols=selected_column_positions)

print(df.shape, '\n')

# Friendlier names for a handful of the loaded columns.
readable_names = {
    'iyear': 'year',
    'gname': 'groupname',
    'compclaim': 'competingclaims',
    'ishostkid': 'hostages',
}
df = df.rename(columns=readable_names)


# To pick different features, list every column together with its positional index
# (only informative when the csv is read without `usecols`):
# for idx, col in enumerate(df.columns):
#     print(idx, col)

# To inspect the dtype of each column:
#print(df.dtypes)
feature_names = list(df.columns)
print('Features:\n', feature_names)

# choosing features: notes to self
# for target types and nationalities, only going to use the first col
# only taking first col of claimed, but also using compclaim (boolean, competing claims from two groups)
# weaptype and weapsubtype, only using first col


# NA values only exist in the float64 columns. Each of those columns gets its NaNs
# replaced by a sentinel value and is then cast to int:
#   attacktype2 / attacktype3 -> 9   (9: unknown for attack types)
#   weaptype1                 -> 13  (13: unknown for weapon types)
#   nperps                    -> -99
#   claimed                   -> 0
#   all remaining columns     -> -9
na_fill_values = {
    'attacktype2': 9,
    'attacktype3': 9,
    'natlty1': -9,
    'nperps': -99,
    'claimed': 0,
    'competingclaims': -9,
    'weaptype1': 13,
    'weapsubtype1': -9,
    'nkill': -9,
    'hostages': -9,
}

for column, fill_value in na_fill_values.items():
    df[column] = df[column].fillna(fill_value).astype(int)


(170350, 22) 

Features:
 ['year', 'region', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple', 'success', 'suicide', 'attacktype1', 'attacktype2', 'attacktype3', 'targtype1', 'natlty1', 'groupname', 'nperps', 'claimed', 'competingclaims', 'weaptype1', 'weapsubtype1', 'nkill', 'hostages']


In [2]:
first_year, last_year = df.year.min(), df.year.max()
print('This dataset contains terrorist attacks from', first_year, 'to', last_year)

This dataset contains terrorist attacks from 1970 to 2016


In [3]:
# Count the number of recorded attacks by each group within each region;
# we will probably only want to consider groups with > n attacks.
# The result is sorted by attack count, most active first.
counts = (df[['region', 'groupname']]
          .groupby(['region', 'groupname'])
          .size()
          .reset_index(name='count')
          .sort_values(['count'], ascending=False))

print('number of terrorist groups in the database: ', len(counts.groupname.unique()))

# only keep (region, group) pairs that are responsible for 5+ attacks
counts = counts[counts['count'] > 4]

print('number of groups we are considering: ', len(counts.groupname.unique()))

# then alter dataframe to only include rows whose groupname survived the filter above
df = df[df.groupname.isin(counts.groupname)]
print(df.shape)

# dict of each region name corresponding with its number
regdict = {'1':'North America', '2':'Central America & Caribbean', '3':'South America', '4':'East Asia',
          '5':'Southeast Asia', '6':'South Asia', '7':'Central Asia', '8':'Western Europe', '9':'Eastern Europe',
          '10':'Middle East & North Africa', '11':'Sub-Saharan Africa', '12':'Australia & Oceania'}

# let's divide the dataset into geographical regions

# separate each region into its own dataframe, stored in dictionary
# DFs - main dataframe divided into regions
# countsbyreg - frequency of groups in each region
# DFs_knowngnames - same as DFs, but only attacks with a labeled groupname (no 'Unknown')
DFs = {}
countsbyreg = {}
DFs_knowngnames = {}

# `counts` holds one row per (region, group) pair, so walk the distinct region codes only.
# unique() keeps first-appearance order, so the dictionaries are filled in the same order
# as when iterating over every row.
for reg in counts.region.unique():
    name = regdict[str(reg)]
    region_rows = df[df.region == reg]
    # .copy() gives each region an independent frame, so columns can be added to it later
    # without writing into a slice of `df`.
    DFs[name] = region_rows.copy()
    countsbyreg[name] = counts[counts.region == reg]
    # The labeled-only subset is filtered right here: `known_df` is only created in a later
    # cell, so referring to it at this point fails on a fresh kernel.
    DFs_knowngnames[name] = region_rows[region_rows.groupname != 'Unknown'].copy()


number of terrorist groups in the database:  3454
number of groups we are considering:  870
(166204, 22)


In [6]:
# to get an idea of the top 5 frequently occurring groupnames in each region
# and the number of groups with 5+ attacks in each region
for key in countsbyreg:
    region_counts = countsbyreg[key]
    # Sum the matching 'count' cells instead of calling float() on the filtered Series:
    # the sum is 0 for a region without an 'Unknown' row (float() would raise there) and
    # equals the single matching value otherwise.
    unknown_attacks = region_counts.loc[region_counts.groupname == 'Unknown', 'count'].sum()
    unknown_pct = round(float(unknown_attacks) / float(len(DFs[key])) * 100, 2)
    print(key,'\n\n',len(region_counts),' groups with 5+ attacks', '\n',
          len(DFs[key]),' incidents', '\n\n',
          unknown_pct,
          '% unknown groupnames', '\n\n', region_counts.head(),'\n')
    print('-'*100)

Middle East & North Africa 

 170  groups with 5+ attacks 
 45766  incidents 

 60.2 % unknown groupnames 

       region                                    groupname  count
3484      10                                      Unknown  27550
3102      10  Islamic State of Iraq and the Levant (ISIL)   4260
3184      10               Kurdistan Workers' Party (PKK)   1980
3306      10                                 Palestinians   1104
2854      10     Al-Qaida in the Arabian Peninsula (AQAP)    972 

----------------------------------------------------------------------------------------------------
South Asia 

 195  groups with 5+ attacks 
 40927  incidents 

 50.11 % unknown groupnames 

       region                                       groupname  count
1719       6                                         Unknown  20508
1642       6                                         Taliban   6574
1292       6  Communist Party of India - Maoist (CPI-Maoist)   1766
1459       6         Liberation 

In [4]:
# 'Unknown' is the most common groupname in every region
unknown_mask = df.groupname == 'Unknown'
unknown_share = float(unknown_mask.sum() / df.groupname.count())
print("Total percentage of attacks by unknown terrorist groups: ",
      round(unknown_share * 100, 3))

# first step towards classifying unknown vs. known attacks:
# record the distinction as a new boolean column on the dataframe
df['is_unknown'] = np.where(unknown_mask, True, False)

# split the unlabeled attacks from the labeled ones; the known attacks let us check the accuracy
# of our models, while the ultimate goal is to predict which groups are responsible for the
# unknown attacks
unknown_df = df[unknown_mask]

known_df = df[~unknown_mask]

print(known_df.shape)

Total percentage of attacks by unknown terrorist groups:  47.114
(87898, 23)


In [89]:
# now onto sklearn!
from time import time
from operator import itemgetter
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# suppress warnings
import warnings
warnings.filterwarnings("ignore") #, category=DeprecationWarning)




# per-region score table (one row per region, filled in by the experiments)
results = pd.DataFrame(columns=['Region', 'F1'])

# one score table per algorithm, keyed by algorithm name, so the algorithms can be compared
classifier_scores, classifier_scores_lbld = {}, {}


# so that we only are classifying between the 5 main groups, or else 'other'

# ok... also... instead of getting rid of the Unknown groupnames, let's try classifying them as 'other'

# for key in countsbyreg:
#     DFs[key].loc[:, 'gname'] = DFs[key].loc[:,'groupname'] \
#                 .apply(lambda name: top_or_other(name, top_groups[key]))


def top_or_other(gname, toplist):
    """Return `gname` unchanged when it is a member of `toplist`, otherwise the label 'Other'."""
    return gname if gname in toplist else 'Other'

# top_groups[region] - the 5 most active named groups of that region.
# A 'gname' column is then added to every regional frame: the group's own name when it is
# one of those 5, otherwise 'Other'.
top_groups = {}
for key in countsbyreg:
    # countsbyreg[key] is ordered by attack count (descending). Filter 'Unknown' out by name
    # instead of skipping row 0, so the result does not depend on 'Unknown' being the most
    # frequent entry of the region.
    ranked_names = countsbyreg[key]['groupname']
    top_groups[key] = list(ranked_names[ranked_names != 'Unknown'].head(5))

    # assign() returns a new frame, so nothing is written into a slice of the main dataframe.
    DFs_knowngnames[key] = DFs_knowngnames[key].assign(
        gname=DFs_knowngnames[key]['groupname'].apply(lambda name: top_or_other(name, top_groups[key])))
    DFs[key] = DFs[key].assign(
        gname=DFs[key]['groupname'].apply(lambda name: top_or_other(name, top_groups[key])))
    

# Two sets of training/testing data are created, one split per region:
#   * training_X / testing_X / training_y / testing_y
#       built from DFs, where 'Unknown' attacks carry the label 'Other'
#   * training_X_known / testing_X_known / training_y_known / testing_y_known
#       built from DFs_knowngnames, which excludes 'Unknown' attacks altogether

def _split_regions(frames_by_region):
    """Split every regional frame 70/30 into train/test features and 'gname' labels.

    Returns four dicts keyed by region name: (X_train, X_test, y_train, y_test).
    The features are all columns except 'groupname', 'gname' and 'region'.
    """
    X_train, X_test, y_train, y_test = {}, {}, {}, {}
    for region, frame in frames_by_region.items():
        features = frame.drop(['groupname', 'gname', 'region'], axis=1)
        (X_train[region], X_test[region],
         y_train[region], y_test[region]) = train_test_split(features, frame.gname,
                                                             test_size=0.3, random_state=7)
    return X_train, X_test, y_train, y_test


# classifying Unknown labels as 'other'
training_X, testing_X, training_y, testing_y = _split_regions(DFs)

# excluding Unknown labels
training_X_known, testing_X_known, training_y_known, testing_y_known = _split_regions(DFs_knowngnames)



In [93]:
# feature columns of one regional training set
sample_training_features = training_X['Australia & Oceania']
print(sample_training_features.columns)

Index(['year', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple', 'success',
       'suicide', 'attacktype1', 'attacktype2', 'attacktype3', 'targtype1',
       'natlty1', 'nperps', 'claimed', 'competingclaims', 'weaptype1',
       'weapsubtype1', 'nkill', 'hostages'],
      dtype='object')


In [99]:
# trying out a few different classification algorithms to see which performs the best
names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5), 
    RandomForestClassifier(n_estimators=100, max_features=20),
    MLPClassifier(alpha=1),
    AdaBoostClassifier()]


# for name, clf in zip(names, classifiers):
#     for key, n in zip(training_y, np.arange(len(training_y))):
#         clf.fit(training_X[key], training_y[key])
#         results.loc[n, 'Region'] = key
#         results.loc[n, 'F1'] = round(metrics.f1_score(testing_y[key], clf.predict(testing_X[key]), average='weighted'), 4)
#     classifier_scores[name] = results.copy()

# %store classifier_scores

%store -r classifier_scores

print("F1 SCORES WITH UNKNOWN LABELS CLASSIFIED AS 'OTHER' IN TRAINING DATA\n\n")
for alg in classifier_scores:
    print(alg, '\n', classifier_scores[alg].sort_values('F1', ascending=False), '\n')

F1 SCORES WITH UNKNOWN LABELS CLASSIFIED AS 'OTHER' IN TRAINING DATA


Nearest Neighbors 
                          Region      F1
11          Australia & Oceania  0.9377
10                 Central Asia  0.8911
9                     East Asia  0.8835
0    Middle East & North Africa  0.8793
6   Central America & Caribbean  0.8739
8                 North America  0.8343
3            Sub-Saharan Africa  0.8273
5                Western Europe  0.8221
1                    South Asia  0.8104
2                Southeast Asia  0.7633
7                Eastern Europe  0.7631
4                 South America  0.7383 

Decision Tree 
                          Region      F1
11          Australia & Oceania  0.9372
10                 Central Asia  0.9062
6   Central America & Caribbean  0.8766
0    Middle East & North Africa    0.86
8                 North America  0.8305
9                     East Asia  0.8161
3            Sub-Saharan Africa  0.8022
1                    South Asia  0.7769
5          

In [97]:
# for name, clf in zip(names, classifiers):
#     print(name)
#     for key, n in zip(training_y, np.arange(len(training_y))):
#         clf.fit(training_X[key], training_y[key])
#         results.loc[n, 'Region'] = key
#         results.loc[n, 'F1'] = round(metrics.accuracy_score(testing_y[key], clf.predict(testing_X[key])), 4)
#     print(results.sort_values('F1', ascending=False))
#     classifier_scores_lbld[name] = results.copy()

# %store classifier_scores_lbld

%store -r classifier_scores_lbld

print("F1 SCORES WITH UNKNOWN LABELS REMOVED FROM TRAINING DATA\n")
for alg in classifier_scores_lbld:
    print(alg, '\n', classifier_scores_lbld[alg].sort_values('F1', ascending=False), '\n')

F1 SCORES WITH UNKNOWN LABELS REMOVED FROM TRAINING DATA


Nearest Neighbors 
                          Region      F1
11          Australia & Oceania  0.9429
10                 Central Asia  0.9067
9                     East Asia  0.8843
0    Middle East & North Africa  0.8811
6   Central America & Caribbean  0.8742
8                 North America   0.835
5                Western Europe  0.8265
3            Sub-Saharan Africa  0.8263
1                    South Asia  0.8153
7                Eastern Europe  0.7756
2                Southeast Asia  0.7635
4                 South America  0.7412 

Decision Tree 
                          Region      F1
11          Australia & Oceania  0.9286
10                 Central Asia    0.92
6   Central America & Caribbean  0.8755
9                     East Asia  0.8657
0    Middle East & North Africa  0.8629
8                 North America  0.8623
1                    South Asia  0.8124
3            Sub-Saharan Africa   0.809
5                Wester

In [103]:
# EDIT: ADDED LATER
# to test out the different algorithms after reducing the dataset to only 5 features
reduced_features = ['year', 'attacktype1', 'weaptype1', 'natlty1', 'targtype1']

names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "Neural Net"]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5),
    # max_features cannot exceed the number of columns the forest is fitted on
    RandomForestClassifier(n_estimators=100, max_features=len(reduced_features)),
    MLPClassifier(alpha=1)]

# The header is derived from the feature list so it always states the real number of
# features (it used to say 6 while 5 were used).
print("F1 SCORES USING ONLY {0} FEATURES IN TRAINING DATA\n".format(len(reduced_features)))

for name, clf in zip(names, classifiers):
    print('\n',name)
    for n, key in enumerate(training_y):
        clf.fit(training_X[key][reduced_features], training_y[key])
        region_preds = clf.predict(testing_X[key][reduced_features])
        results.loc[n, 'Region'] = key
        results.loc[n, 'F1'] = round(metrics.f1_score(testing_y[key], region_preds,
                                                      average='weighted'), 4)
    print(results.sort_values('F1', ascending=False))

# random forest still performs the best

F1 SCORES USING ONLY 6 FEATURES IN TRAINING DATA



 Nearest Neighbors
                         Region      F1
10                 Central Asia  0.9202
11          Australia & Oceania  0.8932
6   Central America & Caribbean  0.8576
0    Middle East & North Africa   0.836
8                 North America  0.8351
3            Sub-Saharan Africa  0.8331
9                     East Asia   0.829
5                Western Europe  0.7987
7                Eastern Europe  0.7674
1                    South Asia  0.7649
2                Southeast Asia  0.7554
4                 South America  0.6935

 Decision Tree
                         Region      F1
10                 Central Asia  0.9181
11          Australia & Oceania  0.9069
6   Central America & Caribbean  0.8705
8                 North America  0.8223
0    Middle East & North Africa  0.8108
3            Sub-Saharan Africa  0.8025
9                     East Asia  0.7992
5                Western Europe  0.7434
7                Eastern Europe  

In [109]:
# random forest seems to give the best results, so let's explore it further!
clf = RandomForestClassifier(n_estimators=100,
                             max_features=5,
                             min_samples_split=10,
                             random_state=78,
                             n_jobs=-1)


# candidate values for the grid search that tries to optimize the forest's parameters
param_grid = dict(
    max_depth=[3, None],
    max_features=[1, 5, 20],
    min_samples_split=[3, 10],
    min_samples_leaf=[1, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# adapted from the sklearn docs
def report(grid_scores, n_top=5):
    """Print the `n_top` best entries of `grid_scores`, best first.

    Entries are ranked by their element at index 1 (descending). Each entry must expose
    `mean_validation_score`, `cv_validation_scores` and `parameters` attributes.
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              entry.mean_validation_score, spread))
        print("Parameters: {0}".format(entry.parameters))
        print(" ")

# split up training and testing sets: (just classifying known vs unknown here for the purpose of tuning parameters)
X_train, X_test, y_train_bool, y_test_bool = train_test_split(df.drop(['groupname', 'is_unknown'], axis=1), df.is_unknown, 
                                                    test_size=0.3, random_state=7)

# these training/testing dataframes are a hodgepodge of all the regions to save time on the grid search

# commenting this out bc it takes a really long time
# grid_search = GridSearchCV(clf, param_grid=param_grid)
# start = time()

# grid_search.fit(X_train, y_train_bool)

# print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
#       % (time() - start, len(grid_search.grid_scores_)))

# grid_scores = grid_search.grid_scores_

# %store grid_scores

%store -r grid_scores

report(grid_scores)

# 0.847 is the max. accuracy we can get with random forest to classify labeled vs unlabeled attacks

Model with rank: 1
Mean validation score: 0.847 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 2
Mean validation score: 0.846 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 3
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 20, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 4
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 5
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_

In [117]:

# Random forest per region, trained and scored on the splits that exclude 'Unknown' attacks.
clf = RandomForestClassifier(n_estimators=200, max_features=20, min_samples_split=10, random_state=666)

for n, key in enumerate(training_y_known):
    clf.fit(training_X_known[key], training_y_known[key])
    #print(training_y[key].unique())
    region_preds = clf.predict(testing_X_known[key])
    results.loc[n, 'Region'] = key
    results.loc[n, 'F1'] = round(metrics.f1_score(testing_y_known[key], region_preds,
                                                  average='weighted'), 4)


print('Random Forest F1 Scores - Unknown labels removed\n')
print(results.sort_values('F1', ascending=False))

# Leave Central Asia out by name. Dropping the last row of the sorted table only excludes
# Central Asia for as long as it happens to be the lowest-scoring region.
f1_without_central_asia = results.loc[results.Region != 'Central Asia', 'F1'].astype(float)
print('\nMean F1 Score using Random Forest for all regions(exluding Central Asia): ',
      round(f1_without_central_asia.mean(), 4))

Random Forest F1 Scores - Unknown labels removed

                         Region      F1
11          Australia & Oceania  0.9464
3            Sub-Saharan Africa  0.9312
6   Central America & Caribbean  0.9102
0    Middle East & North Africa  0.9081
1                    South Asia  0.8689
9                     East Asia  0.8457
8                 North America  0.8418
5                Western Europe  0.8015
2                Southeast Asia  0.7921
4                 South America  0.7865
7                Eastern Europe  0.7264
10                 Central Asia  0.3722

Mean F1 Score using Random Forest for all regions(exluding Central Asia):  0.8508


In [116]:
# number of feature columns in one regional training set
n_training_features = len(training_X['Australia & Oceania'].columns)
print(n_training_features)

20


In [118]:

    
# Random forest per region, this time on the splits where 'Unknown' attacks are labeled 'Other'.
rfclf = RandomForestClassifier(random_state=666, min_samples_split=10,
                               max_features=20, n_estimators=100)

#print(pd.Series(training_y['Australia & Oceania']).unique())
for n, key in enumerate(training_y):
    rfclf.fit(training_X[key], training_y[key])
    #print(training_y[key].unique())
    weighted_f1 = metrics.f1_score(testing_y[key], rfclf.predict(testing_X[key]), average='weighted')
    results.loc[n, 'Region'] = key
    results.loc[n, 'F1'] = round(weighted_f1, 4)

print("Random Forest F1 Scores - Unknown labels as 'other'\n")

ranked_results = results.sort_values('F1', ascending=False)
print(ranked_results)


# so looks like that's a bad method because, for example, Central Asia gets a really high score (second best)
# but we know that most of the data there is missing

Random Forest F1 Scores - Unknown labels as 'other'

                         Region      F1
11          Australia & Oceania  0.9522
10                 Central Asia  0.9207
0    Middle East & North Africa   0.906
6   Central America & Caribbean  0.9044
9                     East Asia  0.8978
3            Sub-Saharan Africa   0.869
8                 North America  0.8681
1                    South Asia  0.8554
5                Western Europe  0.8491
2                Southeast Asia  0.8064
7                Eastern Europe  0.8006
4                 South America  0.7818


In [119]:
# OK now let's get the classification_report for each reg

for key in training_y_known:
    clf.fit(training_X_known[key], training_y_known[key])
    #print(training_y[key].unique())
    # list.append() returns None and modifies the list in place, so passing its result as
    # `labels` handed None to classification_report and added another 'Other' to
    # top_groups[key] on every run. Build a separate label list instead; dict.fromkeys
    # removes duplicates (e.g. an 'Other' left behind by an earlier run) and keeps the order.
    report_labels = list(dict.fromkeys(top_groups[key] + ['Other']))
    print(key, '\n')
    print(metrics.classification_report(testing_y_known[key], clf.predict(testing_X_known[key]),
                                        labels=report_labels))
    print('-'*100)




# checking why Kanak Socialist (...) wasn't in the predictions at all

#print(pd.Series(testing_y['Australia & Oceania']).unique())
#testdf = pd.DataFrame(testing_y['Australia & Oceania'])

# ah.. because it was only the true label once
# print(len(testdf[testdf.gname=='Kanak Socialist National Liberation Front']))
# print(len(training_X['Australia & Oceania']))


Middle East & North Africa 

                                             precision    recall  f1-score   support

   Al-Qaida in the Arabian Peninsula (AQAP)       0.73      0.82      0.78       279
            Houthi extremists (Ansar Allah)       0.86      0.82      0.84       261
Islamic State of Iraq and the Levant (ISIL)       0.94      0.93      0.93      1318
             Kurdistan Workers' Party (PKK)       0.93      0.94      0.93       602
                                      Other       0.92      0.92      0.92      2680
                               Palestinians       0.88      0.76      0.82       325

                                avg / total       0.91      0.91      0.91      5465

----------------------------------------------------------------------------------------------------
South Asia 

                                                precision    recall  f1-score   support

Communist Party of India - Maoist (CPI-Maoist)       0.77      0.81      0.79       5

In [72]:
###### testing #########

# refit the shared classifier on one region and look at its weighted F1 in isolation
test_region = 'Australia & Oceania'

clf.fit(training_X[test_region], training_y[test_region])

mypreds = clf.predict(testing_X[test_region])

# variant restricted to an explicit list of labels:
# print('F1: ', metrics.f1_score(testing_y['Australia & Oceania'], mypreds, average='weighted',
#                                labels=['Other',
#                                     'Bougainville Revolutionary Army (BRA)',
#                                     'Kanak Socialist National Liberation Front']))

rprt = metrics.f1_score(testing_y[test_region], mypreds, average='weighted')

print(rprt)

0.952212223472


In [132]:
print(known_df.columns)

# flatten the first three top names of every region into one list,
# so the algorithm can be run on the whole dataset:
tops = [name for region in top_groups for name in top_groups[region][:3]]

len(tops)

Index(['year', 'region', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple',
       'success', 'suicide', 'attacktype1', 'attacktype2', 'attacktype3',
       'targtype1', 'natlty1', 'groupname', 'nperps', 'claimed',
       'competingclaims', 'weaptype1', 'weapsubtype1', 'nkill', 'hostages',
       'is_unknown'],
      dtype='object')


36

In [139]:
# OK, next let's do some feature selection! 
from collections import defaultdict

weights = defaultdict(list)


# Label every known attack with its group name when that name is in `tops`, else 'Other'.
# assign() returns a new frame, so nothing is written into the slice of `df` that known_df is.
known_df = known_df.assign(gname=known_df['groupname'].apply(lambda name: top_or_other(name, tops)))


# splitting up the whole dataframe into training/testing sets,
# using only the labeled points - i.e. only predict groups where we can test our accuracy
# our goal is to predict which groups are responsible for the unlabeled attacks
labeled_features = known_df.drop(['groupname', 'is_unknown', 'gname'], axis=1)
X_train_labld, X_test_labld, y_train_labld, y_test_labld = train_test_split(labeled_features, known_df.gname,
                                                                            test_size=0.3, random_state=7)

print(X_train_labld.columns)
#print(known_df.columns)
newclf = RandomForestClassifier(n_estimators=100, max_features=20, min_samples_split=10, random_state=666)

newclf.fit(X_train_labld, y_train_labld)

labld_preds = newclf.predict(X_test_labld)

print('Random Forest f1 score predicting groupname on whole dataset (only known groups) using region as a feature: ',
      round(metrics.f1_score(y_test_labld, labld_preds, average='weighted'), 4))

# list.append() returns None and mutates `tops`, so passing its result as `labels` handed
# None to classification_report. Build a separate label list instead; dict.fromkeys drops
# duplicate names (including an 'Other' appended by an earlier run) and keeps the order.
report_labels = list(dict.fromkeys(tops + ['Other']))
print(metrics.classification_report(y_test_labld, labld_preds, labels=report_labels))
# # let's see how significant each feature is (going to need to get rid of some)
# for featr, wght in zip(list(X_train_labld.columns), newclf.feature_importances_):
#     weights[featr].append(wght)

# for feat in weights:
#     weights[feat] = sum(weights[feat])/float(len(weights))
    
# print('\nNumber of features in full dataset: ', len(X_train_labld.columns))
# weights = pd.DataFrame(list(weights.items()), columns=['feature', 'weight'])
# print('\n',weights.sort_values('weight', ascending=False))

Index(['year', 'region', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple',
       'success', 'suicide', 'attacktype1', 'attacktype2', 'attacktype3',
       'targtype1', 'natlty1', 'nperps', 'claimed', 'competingclaims',
       'weaptype1', 'weapsubtype1', 'nkill', 'hostages'],
      dtype='object')
Random Forest f1 score predicting groupname on whole dataset (only known groups) using region as a feature:  0.893
                                                  precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.60      0.43      0.50       134
        African National Congress (South Africa)       0.94      0.87      0.91       199
                                      Al-Shabaab       0.98      0.97      0.98       797
                        Anti-Abortion extremists       0.98      0.97      0.98        67
                           Azerbaijan Guerrillas       0.00      0.00      0.00         4
             Basque Fatherland and Freedom

In [140]:
# need to trim down features, obviously
# so let's use SelectFromModel to identify which features are most important
from sklearn.feature_selection import SelectFromModel

# let SelectFromModel pick the most important features, using the forest fitted above
sfm = SelectFromModel(newclf)

sfm.fit(X_train_labld, y_train_labld)

# names of the columns the selector kept
trimmedcols = list(X_train_labld.columns[sfm.get_support(indices=True)])

# training / testing frames reduced to the selected columns
trimmed_X, trimmed_test_X = (pd.DataFrame(sfm.transform(split), columns=trimmedcols)
                             for split in (X_train_labld, X_test_labld))

# a fresh classifier, fitted on the smaller training frame
rf_clf = RandomForestClassifier(n_estimators=100,
                                bootstrap=True,
                                min_samples_leaf=3,
                                min_samples_split=3,
                                random_state=10,
                                n_jobs=-1)

rf_clf.fit(trimmed_X, y_train_labld)

print('Selected features for trimmed dataset: ', trimmedcols)

for feature_and_importance in zip(trimmed_X.columns, rf_clf.feature_importances_):
    print(feature_and_importance)

# natlty1 carries by far the largest importance (0.58 in the recorded run)

Selected features for trimmed dataset:  ['year', 'region', 'natlty1']
('year', 0.17177668338135857)
('region', 0.24798534307027884)
('natlty1', 0.58023797354836237)


In [148]:
# let's see how the model performs on the trimmed data
newpreds = rf_clf.predict(trimmed_test_X)

# list.append() returns None and mutates `tops`, so passing its result as `labels` handed
# None to classification_report. Build a separate label list instead; dict.fromkeys drops
# duplicate names (including an 'Other' appended by an earlier run) and keeps the order.
report_labels = list(dict.fromkeys(tops + ['Other']))
print('Random Forest on whole dataset with trimmed features: \n',
      metrics.classification_report(y_test_labld, newpreds, labels=report_labels))

# f1 only decreased by ~4%, but using 3 features instead of 21!

Random Forest on whole dataset with trimmed features: 
                                                   precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.42      0.07      0.13       134
        African National Congress (South Africa)       0.94      0.89      0.92       199
                                      Al-Shabaab       0.92      0.97      0.94       797
                        Anti-Abortion extremists       0.62      0.12      0.20        67
                           Azerbaijan Guerrillas       0.00      0.00      0.00         4
             Basque Fatherland and Freedom (ETA)       0.82      0.89      0.85       656
                                      Boko Haram       0.87      0.91      0.89       598
           Bougainville Revolutionary Army (BRA)       0.80      1.00      0.89         8
                                  Chechen Rebels       0.96      0.94      0.95        90
                 Chukakuha (Middle Core Fac

In [153]:
from sklearn.feature_selection import SelectFromModel

# now let's select important features for each region
clf_regions = RandomForestClassifier(n_estimators=200, random_state=15)

newsfm = SelectFromModel(clf_regions)

# one row per region: weighted F1 of the reduced model and how many features it used
results_byregion = pd.DataFrame(columns=['Region', 'F1 Score', 'Num Features'])

for n, key in enumerate(training_y_known):
    # pick this region's important features, then train a smaller forest on just those
    newsfm.fit(training_X_known[key], training_y_known[key])
    cols = list(training_X_known[key].columns[newsfm.get_support(indices=True)])
    clf = RandomForestClassifier(n_estimators=50, random_state=15, n_jobs=-1)
    Xtrain = pd.DataFrame(newsfm.transform(training_X_known[key]), columns=cols)
    Xtest = pd.DataFrame(newsfm.transform(testing_X_known[key]), columns=cols)
    clf.fit(Xtrain, training_y_known[key])
    predictions = clf.predict(Xtest)
    results_byregion.loc[n, 'Region'] = key
    results_byregion.loc[n, 'F1 Score'] = round(metrics.f1_score(testing_y_known[key], predictions,
                                                                 average='weighted'), 4)
    results_byregion.loc[n, 'Num Features'] = int(len(cols))
    # Add 'Other' to the report labels explicitly. It used to show up only when another
    # cell had already appended it to top_groups[key]; dict.fromkeys removes the duplicate
    # in that case and keeps the order.
    report_labels = list(dict.fromkeys(top_groups[key] + ['Other']))
    print('-'*100,'\n',key)
    print(metrics.classification_report(testing_y_known[key], predictions, labels=report_labels))

print('-'*100,'\n','Summary of all regions')
print(results_byregion.sort_values('F1 Score', ascending=False))

---------------------------------------------------------------------------------------------------- 
 Middle East & North Africa
                                             precision    recall  f1-score   support

Islamic State of Iraq and the Levant (ISIL)       0.93      0.93      0.93      1318
             Kurdistan Workers' Party (PKK)       0.89      0.92      0.91       602
                               Palestinians       0.81      0.74      0.78       325
   Al-Qaida in the Arabian Peninsula (AQAP)       0.72      0.79      0.76       279
            Houthi extremists (Ansar Allah)       0.81      0.81      0.81       261
                                      Other       0.91      0.90      0.90      2680

                                avg / total       0.89      0.89      0.89      5465

---------------------------------------------------------------------------------------------------- 
 South Asia
                                                precision    recall  f1-s

In [29]:
# how could we improve this further? what are the limitations?

# it was relatively arbitrary that I decided to only consider groups with 5+ attacks; could try altering this
# threshold to see the effect on accuracy
# could try: keep all incidents in the data, but groups with < n attacks go into an 'Other' label

# why is the accuracy in some regions so low? for central/east Asia and Australia, much less data 
# and a very high % of unknown attacks

# the goal is to predict 'Unknown' attacks, but can't check accuracy on the unlabeled attacks 