In [1]:
# Coding challenge for fellowship.AI

# Molly Gibson - September 12, 2017

# data from http://www.start.umd.edu/gtd/contact/ - full GTD dataset
# data downloaded in .xlsx format, so I converted to familiar csv:
# $ pip install csvkit
# $ in2csv datasets/globalterrorismdb_0617dist.xlsx > globalterrorism_db.csv

# Goal of the challenge: use attack type, weapons used, description of the attack, etc. to build a model that
# can predict what group may have been responsible for an incident

# first, install basic dependencies: 
import pandas as pd
import numpy as np

# then upload the GTD data
df = pd.read_csv('datasets/globalterrorism_db.csv', encoding='ISO-8859-1', usecols=[1,9,19,20,21,22,25,26,27,28,30,
                                                                                  32,34,40,58,69,71,80,81,83,98,109])

print(df.shape, '\n')

# rename a few columns for readability:
df = df.rename(columns={'iyear':'year', 'gname':'groupname','compclaim':'competingclaims', 'ishostkid':'hostages'})


# get the names of each column along w index so we can pick which features to use
# for idx, col in enumerate(df.columns):
#     print(idx, col)
# don't need after we add usecols=[] to pd.read_csv, but might want to go back to use diff features

# print datatypes of each column
#print(df.dtypes)
print('Features:\n',list(df.columns))

# choosing features: notes to self
# for target types and nationalities, only going to use the first col
# only taking first col of claimed, but also using compclaim (boolean, competing claims from two groups)
# weaptype and weapsubtype, only using first col


# now we need to deal with NA values, which exist in the float64 dtype columns
# 9:unknown for attack types
df.attacktype2 = df.attacktype2.fillna(9).astype(int)
df.attacktype3 = df.attacktype3.fillna(9).astype(int)

df.natlty1 = df.natlty1.fillna(-9).astype(int)

# changing NaN to -9 
df.nperps = df.nperps.fillna(-99).astype(int)
df.claimed, df.competingclaims = df.claimed.fillna(0).astype(int), df.competingclaims.fillna(-9).astype(int)
# 13:unknown for weapon types
df.weaptype1, df.weapsubtype1 = df.weaptype1.fillna(13).astype(int), df.weapsubtype1.fillna(-9).astype(int)
df.nkill, df.hostages = df.nkill.fillna(-9).astype(int), df.hostages.fillna(-9).astype(int)


(170350, 22) 

Features:
 ['year', 'region', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple', 'success', 'suicide', 'attacktype1', 'attacktype2', 'attacktype3', 'targtype1', 'natlty1', 'groupname', 'nperps', 'claimed', 'competingclaims', 'weaptype1', 'weapsubtype1', 'nkill', 'hostages']


In [2]:
print('This dataset contains terrorist attacks from', df.year.min(), 'to', df.year.max()) 

This dataset contains terrorist attacks from 1970 to 2016


In [26]:
# Now let's count the number of recorded attacks by each group
# we will probably only want to consider groups with > n attacks
counts = df[['region','groupname']].groupby(['region','groupname']).size().reset_index(name='count') \
                            .sort_values(['count'],ascending=False)

print('number of terrorist groups in the database: ', len(counts.groupname.unique()))

# only include groups that are responsibile for 3+ attacks
counts = counts[counts['count'] > 2]


# then alter dataframe to only include rows in which groupname appears 5+ times
df = df[df.groupname.isin(counts.groupname)]
print('number of terrorist groups we are considering: ', len(counts.groupname.unique()))

# print(df.shape)

# dict of each region name corresponding with its number
regdict = {'1':'North America', '2':'Central America & Caribbean', '3':'South America', '4':'East Asia',
          '5':'Southeast Asia', '6':'South Asia', '7':'Central Asia', '8':'Western Europe', '9':'Eastern Europe',
          '10':'Middle East & North Africa', '11':'Sub-Saharan Africa', '12':'Australia & Oceania'}




number of terrorist groups in the database:  3454
number of terrorist groups we are considering:  1270


In [29]:
# 'Unknown' is the most common groupname in every region
print("Total percentage of attacks by unknown terrorist groups: ", 
      round(float(df.groupname[df.groupname=='Unknown'].count() / df.groupname.count())*100,3))

# first off, let's see if we can classify unknown vs. known attacks
# in order to do so, add a new boolean column to the dataframe
df['is_unknown'] = np.where(df.groupname == 'Unknown', True, False)

# separate the unlabeled attacks from the labeled ones; can use known attacks to check accuracy of our models but 
# ultimately the goal is to predict which groups are responsible for the unknown attacks
unknown_df = df[df.groupname == 'Unknown']

known_df = df[df.groupname != 'Unknown']

print(known_df.shape)

# let's divide the dataset into geographical regions 

# separate each region into its own dataframe, stored in dictionary
# DFs - main dataframe divided into regions
# countsbyreg - frequency of groups in each region
DFs = {}
countsbyreg = {}
for reg in counts.region:
    name = regdict[str(reg)]
    DFs[name] = df[df.region==reg]
    countsbyreg[name] = counts[counts.region==reg]

# # let's try only using attacks with labeled groupnames, excluding 'Unknown' groups

DFs_knowngnames = {}
for reg in counts.region:
    name = regdict[str(reg)]
    DFs_knowngnames[name] = known_df[known_df.region==reg]

Total percentage of attacks by unknown terrorist groups:  46.712
(89328, 23)


In [30]:
# to get an idea of the top 5 frequently occuring groupnames in each region
# and the number of groups with over 5 attacks in each region 
for key in countsbyreg:
    print(key,'\n\n',len(countsbyreg[key]),' groups', '\n', 
          len(DFs[key]),' incidents', '\n\n', 
          round(float(countsbyreg[key]['count'][countsbyreg[key].groupname=='Unknown'])/float(len(DFs[key]))*100, 2),
          '% unknown groupnames', '\n\n', countsbyreg[key].head(),'\n')
    print('-'*100)

Middle East & North Africa 

 249  groups 
 46005  incidents 

 59.88 % unknown groupnames 

       region                                    groupname  count
3484      10                                      Unknown  27550
3102      10  Islamic State of Iraq and the Levant (ISIL)   4260
3184      10               Kurdistan Workers' Party (PKK)   1980
3306      10                                 Palestinians   1104
2854      10     Al-Qaida in the Arabian Peninsula (AQAP)    972 

----------------------------------------------------------------------------------------------------
South Asia 

 266  groups 
 41148  incidents 

 49.84 % unknown groupnames 

       region                                       groupname  count
1719       6                                         Unknown  20508
1642       6                                         Taliban   6574
1292       6  Communist Party of India - Maoist (CPI-Maoist)   1766
1459       6         Liberation Tigers of Tamil Eelam (LTTE)   

In [31]:
# now onto sklearn!
from time import time
from operator import itemgetter
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# suppress warnings
import warnings
warnings.filterwarnings("ignore") #, category=DeprecationWarning)




# create a df for accuracy of our algorithms in each region
results = pd.DataFrame(columns=['Region', 'F1'])

# and a dictionary to compare different algorithms, holding a df of accuracies for each one   
classifier_scores = {}
classifier_scores_lbld = {}


# so that we only are classifying between the 5 main groups, or else 'other'

# ok... also... instead of getting rid of the Unknown groupnames, let's try classifying them as 'other'


def top_or_other(gname, toplist):
    if gname in toplist:
        return gname
    else:
        return 'Other'

top_groups = {}
for key in countsbyreg:
    top_groups[key] = list(countsbyreg[key]['groupname'][1:6])
    # unknown labels removed
    DFs_knowngnames[key].loc[:, 'gname'] = DFs_knowngnames[key].loc[:,'groupname'] \
                .apply(lambda name: top_or_other(name, top_groups[key]))
    # unknown labels classified as 'other'
    DFs[key].loc[:, 'gname'] = DFs[key].loc[:,'groupname'] \
                .apply(lambda name: top_or_other(name, top_groups[key]))
    

# so we're creating two sets of training/testing data - 
# one that excluded Unknown groups all together, 
# and another that classifies Unknown as 'other'

training_X = {}
training_y = {}
testing_X = {}
testing_y = {}

# classifying Unknown labels as 'other'
for key in DFs:
    training_X[key], testing_X[key], training_y[key], testing_y[key] = train_test_split(DFs[key]. \
                                                                                    drop(['groupname','gname','region'], axis=1),
                                                                                    DFs[key].gname,
                                                                                    test_size=0.3, random_state=7)

# excluding Unknown labels
training_X_known = {}
training_y_known = {}
testing_X_known = {}
testing_y_known = {}
   
for key in DFs_knowngnames:
    training_X_known[key], testing_X_known[key], training_y_known[key], testing_y_known[key] = train_test_split(DFs_knowngnames[key]. \
                                                                                    drop(['groupname','gname','region'], axis=1),
                                                                                    DFs_knowngnames[key].gname,
                                                                                    test_size=0.3, random_state=7)



In [32]:
# trying out a few different classification algorithms to see which performs the best
names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "Neural Net", "AdaBoost"]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5), 
    RandomForestClassifier(n_estimators=100, max_features=20),
    MLPClassifier(alpha=1),
    AdaBoostClassifier()]


for name, clf in zip(names, classifiers):
    for key, n in zip(training_y, np.arange(len(training_y))):
        clf.fit(training_X[key], training_y[key])
        results.loc[n, 'Region'] = key
        results.loc[n, 'F1'] = round(metrics.f1_score(testing_y[key], clf.predict(testing_X[key]), average='weighted'), 4)
    classifier_scores[name] = results.copy()

# so that we don't have to keep running it 
%store classifier_scores

#%store -r classifier_scores

print("F1 SCORES WITH UNKNOWN LABELS CLASSIFIED AS 'OTHER' IN TRAINING DATA\n\n")
for alg in classifier_scores:
    print(alg, '\n', classifier_scores[alg].sort_values('F1', ascending=False), '\n')

Stored 'classifier_scores' (dict)
F1 SCORES WITH UNKNOWN LABELS CLASSIFIED AS 'OTHER' IN TRAINING DATA


Nearest Neighbors 
                          Region      F1
0    Middle East & North Africa  0.9179
6   Central America & Caribbean  0.9034
10                 Central Asia  0.8943
11          Australia & Oceania  0.8829
1                    South Asia  0.8719
3            Sub-Saharan Africa  0.8661
8                 North America  0.8552
7                Eastern Europe  0.8377
5                Western Europe   0.822
9                     East Asia  0.8077
2                Southeast Asia  0.8026
4                 South America  0.7791 

Decision Tree 
                          Region      F1
11          Australia & Oceania  0.9245
0    Middle East & North Africa  0.9225
10                 Central Asia  0.9212
6   Central America & Caribbean  0.9149
7                Eastern Europe  0.9102
3            Sub-Saharan Africa  0.8814
9                     East Asia  0.8756
1                

In [34]:
# for name, clf in zip(names, classifiers):
#     print(name)
#     for key, n in zip(training_y_known, np.arange(len(training_y_known))):
#         clf.fit(training_X_known[key], training_y_known[key])
#         results.loc[n, 'Region'] = key
#         results.loc[n, 'F1'] = round(metrics.accuracy_score(testing_y_known[key], clf.predict(testing_X_known[key])), 4)
#     print(results.sort_values('F1', ascending=False))
#     classifier_scores_lbld[name] = results.copy()

# %store classifier_scores_lbld

%store -r classifier_scores_lbld

print("F1 SCORES WITH UNKNOWN LABELS REMOVED FROM TRAINING DATA\n")
for alg in classifier_scores_lbld:
    print(alg, '\n', classifier_scores_lbld[alg].sort_values('F1', ascending=False), '\n')

F1 SCORES WITH UNKNOWN LABELS REMOVED FROM TRAINING DATA

Nearest Neighbors 
                          Region      F1
3            Sub-Saharan Africa  0.9038
6   Central America & Caribbean  0.8784
0    Middle East & North Africa  0.8644
1                    South Asia  0.8346
8                 North America  0.7997
5                Western Europe  0.7944
4                 South America  0.7434
7                Eastern Europe  0.7212
2                Southeast Asia  0.7162
9                     East Asia   0.716
11          Australia & Oceania  0.6957
10                 Central Asia  0.6538 

Decision Tree 
                          Region      F1
6   Central America & Caribbean  0.9081
3            Sub-Saharan Africa  0.8712
0    Middle East & North Africa   0.842
8                 North America  0.8055
1                    South Asia   0.785
9                     East Asia  0.7654
7                Eastern Europe  0.7631
5                Western Europe  0.7439
4                 South 

In [35]:
# EDIT: ADDED LATER
# to test out the different algorithms after reducing the dataset to only 5 features
# (['year', 'attacktype1', 'weaptype1', 'natlty1', 'targtype1'])
names = ["Nearest Neighbors", "Decision Tree", "Random Forest", "Neural Net"]

classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5), 
    RandomForestClassifier(n_estimators=100, max_features=5),
    MLPClassifier(alpha=1)]
# needed to change max_features for RF

print("F1 SCORES USING ONLY 5 FEATURES IN TRAINING DATA\n")

for name, clf in zip(names, classifiers):
    print('\n',name)
    for key, n in zip(training_y_known, np.arange(len(training_y_known))):
        clf.fit(training_X_known[key][['year', 'attacktype1', 'weaptype1', 'natlty1', 'targtype1']], training_y_known[key])
        results.loc[n, 'Region'] = key
        results.loc[n, 'F1'] = round(metrics.f1_score(testing_y_known[key], clf.predict(testing_X_known[key][['year', 'attacktype1', 'weaptype1', 'natlty1', 'targtype1']]), 
                                                      average='weighted'), 4)
    print(results.sort_values('F1', ascending=False).copy())

# random forest still performs the best

F1 SCORES USING ONLY 5 FEATURES IN TRAINING DATA


 Nearest Neighbors
                         Region      F1
3            Sub-Saharan Africa  0.9099
6   Central America & Caribbean  0.8825
0    Middle East & North Africa  0.8637
1                    South Asia  0.8239
8                 North America  0.8068
5                Western Europe  0.7786
11          Australia & Oceania  0.7693
4                 South America  0.7288
2                Southeast Asia   0.728
7                Eastern Europe  0.7089
9                     East Asia  0.6757
10                 Central Asia   0.609

 Decision Tree
                         Region      F1
6   Central America & Caribbean  0.8954
3            Sub-Saharan Africa  0.8505
0    Middle East & North Africa  0.8413
8                 North America  0.7711
1                    South Asia  0.7462
7                Eastern Europe   0.725
5                Western Europe  0.7067
4                 South America  0.6883
2                Southeast Asia  0

In [37]:
# random forest seems to give the best results, so let's explore it further!
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=78, max_features=5, min_samples_split=10)


# let's do a gridsearch to try and optimize our parameters; first make a dict of params to try
param_grid = {"max_depth": [3, None],
              "max_features": [1, 5, 20],
              "min_samples_split": [3, 10],
              "min_samples_leaf": [1, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# from sklearn docs:
def report(grid_scores, n_top=5):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print(" ")

# split up training and testing sets: (just classifying known vs unknown here for the purpose of tuning parameters)
X_train, X_test, y_train_bool, y_test_bool = train_test_split(df.drop(['groupname', 'is_unknown'], axis=1), df.is_unknown, 
                                                    test_size=0.3, random_state=7)

# these training/testing dataframes are a hodgepodge of all the regions to save time on the grid search

grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()

grid_search.fit(X_train, y_train_bool)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))

grid_scores = grid_search.grid_scores_

%store grid_scores

#%store -r grid_scores

report(grid_scores)

# 0.847 is the max. accuracy we can get with random forest to classify labeled vs unlabeled attacks

Model with rank: 1
Mean validation score: 0.847 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 2
Mean validation score: 0.846 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 3
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'max_features': 20, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 4
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
 
Model with rank: 5
Mean validation score: 0.844 (std: 0.000)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 1, 'min_

In [38]:

clf = RandomForestClassifier(n_estimators=200, max_features=20, min_samples_split=10, random_state=666)

for key, n in zip(training_y_known, np.arange(len(training_y_known))):
    clf.fit(training_X_known[key], training_y_known[key])
    #print(training_y[key].unique())
    results.loc[n, 'Region'] = key
    results.loc[n, 'F1'] = round(metrics.f1_score(testing_y_known[key], clf.predict(testing_X_known[key]),
                                                       average='weighted'), 4)


print('Random Forest F1 Scores - Unknown labels removed\n')
print(results.sort_values('F1', ascending=False))

print('\nMean F1 Score using Random Forest for all regions(exluding Central Asia): ', 
      round(results.sort_values('F1', ascending=False).F1[:-1].mean(), 4))

Random Forest F1 Scores - Unknown labels removed

                         Region      F1
3            Sub-Saharan Africa  0.9375
6   Central America & Caribbean  0.9122
0    Middle East & North Africa  0.9026
1                    South Asia  0.8706
8                 North America  0.8507
9                     East Asia  0.8237
5                Western Europe  0.8226
4                 South America  0.7881
2                Southeast Asia  0.7841
7                Eastern Europe  0.7628
11          Australia & Oceania  0.6947
10                 Central Asia  0.6703

Mean F1 Score using Random Forest for all regions(exluding Central Asia):  0.8318


In [116]:
print(len(training_X['Australia & Oceania'].columns))

20


In [43]:

    
rfclf = RandomForestClassifier(n_estimators=100, max_features=20, min_samples_split=10, random_state=666)

#print(pd.Series(training_y['Australia & Oceania']).unique())
for key, n in zip(training_y, np.arange(len(training_y))):
    rfclf.fit(training_X[key], training_y[key])
    #print(training_y[key].unique())
    results.loc[n, 'Region'] = key
    results.loc[n, 'F1'] = round(metrics.f1_score(testing_y[key], rfclf.predict(testing_X[key]), average='weighted'), 4)

print("Random Forest F1 Scores - Unknown labels as 'other'\n")

print(results.sort_values('F1', ascending=False))

print('\nMean F1 Score using Random Forest for all regions: ', 
      round(results.sort_values('F1', ascending=False).F1.mean(), 4))

Random Forest F1 Scores - Unknown labels as 'other'

                         Region      F1
0    Middle East & North Africa  0.9618
3            Sub-Saharan Africa  0.9565
6   Central America & Caribbean  0.9468
1                    South Asia  0.9392
11          Australia & Oceania  0.9245
7                Eastern Europe  0.9219
10                 Central Asia  0.9076
9                     East Asia  0.8965
2                Southeast Asia  0.8931
8                 North America  0.8915
5                Western Europe  0.8665
4                 South America  0.8518

Mean F1 Score using Random Forest for all regions:  0.9131


In [40]:
# OK now let's get the classification_report for each reg
# unknown removed
for key, n in zip(training_y_known, np.arange(len(training_y_known))):
    clf.fit(training_X_known[key], training_y_known[key])
    #print(training_y[key].unique())
    print(key, '\n')
    print(metrics.classification_report(testing_y_known[key], clf.predict(testing_X_known[key]),
                                                        labels=top_groups[key].append('Other')))
    print('-'*100)




# checking why Kanak Socialist (...) wasn't in the predictions at all

#print(pd.Series(testing_y['Australia & Oceania']).unique())
#testdf = pd.DataFrame(testing_y['Australia & Oceania'])

# ah.. because it was only the true label once
# print(len(testdf[testdf.gname=='Kanak Socialist National Liberation Front']))
# print(len(training_X['Australia & Oceania']))


Middle East & North Africa 

                                             precision    recall  f1-score   support

   Al-Qaida in the Arabian Peninsula (AQAP)       0.76      0.83      0.79       294
            Houthi extremists (Ansar Allah)       0.84      0.85      0.84       287
Islamic State of Iraq and the Levant (ISIL)       0.92      0.93      0.93      1284
             Kurdistan Workers' Party (PKK)       0.93      0.91      0.92       587
                                      Other       0.92      0.91      0.92      2763
                               Palestinians       0.80      0.80      0.80       322

                                avg / total       0.90      0.90      0.90      5537

----------------------------------------------------------------------------------------------------
South Asia 

                                                precision    recall  f1-score   support

Communist Party of India - Maoist (CPI-Maoist)       0.79      0.77      0.78       5

In [41]:
# unknown labeled as 'other'
for key, n in zip(training_y, np.arange(len(training_y))):
    clf.fit(training_X[key], training_y[key])
    #print(training_y[key].unique())
    print(key, '\n')
    print(metrics.classification_report(testing_y[key], clf.predict(testing_X[key]),
                                                        labels=top_groups[key].append('Other')))
    print('-'*100)




Middle East & North Africa 

                                             precision    recall  f1-score   support

   Al-Qaida in the Arabian Peninsula (AQAP)       0.71      0.83      0.77       298
            Houthi extremists (Ansar Allah)       0.86      0.82      0.84       279
Islamic State of Iraq and the Levant (ISIL)       0.93      0.94      0.93      1251
             Kurdistan Workers' Party (PKK)       0.93      0.94      0.93       609
                                      Other       0.98      0.98      0.98     11026
                               Palestinians       0.83      0.79      0.81       339

                                avg / total       0.96      0.96      0.96     13802

----------------------------------------------------------------------------------------------------
South Asia 

                                                precision    recall  f1-score   support

Communist Party of India - Maoist (CPI-Maoist)       0.79      0.81      0.80       5

In [132]:
print(known_df.columns)

# concatenate the lists of the top names in each region so we can run algorithm on whole dataset:
tops = []
for group in top_groups:
    for g in top_groups[group][:3]:
        tops.append(g)

len(tops)

Index(['year', 'region', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple',
       'success', 'suicide', 'attacktype1', 'attacktype2', 'attacktype3',
       'targtype1', 'natlty1', 'groupname', 'nperps', 'claimed',
       'competingclaims', 'weaptype1', 'weapsubtype1', 'nkill', 'hostages',
       'is_unknown'],
      dtype='object')


36

In [139]:
# OK, next let's do some feature selection! 
from collections import defaultdict

weights = defaultdict(list)


  
known_df.loc[:,'gname'] = known_df.loc[:,'groupname'].apply(lambda name: top_or_other(name, tops))


# splitting up the whole dataframe into training/testing sets, 
# using only the labeled points - i.e. only predict groups where we can test our accuracy
# our goal is to predict which groups are responsible for the unlabeled attacks
X_train_labld, X_test_labld, y_train_labld, y_test_labld = train_test_split(known_df.drop(['groupname', 'is_unknown',
                                                                                          'gname'],
                                                                            axis=1), known_df.gname, 
                                                                            test_size=0.3, random_state=7)

print(X_train_labld.columns)
#print(known_df.columns)
newclf = RandomForestClassifier(n_estimators=100, max_features=20, min_samples_split=10, random_state=666)

newclf.fit(X_train_labld, y_train_labld)

labld_preds = newclf.predict(X_test_labld)

print('Random Forest f1 score predicting groupname on whole dataset (only known groups) using region as a feature: ', 
      round(metrics.f1_score(y_test_labld, labld_preds, average='weighted'), 4))

print(metrics.classification_report(y_test_labld, labld_preds,  labels=tops.append('Other')))
# # let's see how significant each feature is (going to need to get rid of some)
# for featr, wght in zip(list(X_train_labld.columns), newclf.feature_importances_):
#     weights[featr].append(wght)

# for feat in weights:
#     weights[feat] = sum(weights[feat])/float(len(weights))
    
# print('\nNumber of features in full dataset: ', len(X_train_labld.columns))
# weights = pd.DataFrame(list(weights.items()), columns=['feature', 'weight'])
# print('\n',weights.sort_values('weight', ascending=False))

Index(['year', 'region', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple',
       'success', 'suicide', 'attacktype1', 'attacktype2', 'attacktype3',
       'targtype1', 'natlty1', 'nperps', 'claimed', 'competingclaims',
       'weaptype1', 'weapsubtype1', 'nkill', 'hostages'],
      dtype='object')
Random Forest f1 score predicting groupname on whole dataset (only known groups) using region as a feature:  0.893
                                                  precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.60      0.43      0.50       134
        African National Congress (South Africa)       0.94      0.87      0.91       199
                                      Al-Shabaab       0.98      0.97      0.98       797
                        Anti-Abortion extremists       0.98      0.97      0.98        67
                           Azerbaijan Guerrillas       0.00      0.00      0.00         4
             Basque Fatherland and Freedom

In [140]:
# need to trim down features, obviously
# so let's use SelectFromModel to identify which features are most important
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(newclf)

sfm.fit(X_train_labld, y_train_labld)

trimmedcols = list(X_train_labld.columns[sfm.get_support(indices=True)])

trimmed_X = pd.DataFrame(sfm.transform(X_train_labld), columns=trimmedcols)
trimmed_test_X = pd.DataFrame(sfm.transform(X_test_labld), columns=trimmedcols)

# now making a new clf to fit smaller dataframes
rf_clf = RandomForestClassifier(n_estimators=100, bootstrap=True, min_samples_leaf=3, min_samples_split=3,
                          random_state=10, n_jobs=-1)

rf_clf.fit(trimmed_X, y_train_labld)

print('Selected features for trimmed dataset: ', trimmedcols)

for featr in zip(list(trimmed_X.columns), rf_clf.feature_importances_):
    print(featr)
    
# nationality of victims has importance of almost 50% ! 

Selected features for trimmed dataset:  ['year', 'region', 'natlty1']
('year', 0.17177668338135857)
('region', 0.24798534307027884)
('natlty1', 0.58023797354836237)


In [148]:
# let's see how the model performs on the trimmed data
newpreds = rf_clf.predict(trimmed_test_X)

print('Random Forest on whole dataset with trimmed features: \n', metrics.classification_report(y_test_labld, newpreds, 
                                                                                   labels=tops.append('Other')))

# f1 only decreased by ~4%, but using 3 features instead of 22! 

Random Forest on whole dataset with trimmed features: 
                                                   precision    recall  f1-score   support

                          Abu Sayyaf Group (ASG)       0.42      0.07      0.13       134
        African National Congress (South Africa)       0.94      0.89      0.92       199
                                      Al-Shabaab       0.92      0.97      0.94       797
                        Anti-Abortion extremists       0.62      0.12      0.20        67
                           Azerbaijan Guerrillas       0.00      0.00      0.00         4
             Basque Fatherland and Freedom (ETA)       0.82      0.89      0.85       656
                                      Boko Haram       0.87      0.91      0.89       598
           Bougainville Revolutionary Army (BRA)       0.80      1.00      0.89         8
                                  Chechen Rebels       0.96      0.94      0.95        90
                 Chukakuha (Middle Core Fac

In [42]:
from sklearn.feature_selection import SelectFromModel

# now let's select important features for each region
clf_regions = RandomForestClassifier(n_estimators=200, random_state=15)

newsfm = SelectFromModel(clf_regions)

results_byregion = pd.DataFrame(columns=['Region', 'F1 Score', 'Num Features'])

for key, n in zip(training_y_known, np.arange(len(training_y_known))):
    newsfm.fit(training_X_known[key], training_y_known[key])
    cols = list(training_X_known[key].columns[newsfm.get_support(indices=True)])
    clf = RandomForestClassifier(n_estimators=50, random_state=15, n_jobs=-1)
    Xtrain = pd.DataFrame(newsfm.transform(training_X_known[key]), columns=cols)
    Xtest = pd.DataFrame(newsfm.transform(testing_X_known[key]), columns=cols)
    clf.fit(Xtrain, training_y_known[key])
    predictions = clf.predict(Xtest)
    results_byregion.loc[n, 'Region'] = key
    results_byregion.loc[n, 'F1 Score'] = round(metrics.f1_score(testing_y_known[key], predictions, 
                                                                 average='weighted'), 4)
    results_byregion.loc[n, 'Num Features'] = int(len(cols))
    #print('-'*100,'\n',key)
    #print(metrics.classification_report(testing_y_known[key], predictions, labels=top_groups[key]))

print('-'*100,'\n','Summary of all regions')
print(results_byregion.sort_values('F1 Score', ascending=False))

---------------------------------------------------------------------------------------------------- 
 Summary of all regions
                         Region F1 Score Num Features
3            Sub-Saharan Africa   0.9239            5
6   Central America & Caribbean    0.897            4
0    Middle East & North Africa   0.8858            5
8                 North America   0.8482            5
1                    South Asia   0.8474            5
9                     East Asia   0.8111            8
5                Western Europe   0.7983            4
7                Eastern Europe   0.7519            5
2                Southeast Asia   0.7469            6
4                 South America   0.7425            4
11          Australia & Oceania     0.67            8
10                 Central Asia   0.6654            7


In [29]:
# how could we improve this further? what are the limitations?

# why is the accuracy score in some regions so low? for central/east Asia and Australia, much less data 
# and a very high % of unknown attacks

# the goal is to predict 'Unknown' attacks, but can't check accuracy on the unlabeled attacks 