In [66]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import collections
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from fastai.structured import *
from fastai.column_data import *
import torch.nn as nn
import torch
import torch.nn.functional as F

In [67]:
# Create dataframe from csv file
PATH = 'data/'
df = pd.read_csv(PATH+'gtd.csv', encoding = "ISO-8859-1",low_memory=False)
df.head()

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


In [68]:
# GPU is available and enabled
torch.cuda.is_available()
torch.backends.cudnn.enabled

True

In [69]:
# Edited list of columns to process from manual inspection of GTD and its documentation.
# Original was performed on a pre-processed version so there is an extra step here.
proc_col = ['eventid','iyear','imonth','iday','approxdate','extended','resolution',
    'country','region','provstate','city','latitude','longitude',
        'specificity','vicinity','location','summary','crit1','crit2','crit3','doubtterr',
            'alternative','alternative_txt','multiple','success','suicide','attacktype1',
            'attacktype2','attacktype3','targtype1','targsubtype1','corp1','target1','natlty1',
            'targtype2','targsubtype2','corp2','target2','natlty2','targtype3','targsubtype3',
            'corp3','target3','natlty3','gname','motive','guncertain1','guncertain2','guncertain3',
            'individual','nperps','nperpcap','claimed','claimmode','claim2','claimmode2','claim3',
            'claimmode3','compclaim','weaptype1','weapsubtype1','weaptype2','weapsubtype2','weaptype3',
            'weapsubtype3','weaptype4','weapsubtype4','weapdetail','nkill','nkillus','nkillter',
            'nwound','nwoundus','nwoundte','property','propextent','propvalue','propcomment',
            'ishostkid','nhostkid','nhostkidus','nhours','ndays','divert','kidhijcountry','ransom',
            'ransomamt','ransomamtus','ransompaid','ransompaidus','ransomnote','hostkidoutcome',
            'nreleased','INT_LOG','INT_IDEO','INT_MISC','INT_ANY', 'related']

In [70]:
# remove columns
for c in df.columns:
    if c not in proc_col:
        df.drop(c, axis=1,inplace=True)

In [71]:
# further column removals from df inspection
df.drop(columns=['eventid','related','nreleased','summary','alternative', 'alternative_txt',
                'guncertain1', 'guncertain2', 'guncertain3', 'approxdate', 'resolution',
                 'propcomment', 'weapdetail'],inplace=True);

In [72]:
# fill nan values for all columns based on GTD documentation
cat0 = ['approxdate', 'resolution','attacktype2', 'attacktype3','targsubtype1',
       'natlty1','targtype2', 'targsubtype2','natlty2', 'targtype3', 'targsubtype3','natlty3',
       'claimed', 'claimmode', 'claim2', 'claimmode2', 'claim3', 'claimmode3','compclaim', 
       'weapsubtype1', 'weaptype2', 'weapsubtype2', 'weaptype3', 'weapsubtype3', 'weaptype4',
        'weapsubtype4','propextent','ishostkid', 'nhostkid', 'nhostkidus', 'nhours', 'ndays',
       'ransom','hostkidoutcome']
cattext = ['provstate','city','location','corp1','target1','corp2','target2','corp3', 'target3',
          'motive','weapdetail','propcomment','divert', 'kidhijcountry', 'ransomnote']
cont_avg =['latitude', 'longitude','specificity']
cont0 = ['nperpcap','nperps','nkill', 'nkillus', 'nkillter', 'nwound', 'nwoundus', 'nwoundte',
        'propvalue','ransomamt', 'ransomamtus', 'ransompaid', 'ransompaidus']

zero_na_var = df.columns[~df.isnull().any()].tolist()
all_var = cat0+cattext+cont_avg+cont0+zero_na_var

In [73]:
# Sanity check for stray columns
for c in df.columns:
    if c not in all_var: print(c)

In [74]:
# Check manual column lists against df columns
cat0 = [x for x in cat0 if x in df.columns]
cattext = [x for x in cattext if x in df.columns]
cont_avg = [x for x in cont_avg if x in df.columns]
cont0 = [x for x in cont0 if x in df.columns]

# Fill na values
for c in cat0:
    df[c]=df[c].fillna(0)
for c in cattext:
    df[c]=df[c].fillna('Unknown')
for c in cont_avg:
    mean = df[c].mean()
    df[c]=df[c].fillna(mean)
for c in cont0:
    df[c]=df[c].fillna(0)

In [75]:
# Lists categorizing all columns to categorical or continuous
cat_var = zero_na_var+cattext+cat0
cat_var = [c for c in df.columns if c in cat_var]
cont_var = cont0+cont_avg
cont_var = [c for c in df.columns if c in cont_var]

In [76]:
# Cast respective columns to proper type
for v in cat_var: 
    df[v]= df[v].astype('category').cat.as_ordered()
for v in cont_var:
    df[v]=df[v].astype(np.float32)

In [77]:
# Encode target variable and remove from df
df['gname'] = LabelEncoder().fit_transform(df['gname'])
df['gname'] = np.asarray(df['gname'])
cat_var.remove('gname')
# Create dictionary of number of categories for each column
df_cats_dict = { i: len(df[i].cat.categories) for i in cat_var}

In [78]:
df_cats_dict

3310    78306
3050     6575
2863     4551
1481     4287
1087     3351
177      2683
1432     2669
2631     2481
2192     2414
1742     2152
678      2077
595      2024
859      1766
1804     1606
2131     1483
1902     1424
3077     1252
2313     1124
168       975
2202      895
1348      893
1897      830
2874      714
910       639
164       636
1011      614
85        607
2825      571
2071      561
3183      557
        ...  
58          1
2872        1
825         1
3000        1
953         1
1390        1
1081        1
3256        1
1209        1
3384        1
3311        1
1593        1
1721        1
1849        1
1134        1
59          1
2363        1
570         1
1006        1
698         1
878         1
954         1
1082        1
1210        1
2799        1
2671        1
3387        1
1466        1
1850        1
945         1
Name: gname, Length: 3454, dtype: int64

In [79]:
# Determine number of classes for target variable
num_classes = max(df['gname'])+1
# Create train, val, test df splits
df_trn, df_val, df_test = np.split(df.sample(frac=1), [int(.64*len(df)), int(.82*len(df))])
# Reorder indices
df_trn.index = range(len(df_trn.index))
df_val.index = range(len(df_val.index))
df_test.index = range(len(df_test.index))
num_classes

3454

In [82]:
# Balance data by reducing examples wih target 3310, ie, 'Unknown'
df_trn.drop(df_trn.query('gname == 3310').sample(frac=.9).index)

Unnamed: 0,iyear,imonth,iday,extended,country,region,provstate,city,latitude,longitude,...,ransomamt,ransomamtus,ransompaid,ransompaidus,ransomnote,hostkidoutcome,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
2,2014,10,7,0,97,10,Golan Heights,Nahal Sion,33.276978,35.686985,...,0.0,0.0,0.0,0.0,Unknown,0.0,1,1,0,1
4,2012,3,29,0,153,6,Balochistan,Quetta,30.209305,67.018135,...,0.0,0.0,0.0,0.0,Unknown,0.0,0,0,0,0
6,1977,4,29,0,185,8,Gipuzcoa,Tolosa,43.136978,-2.073914,...,0.0,0.0,0.0,0.0,Unknown,0.0,0,1,0,1
7,1984,12,17,0,159,3,Lima,Lima,-12.046378,-77.042793,...,0.0,0.0,0.0,0.0,Unknown,0.0,0,0,0,0
9,2013,9,13,1,92,6,Manipur,Ukhrul,25.095387,94.361649,...,0.0,0.0,0.0,0.0,Unknown,4.0,0,0,0,0
11,2015,12,16,0,97,10,Southern,Ein HaShlosha,31.351725,34.403473,...,0.0,0.0,0.0,0.0,Unknown,0.0,1,0,0,1
13,1985,7,12,0,61,2,Unknown,Tecoluca,13.536835,-88.780586,...,0.0,0.0,0.0,0.0,Unknown,0.0,0,0,0,0
14,2015,1,7,0,95,10,Nineveh,Zummar,36.763592,42.602333,...,0.0,0.0,0.0,0.0,Unknown,0.0,0,1,0,1
15,2009,1,7,1,123,11,Kidal,Unknown,18.449417,1.409841,...,0.0,0.0,0.0,0.0,Unknown,2.0,1,1,0,1
19,1992,8,15,0,185,8,Basque Country,Donostia-San Sebastian,43.320812,-1.984447,...,0.0,0.0,0.0,0.0,Unknown,0.0,0,1,0,1


In [83]:
# df and target variables. Normalized according to training df
df_trn, y_trn, nas, mapper = proc_df(df_trn, 'gname', do_scale= True)
df_val, y_val, nas, mapper = proc_df(df_val, 'gname', do_scale=True,mapper=mapper, na_dict =nas)
df_test, y_test, nas, mapper = proc_df(df_test, 'gname',do_scale=True, mapper=mapper, na_dict= nas)

In [84]:
# Embedding sizes list
sizes = [df_cats_dict[i]+1 for i in cat_var]
emb_szs = [(x, min((x+1)//2, 30)) for x in sizes]

In [85]:
# Custom Pytorch Dataset

class CDataset(Dataset):
    def __init__(self, cats, conts, y= None):
        n = len(cats[0]) if cats else len(conts[0])
        self.cats = np.stack(cats, 1).astype(np.int64) if cats else np.zeros((n,1))
        self.conts = np.stack(conts, 1).astype(np.float32) if conts else np.zeros((n,1))
        self.y = np.zeros((n,1)) if y is None else y
        
    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]

    @classmethod
    def from_data_frames(cls, df_cat, df_cont, y=None):
        cat_cols = [c.values for n,c in df_cat.items()]
        cont_cols = [c.values for n,c in df_cont.items()]
        return cls(cat_cols, cont_cols, y)

    @classmethod
    def from_data_frame(cls, df, cat_flds, y=None):
        return cls.from_data_frames(df[cat_flds], df.drop(cat_flds, axis=1), y)

# Custom Pytorch DataLoader
class CModelData(ModelData):
    def __init__(self, path, trn_ds, val_ds, bs, test_ds=None, shuffle=True):
        test_dl = DataLoader(test_ds, bs, shuffle=False, num_workers=1) if test_ds is not None else None
        super().__init__(path, DataLoader(trn_ds, bs, shuffle=shuffle, num_workers=1),
            DataLoader(val_ds, bs, shuffle=False, num_workers=1), test_dl)

    @classmethod
    def from_data_frames(cls, path, trn_df, val_df, trn_y, val_y, cat_flds, bs, test_df=None, test_y=None):
        test_ds = CDataset.from_data_frame(test_df, cat_flds, test_y) if test_df is not None else None
        return cls(path, CDataset.from_data_frame(trn_df, cat_flds, trn_y),
                    CDataset.from_data_frame(val_df, cat_flds, val_y), bs, test_ds=test_ds)

    @classmethod
    def from_data_frame(cls, path, val_idxs, df, y, cat_flds, bs, test_df=None):
        ((val_df, trn_df), (val_y, trn_y)) = split_by_idx(val_idxs, df, y)
        return cls.from_data_frames(path, trn_df, val_df, trn_y, val_y, cat_flds, bs, test_df=test_df)


In [86]:
# Example dataset and visualization
mydset = CDataset.from_data_frame(df_test,cat_var,y=y_test)
mydset[10]

[array([   46,     8,    14,     1,    81,    10,   616,  3538,     2, 39375,     2,     2,     2,     2,
            1,     2,     1,     1,     1,     1,    14,    77, 20280,  8525,    81,     1,     1,  2361,
         4519,     1,     1,     1,   356,   624,     1, 12831,     1,     2,     1,     2,     1,     1,
            1,     2,     4,     6,     1,     1,     1,     1,     1,     1,     2,     1,     2,     2,
            2,     3,     2,   133,   200,     2,   340,     1,     1,     1,     2,     1]),
 array([ 0.56112,  0.31399, -0.44996, -0.35137,  0.09006, -0.11224, -0.00638, -0.08879, -0.0743 , -0.02323,
        -0.05216, -0.00568, -0.00842, -0.00655, -0.00975, -0.00455], dtype=float32),
 3310]

In [87]:
# Create modeldata 
md = CModelData.from_data_frames(PATH, df_trn, df_val, y_trn, y_val,cat_var, 256, test_df=df_test,
                                 test_y = y_test)


In [88]:
# embedding initializer and custom Pytorch Neural net model
def emb_init(x):
    x = x.weight.data
    sc = 2/(x.size(1)+1)
    x.uniform_(-sc,sc)
    
class CatContInputModel(nn.Module):
    def __init__(self, emb_szs, n_cont, emb_drop, out_sz, szs, drops,
                 use_bn=False):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c,s in emb_szs])
        for emb in self.embs: emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont=n_emb, n_cont
        
        szs = [n_emb+n_cont] + szs
        self.lins = nn.ModuleList([
            nn.Linear(szs[i], szs[i+1]) for i in range(len(szs)-1)])
        self.bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in szs[1:]])
        for o in self.lins: kaiming_normal(o.weight.data)
        self.outp = nn.Linear(szs[-1], out_sz)
        kaiming_normal(self.outp.weight.data)

        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop) for drop in drops])
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn = use_bn

    def forward(self, x_cat, x_cont):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        for l,d,b in zip(self.lins, self.drops, self.bns):
            x = F.relu(l(x))
            if self.use_bn: x = b(x)
            x = d(x)
        x = self.outp(x)
        # softmax output with NLLLoss instead of CrossEntropy
        # x = F.log_softmax(x, dim=-1)
        return x

In [89]:
model = CatContInputModel(emb_szs, len(cont_var), 0.04, num_classes, [200, 120], [0.4,0.4],use_bn=True)


In [90]:
# neural net training and validation function for GPU only
def embedding_train_val(model, md, optimizer, criterion, epochs):  
    
    model.cuda()
    for epoch in range(epochs):
        total_trn_loss = 0.0
        total_val_loss = 0.0
        
        model.train()
        for data in iter(md.trn_dl):
            
            # get inputs
            x_cats, x_conts, y = data

            # wrap with variable
            x_cats, x_conts, y = Variable(x_cats.cuda()), Variable(x_conts.cuda()), Variable(y.cuda())

            # zero the parameter gradients
            optimizer.zero_grad()
            
            # forward + backward + optimize
            outputs = model(x_cats, x_conts)
            if torch.sum( (y.data >= num_classes).long() + (y.data < 0).long()) > 0:
                import pdb; pdb.set_trace()
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            total_trn_loss += loss.data[0]
        print('Epoch: {a:d}, Training Loss: {b:.4f}'.format(a=epoch, b = total_trn_loss))
        
        model.eval()
        for data in iter(md.val_dl):
            
            # get inputs
            x_cats, x_conts, y = data

            # wrap with variable
            x_cats, x_conts,y = Variable(x_cats.cuda()), Variable(x_conts.cuda()), Variable(y.cuda())
 
            outputs = model(x_cats, x_conts)
            if torch.sum( (y.data >= num_classes).long() + (y.data < 0).long()) > 0:
                import pdb; pdb.set_trace()
            loss = criterion(outputs, y)
            
            total_val_loss += loss.data[0]
        
        print('Epoch: {a:d}, Validation Loss: {b:.4f}'.format(a=epoch, b = total_val_loss))

In [91]:
opt = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-4)
# opt = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9, dampening=0, weight_decay=1e-4, nesterov=False)
epochs = 25
crit = F.cross_entropy
embedding_train_val(model, md, opt, crit, epochs)

Epoch: 0, Training Loss: 1118.0893
Epoch: 0, Validation Loss: 236.7422
Epoch: 1, Training Loss: 502.1240
Epoch: 1, Validation Loss: 1619.9025
Epoch: 2, Training Loss: 370.2684
Epoch: 2, Validation Loss: 528.7064
Epoch: 3, Training Loss: 285.5934
Epoch: 3, Validation Loss: 243.1405
Epoch: 4, Training Loss: 226.6992
Epoch: 4, Validation Loss: 190.5265
Epoch: 5, Training Loss: 184.9065
Epoch: 5, Validation Loss: 277.7924
Epoch: 6, Training Loss: 153.9602
Epoch: 6, Validation Loss: 144.7371
Epoch: 7, Training Loss: 133.8155
Epoch: 7, Validation Loss: 170.9833
Epoch: 8, Training Loss: 115.3995
Epoch: 8, Validation Loss: 825.3287
Epoch: 9, Training Loss: 103.7085
Epoch: 9, Validation Loss: 115.9625
Epoch: 10, Training Loss: 95.3253
Epoch: 10, Validation Loss: 139.4821
Epoch: 11, Training Loss: 86.5102
Epoch: 11, Validation Loss: 191.2791
Epoch: 12, Training Loss: 80.2917
Epoch: 12, Validation Loss: 900.5632
Epoch: 13, Training Loss: 74.2525
Epoch: 13, Validation Loss: 465.8604
Epoch: 14, Tra

In [94]:
correct = 0
total = 0.0

for data in iter(md.test_dl):
    
    # get inputs
    x_cats, x_conts, y = data
    
    # wrap with variable
    x_cats, x_conts = Variable(x_cats), Variable(x_conts)
 
    outputs = model(x_cats, x_conts)
    _, predicted = torch.max(outputs.data, 1)
    
    # predicted = predicted#.float()
    
    total += y.size(0)
    correct += (predicted == y).sum()
    
print('Accuracy of the network on the test set: '+'{:.4f}'.format(100*correct/total)+'%')

Accuracy of the network on the test set: 86.1233%
