In [42]:
#CHALLENGE: GLOBAL TERORISM ATTACK. Data provided from the website: https://www.start.umd.edu/gtd/
#NOTE: the downloaded data should be saved with the extension .csv


#==============================================================================
# Steps to perform the task:
#     1.Data gathering: importing the data
#     2.Data pre-processing: perform data cleaning (e.g. incomplete data),attribute selection, etc
#     3.Data mining: split the data into training and test data, apply data mining techniques on the test data
#     4.Perform evaluation of the data mining techniques (e.g. accuracy of the data technique)
#==============================================================================

In [43]:
#Import packages
import numpy as np
import pandas as pd

In [44]:
#Import file
data_terr_attack = pd.read_csv("C:/Users/Milos Milojevic/Documents/Python Scripts/gtd_1970_1994.csv",encoding='ISO-8859-1',low_memory=False)

In [45]:
# Remove rows of the dataframe for which class attribute (gname) is not defined
data_terr_attack_90 = data_terr_attack[data_terr_attack.gname != "Unknown"]

In [46]:
#NOTE: Reduce the amount of data stored in a dataframe since there is no enough RAM memory to process all data on the used laptop  
data_terr_attack_90 = data_terr_attack_90[data_terr_attack_90.iyear >= 1991]

In [47]:
# Limit long strings
def shortening(df):
    
    df.loc[df['weaptype1_txt'] == 'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)', 'weaptype1_txt'] = 'Vehicle'
    df.loc[df['propextent_txt'] == 'Minor (likely < $1 million)','propextent_txt'] = 'Minor (< $1 million)'
    df.loc[df['propextent_txt'] == 'Major (likely > $1 million but < $1 billion)','propextent_txt'] = 'Major (< $1 billion)'
    df.loc[df['propextent_txt'] == 'Catastrophic (likely > $1 billion)', 'propextent_txt'] = 'Catastrophic (> $1 billion)'

    return df
data_terr_attack_90 = shortening(data_terr_attack_90)


In [48]:
#Values -9 are unknowns replaced with nan
def nines(df):
    for i in ["ishostkid", "property","INT_LOG","INT_IDEO","INT_MISC","INT_ANY","claimed"]:
         df.loc[df[i] == -9,i]=df.loc[df[i] == -9,i].replace(-9,np.nan)
    return df  
data_terr_attack_90 = nines(data_terr_attack_90)

In [49]:
#Values -99 are unknowns replaced with nan
def ninty_nines(df):
    for i in ["nhostkid","nhostkidus","nhours","nperpcap","nperps"]: 
        df.loc[df[i] == -99,i]=df.loc[df[i] == -99,i].replace(-99,np.nan)
    return df  
data_terr_attack_90 = ninty_nines(data_terr_attack_90)

In [50]:
#Checking the number of nan values per column, in order to select appropriate atributes
feature_null_counts = data_terr_attack_90.apply(lambda col: col.isnull().sum(), axis = 0)
print (feature_null_counts)

eventid                  0
iyear                    0
imonth                   0
iday                     0
approxdate            8539
extended                 0
resolution            8391
country                  0
country_txt              0
region                   0
region_txt               0
provstate             1621
city                     0
latitude               538
longitude              538
specificity              0
vicinity                 0
location              8492
summary               8453
crit1                    0
crit2                    0
crit3                    0
doubtterr                0
alternative           7029
alternative_txt       7029
multiple                 0
success                  0
suicide                  0
attacktype1              0
attacktype1_txt          0
                      ... 
propextent            7663
propextent_txt        7663
propvalue             7682
propcomment           8437
ishostkid                1
nhostkid              8195
n

In [52]:
#Since the observed dataframe is composed of text attributes and numeric attributes, the pre-processing step for these attributes is diferent. 
#Thus, attributes are first separated into text attributes and the numeric attributes. By observing the values of 'feature_null_counts', attrbibutes
#,both numeric and text attributes, are selected only in case there is a significant number of non nan values. Additionally some of the attributes like for example
#'targettyp1' and 'targettype1_txt' are saying the same thing in different form, thus only one of them is choosen 
att_num = ['iyear','imonth','iday','extended','country','region','specificity','vicinity','crit1','crit2','crit3','doubtterr','alternative','multiple','success','suicide','attacktype1','targtype1','targsubtype1','natlty1',
            'guncertain1','individual','nperps','weaptype1','weapsubtype1','nkill','nkillus','nkillter','nwound','nwoundus','nwoundte','property','propextent','propvalue','ishostkid','ransom','INT_LOG','INT_IDEO','INT_MISC','INT_ANY']

att_text = ['provstate','city','location','corp1','target1','dbsource','weapdetail']

In [53]:
#Define a class (gname)
class_gname = data_terr_attack_90.gname

In [54]:
#Define observed dataframe
data_terr_attack_90 = data_terr_attack_90[att_num+att_text]

In [55]:
#Import packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.cross_validation import train_test_split

In [56]:
#Use 'train_test_split' to split the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(data_terr_attack_90, class_gname, test_size = 0.33, random_state = 1)

In [57]:
#Create an empty list in which different pipelines will be stored
pipelines = []

In [58]:
#Define a function that processes the numeric attributes
def process_att_num(df):
    df = df[att_num]
    
    #Select columns with numeric attributes
    numeric_col = ['iyear','imonth','iday','nperps','nkill','nkillus','nkillter','nwound','nwoundus','nwoundte','propvalue']
   
    #Replace nulls with medains
    for i in numeric_col:
        df.ix[df[i].isnull(), i] = df[i].median()
   
    #Seclet binary columns
    binary_col = ['extended','country','region','specificity','vicinity','crit1','crit2','crit3','doubtterr','alternative','multiple','success','suicide','attacktype1','targtype1',
                  'targsubtype1','natlty1','guncertain1','individual','weaptype1','weapsubtype1','property','propextent','ishostkid','ransom','INT_LOG','INT_IDEO','INT_MISC','INT_ANY']
     
    #Replace null values with a radnom sample
    for i in binary_col:
        df.loc[df[i].isnull(), i] = np.random.choice([0,1],size = df[i].isnull().sum())       
    return df    

In [59]:
#Define a function for text attributes that fills all nan values with '' 
def process_att_text(col):
    
    def nest_function(df):

        return df[col].fillna("")
    
    return nest_function

In [60]:
# Build a Document Term Matrix using CountVectorizer for all text attributes defined in att_text list
for i in att_text:
    #FunctionTransformer turns any function into a transformer
    pipeline = make_pipeline(FunctionTransformer(process_att_text(i),validate = False),CountVectorizer(decode_error = 'ignore'))
    #Add pipeline to pipelines list
    pipelines.append(pipeline)

#Add the numeric data into the pipeline
pipelines.append(FunctionTransformer(process_att_num, validate = False))

#Make union of the pipelines
union = make_union(*pipelines)

In [61]:
#Fit and transform the training data
X_train_new = union.fit_transform(X_train)
#Transform test data
X_test_new = union.transform(X_test)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [66]:
#==============================================================================
# #Removing features (attributes) with low variance  
#==============================================================================
#VarianceThreshold removes all features whose variance doesn’t meet a predefined threshold
from sklearn.feature_selection import VarianceThreshold
# Set threshold to 0.1
sel = VarianceThreshold(threshold = 0.1)
sel.fit(X_train_new.toarray())
X_train_update=sel.transform(X_train_new)
# Subset features
X_test_update=sel.transform(X_test_new)

In [67]:
#Import packages
from sklearn.grid_search import GridSearchCV #for tuning the hyper-parameters of the estimator
from sklearn.metrics import accuracy_score #for evaluating the accuracy of the classifier
from sklearn.model_selection import cross_val_score #for evaluating a score by cross-validation
#Create an empty dictionary for storing the accuracy scores from different data mining techniques
accuracy_dict={}

In [68]:
#==============================================================================
# 1.K-nearest neighbour (KNN) Classifier 
#http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
#==============================================================================
#Import packages 
from sklearn.neighbors import KNeighborsClassifier
#Check parameters of the classifier and their default values
KNeighborsClassifier().get_params
#Perform tuning hyper-parameters of the etsimator using GridSearchCV
n_list = list(range(3, 30))
#Perform GridSearchCV for classifier
knn_gscv= GridSearchCV(KNeighborsClassifier(), param_grid = {'n_neighbors':n_list})
#Perform fit method
knn_gscv.fit(X_train_update, y_train)
#Check the best parameter
print (knn_gscv.best_params_)




{'n_neighbors': 3}


In [70]:
#Create an instance of the model
knn_model = KNeighborsClassifier(n_neighbors=3) 
#Fit the model using training data X_train_update and y_train as target values
knn_model.fit(X_train_update,y_train)
#Predict the class labels for the test data X_test_update
knn_predict = knn_model.predict(X_test_update)
#Check accuracy classifications score
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
#and store it in the accuracy dictionary accuracy_dict
accuracy_dict['knn'] = accuracy_score(y_test, knn_predict)
#Perfrom cross-validation
knn_score = cross_val_score(knn_model, X_test_update, y_test, cv = 10)



In [71]:
#==============================================================================
# 2. Naive (Gaussian) Bayesian Classifier 
#http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB
#==============================================================================
#Import packages 
from sklearn.naive_bayes import GaussianNB
#Create an instance of the model
naive_bay_model = GaussianNB()
#Fit the model using training data X_train_update and y_train as target values
naive_bay_model.fit(X_train_update.toarray(),y_train) #In order to avoid getting the "TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array." we use X_train_dtm and we write X_train_dtm.toarray()
#Predict the class labels for the test data X_test_update
naive_bay_predict = naive_bay_model.predict(X_test_update.toarray())
#Check accuracy classifications and store it in the accuracy dictionary accuracy_dict
accuracy_dict['naive_bay'] = accuracy_score(y_test, naive_bay_predict)
#Perfrom cross-validation
naive_bay_score = cross_val_score(naive_bay_model, X_test_update.toarray(), y_test, cv = 10)



In [72]:
#==============================================================================
# 3.Decision Tree Classifier 
#http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
#====================================================== ========================
#Import packages
from sklearn.tree import DecisionTreeClassifier
#Check parameters of the classifier
DecisionTreeClassifier().get_params
#Perform GridSearchCV for classifier
dec_tr_gscv = GridSearchCV(DecisionTreeClassifier(),param_grid = {'criterion': ['gini','entropy']})
#Fit the model using training data X_train_update and y_train as target values
dec_tr_gscv.fit(X_train_update, y_train)
#Check the best parameter
print (dec_tr_gscv.best_params_)



{'criterion': 'entropy'}


In [73]:
#Create an instance of the model
dec_tr_model = DecisionTreeClassifier(criterion='entropy') 
#Fit the model using training data X_train_update and y_train as target values
dec_tr_model.fit(X_train_update,y_train)
#Predict the class labels for the test data X_test_update
dec_tr_predict = dec_tr_model.predict(X_test_update)
#Check accuracy classifications and store it in the accuracy dictionary accuracy_dict
accuracy_dict['dec_tr'] = accuracy_score(y_test, dec_tr_predict)
#Perform cross-validation
dec_tr_score = cross_val_score(dec_tr_model, X_test_update, y_test, cv = 10)



In [80]:
#==============================================================================
# 4.Linear Support Vector Classification
#==============================================================================
#Import packages
from sklearn.svm import LinearSVC
#Create an instance of the model
lin_svc_model = LinearSVC()
#Fit the model using training data X_train_update and y_train as target values
lin_svc_model.fit(X_train_update,y_train)
#Predict the class labels for the test data X_test_update
lin_svc_predict = lin_svc_model.predict(X_test_update)
#Check accuracy classifications and store it in the accuracy dictionary accuracy_dict
accuracy_dict['lin_svc'] = accuracy_score(y_test, lin_svc_predict)
#Perfrom cross-validation
lin_svc_score = cross_val_score(lin_svc_model, X_test_update, y_test, cv = 10)



In [94]:
#==============================================================================
# 5.Logistic Regression Classifier    
#==============================================================================
#Import packages
from sklearn.linear_model import LogisticRegression
#Create an instance of the model
log_reg_model = LogisticRegression() 
#Fit the model using training data X_train_update and y_train as target values
log_reg_model.fit(X_train_update,y_train)
#Predict the class labels for the test data X_test_update
log_reg_predict = log_reg_model.predict(X_test_update)
#Check accuracy classifications and store it in the accuracy dictionary accuracy_dict
accuracy_dict['log_reg']= accuracy_score(y_test, log_reg_predict) 
#Perfrom cross-validation
log_reg_score = cross_val_score(log_reg_model, X_test_update, y_test, cv = 10) 



In [96]:
#==============================================================================
# Evaluate classifiers by observing their accuracy (accuracy_dict) and cross-validation (score_table)
#==============================================================================
#Create dataframe for evaluating a score by cross-validation 
score_table = pd.DataFrame(columns = ['model', 'cv_10'])
#Define a list of evaluated models
models = ['KNN Classifier', 'Naive Bayesian', 'Decision Tree','Linear SVM','Logistic Regression']
#Insert a mean value of each model
score_list = [knn_score.mean(),naive_bay_score.mean(),dec_tr_score.mean(),lin_svc_score.mean(),log_reg_score.mean()]

for model, n, score in zip(models, np.arange(len(models)), score_list):
    score_table.loc[n,'model'] = model
    score_table.loc[n,'cv_10'] = score 
    
#Print values from accuracy_dict    
for i in accuracy_dict:
    print (i, accuracy_dict[i])  
    
#Evaluating the estimator performance using cross validation
print (score_table)    
    
#By observing the accuracy score and cross-validatio score of the techniques it can be seen that technique dec_tr provides the best result  

knn 0.55835402625
naive_bay 0.107484923732
dec_tr 0.639233770841
lin_svc 0.405462930117
log_reg 0.405462930117
                 model      cv_10
0       KNN Classifier   0.535835
1       Naive Bayesian  0.0281716
2        Decision Tree   0.702991
3           Linear SVM  0.0825071
4  Logistic Regression   0.281156
