In [2]:
import pandas as pd
from pathlib import Path
import scipy.stats as stat
import numpy as np
from statistics import median
import matplotlib.pyplot as plt
import random
%matplotlib notebook
plt.style.use('ggplot')

In [3]:
path_to_data = "./data/EDTG_Data.xls"

In [4]:
df = pd.read_excel(path_to_data)  # load up the data
df.head() # and let's just take a look at it

Unnamed: 0,gname,gid,year,end,duration,base,num_base,mul_bases,EAP,ECA,...,num_enemies,fate_leader,total_deaths,total_injuries,terr_deaths,terr_injuries,nonterr_deaths,nonterr_injuries,terr_casualties,nonterr_casualties
0,1920 Revolution Brigades,1,2003,0,1,Iraq,1,0,0,0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1920 Revolution Brigades,1,2004,0,2,Iraq,1,0,0,0,...,,0.0,15.0,19.0,1.0,0.0,14.0,19.0,1.0,33.0
2,1920 Revolution Brigades,1,2005,0,3,Iraq,1,0,0,0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1920 Revolution Brigades,1,2006,0,4,Iraq,1,0,0,0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1920 Revolution Brigades,1,2007,1,5,Iraq,1,0,0,0,...,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
right_wingers = df[
    df["right"] == 1
]

left_wingers=df[
    df['left'] == 1
]

right_wingers.head() #Al-Arifeen, huh! They're pretty devoutly religious.
left_wingers.head()

Unnamed: 0,gname,gid,year,end,duration,base,num_base,mul_bases,EAP,ECA,...,num_enemies,fate_leader,total_deaths,total_injuries,terr_deaths,terr_injuries,nonterr_deaths,nonterr_injuries,terr_casualties,nonterr_casualties
5,23rd of September Communist League,2,1973,0,1,Mexico,1,0,0,0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,23rd of September Communist League,2,1974,0,2,Mexico,1,0,0,0,...,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,23rd of September Communist League,2,1975,0,3,Mexico,1,0,0,0,...,,1.0,32.0,4.0,0.0,0.0,32.0,4.0,0.0,36.0
8,23rd of September Communist League,2,1976,0,4,Mexico,1,0,0,0,...,,0.0,24.0,5.0,0.0,0.0,24.0,5.0,0.0,29.0
9,23rd of September Communist League,2,1977,0,5,Mexico,1,0,0,0,...,,2.0,7.0,0.0,0.0,0.0,7.0,0.0,0.0,7.0


In [6]:
# I want to know whether, of right-wing groups, the religiously motivated ones are significantly different 
# in terms of longevity, lethality, or membership, than non-religious ones.

In [7]:
# first lets figure out some basic things about the population that we're examiniung
pop_longevities = {}
unique_groups = df['gname'].unique()
for group in unique_groups:
        entries = df[
            df['gname'] == group
        ]['duration']
        
        pop_longevities[group] = entries.max()

p_std_dev = stat.tstd(list(pop_longevities.values()))
print(p_std_dev) # this aligns with findings that many terrorist groups are 'one-hit wonders'

12.270618829998693


In [8]:
# I want to begin by figuring out their respective average longevities, but first I'm going to clean the data
# into smaller sub-tables so that I don't have to worry about the duplicates
uq_rw = right_wingers['gname'].unique()
r_longevities = {}
for group in uq_rw:
    group_entries = right_wingers[
       right_wingers['gname'] == group
    ]['duration']
    
    r_longevities[group] = group_entries.max()
    
r_std_dev = stat.tstd(list(r_longevities.values()))  

uq_lw = left_wingers['gname'].unique()
l_longevities = {}
for group in uq_lw:
    group_entries = left_wingers[
        left_wingers['gname'] == group
    ]['duration']
    
    l_longevities[group] = group_entries.max()
l_std_dev = stat.tstd(list(l_longevities.values()))

print("right-wing stdev:", r_std_dev)
print("left-wing stdev:", l_std_dev)

right-wing stdev: 9.34025353694715
left-wing stdev: 11.592012533686798


In [9]:
# it makes sense that the standard deviation would be smaller for samples than for the population
# now i want to know the population and sample means

In [10]:
pop_mean = np.mean(list(pop_longevities.values()))
l_mean = np.mean(list(l_longevities.values()))
r_mean = np.mean(list(r_longevities.values()))

print("pop_mean", pop_mean, "left_mean", l_mean, "right_mean", r_mean)

pop_mean 12.217105263157896 left_mean 11.546875 right_mean 7.628571428571429


In [11]:
# initial impression: both left-wing groups and right-wing groups are less long-lived than groups in the general
# population, but the right-wing groups are significantly less durable than both
# the data won't follow a normal distribution by the previous observation that some terrorist groups last for a very
# long time, while the majority are a flash in the pan

In [12]:
pop_mode = stat.mode(list(pop_longevities.values()))[0]
pop_median = median(list(pop_longevities.values()))
print("pop_median", pop_median, "pop_mode", pop_mode)

pop_median 8.0 pop_mode [1]


In [13]:
x = []
y = []
sorted_longevities = list(pop_longevities.items())
sorted_longevities.sort(key=lambda a: a[1])
for g, l in pop_longevities.items():
    x.append(g)
    y.append(l)

print("data set contains", len(x), "groups")
plt.bar(x, y, color='green')
plt.xlabel('group name')
plt.ylabel('group longevity')
plt.show()

data set contains 760 groups


<IPython.core.display.Javascript object>

In [14]:
distributions = [
    'norm',
    'expon',
    'logistic',
    'gumbel',
    'gumbel_l',
    'gumbel_r',
    'extreme1'
]

fits = []

for d in distributions:
    statistic, crits, sigs = stat.anderson([n for g,n in sorted_longevities], d)
    fits.append([d, statistic, crits, sigs])
for fit in fits:
    print(fit)

['norm', 43.792171731285634, array([0.573, 0.653, 0.783, 0.913, 1.086]), array([15. , 10. ,  5. ,  2.5,  1. ])]
['expon', 7.116966439458679, array([0.921, 1.077, 1.34 , 1.605, 1.955]), array([15. , 10. ,  5. ,  2.5,  1. ])]
['logistic', 33.88881211239004, array([0.426, 0.563, 0.66 , 0.769, 0.906, 1.01 ]), array([25. , 10. ,  5. ,  2.5,  1. ,  0.5])]
['gumbel', 65.70649681036707, array([0.471, 0.632, 0.752, 0.871, 1.031]), array([25. , 10. ,  5. ,  2.5,  1. ])]
['gumbel_l', 65.70649681036707, array([0.471, 0.632, 0.752, 0.871, 1.031]), array([25. , 10. ,  5. ,  2.5,  1. ])]
['gumbel_r', 25.279798661224163, array([0.471, 0.632, 0.752, 0.871, 1.031]), array([25. , 10. ,  5. ,  2.5,  1. ])]
['extreme1', 65.70649681036707, array([0.471, 0.632, 0.752, 0.871, 1.031]), array([25. , 10. ,  5. ,  2.5,  1. ])]


In [15]:
ks_fits = []
for d in distributions[:3]:
    ks_fits.append((d, *stat.kstest([n for g,n in sorted_longevities], d, N=760)))
for fit in ks_fits:
    print(fit)

('norm', 0.8413447460685429, 0.0)
('expon', 0.725191032552861, 0.0)
('logistic', 0.741323393767356, 0.0)


In [16]:
# so unfortunately, none of these metrics will be good for our dataset
# let's see if left and right at least approximate being drawn from a similar distribution, 
# even if we don't have a nice, clean model for what that distribution will look like

In [17]:
cross_fit = stat.ks_2samp(
    list(l_longevities.values()),
    list(r_longevities.values())
)
print(cross_fit)

KstestResult(statistic=0.22723214285714285, pvalue=0.07925560208366093)


In [18]:
# we'll try wilcoxon just to be safe
# I also want to know how large each of these samples is

print(len(list(l_longevities.values())), "groups in the left-wing sample")
print(len(list(r_longevities.values())), "groups in the right-wing sample")

# now that's interesting! There are way more left-wing groups overall in the dataset

l_sample = random.sample(list(l_longevities.values()), 35)

cross_fit = stat.wilcoxon(
    l_sample,
    list(r_longevities.values())
)

print(cross_fit)


192 groups in the left-wing sample
35 groups in the right-wing sample
WilcoxonResult(statistic=189.5, pvalue=0.16328522573700166)


In [43]:
pop_num_bases = {}

for g in unique_groups:
    num_bases = df[
        df['gname'] == g
    ]['num_base']
    
    pop_num_bases[g] = num_bases.max()

correlation = stat.spearmanr(
    list(pop_num_bases.values()),
    list(pop_longevities.values())
)

print(correlation)

SpearmanrResult(correlation=0.19851457404079878, pvalue=3.415637190656142e-08)


In [44]:
# I want to take a small detour to figure out if there exists a correlation between number of home_bases for
# terrorist groups and their longevity 
# What we find is that there is an extremely significant correlation between a terrorist group's longevity and the
# numberof countries in which they can bases their operations