# Collecting data


In [85]:
import pandas as pd
import numpy as np
import re


# Read every HTML table on the Wikipedia page; the first row of each table is
# used as its header.
page = 'https://en.wikipedia.org/wiki/List_of_countries_by_firearm-related_death_rate'
wiki_data = pd.read_html(page, header=0)

# The table at position 2 is the one used here. The row labelled 73 is
# discarded -- presumably a non-country row; TODO confirm against the page.
gun_toters = wiki_data[2]
gun_toters.drop(73, inplace=True)


# Keep only the text in front of the first "!" of each country cell.
gun_toters['Country'] = gun_toters['Country'].map(
    lambda name: name.partition("!")[0].strip())

# 'Total' must parse cleanly; unparseable gun counts become NaN instead.
gun_toters['Deaths'] = pd.to_numeric(gun_toters['Total'])
gun_toters['Guns'] = pd.to_numeric(gun_toters['Guns per 100 hab'],
                                   errors='coerce')


# Remove the raw source columns now that the cleaned ones exist.
gun_toters.drop(
    [
        'Total',
        'Guns per 100 hab',
        'Method of Calculation',
        'Homicides',
        'Suicides',
        'Unintentional',
        'Undetermined',
        'Sources and notes',
    ],
    axis='columns',
    inplace=True,
)

# Order by death rate and discard rows with any missing value.
gun_toters.sort_values('Deaths', inplace=True)
gun_toters.dropna(inplace=True)
gun_toters.style

Unnamed: 0,Country,Deaths,Guns,Year
34,Japan,0.06,0.6,2008
60,South Korea,0.08,1.1,2011
54,Romania,0.14,0.7,2012
53,Qatar,0.15,19.2,2004
56,Singapore,0.16,0.5,2006
68,United Kingdom,0.23,6.6,2011
5,Belarus,0.23,7.3,2009
67,Ukraine,0.24,6.6,2009
51,Poland,0.26,1.3,2011
30,India,0.28,4.2,2014


In [None]:
def extract_year(text):
    """Return the first four-digit number found in ``text``.

    Args:
        text: Cell content to scan. Values that are not strings (for example
            the float NaN pandas uses for empty cells) count as "no year".

    Returns:
        The first run of four consecutive digits as an ``int``, or ``np.nan``
        when ``text`` is not a string or holds no such run.
    """
    if not isinstance(text, str):
        # re.search raises TypeError on non-string input such as NaN.
        return np.nan
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    m = re.search(r'\d{4}', text)
    if m is None:
        return np.nan
    return int(m.group(0))

# Replace the free text in each source column with the year it mentions
# (NaN where extract_year finds none).
gun_toters['Method of Calculation'] = gun_toters['Method of Calculation'].map(extract_year)
gun_toters['Homicides'] = pd.to_numeric(gun_toters['Homicides'].map(extract_year))
gun_toters['Suicides'] = gun_toters['Suicides'].map(extract_year)
gun_toters['Unintentional'] = gun_toters['Unintentional'].map(extract_year)
gun_toters['Undetermined'] = gun_toters['Undetermined'].map(extract_year)

In [None]:
def best_year(row):
    """Pick the year that best represents one row of the death-rate table.

    Args:
        row: Mapping (for example a DataFrame row) with the keys
            'Method of Calculation', 'Homicides', 'Suicides',
            'Unintentional' and 'Undetermined', each holding a year or NaN.

    Returns:
        The 'Method of Calculation' year when it is present; otherwise the
        latest year among the other four columns, or ``np.nan`` when all of
        them are missing.
    """
    primary = row['Method of Calculation']
    if not pd.isnull(primary):
        return primary
    fallback_columns = ('Homicides', 'Suicides', 'Unintentional', 'Undetermined')
    # Missing values are filtered out first: max() over a list containing NaN
    # depends on element order and can return NaN although years are present.
    known_years = [row[col] for col in fallback_columns
                   if not pd.isnull(row[col])]
    return max(known_years) if known_years else np.nan

# Evaluate best_year once per row (axis=1) and store the result as 'Year'.
gun_toters['Year'] = gun_toters.apply(best_year, axis=1)

## World bank data
Let's get some data from the [World Bank](http://data.worldbank.org). They have all kinds of interesting information. To do that we first need the three-letter country codes, so we can ask for just the countries we want.
This turned out a bit messy. The library 'pycountry' was the best I could find, but people do love to write names in creative ways. So the first thing we need is a function to keep track of that kind of madness.
But once we are done, the country code makes for a good index, since each country appears only once.

## Country code
The World Bank prefers to hand out data to those who hand in a list of country codes. Also, the country names in the Wikipedia data do not match the names in the World Bank data. But the three-letter code seems to work. And the names are similar enough ...

To make this more fun, neither matches the names in the 'pycountry' package. But we are lazy and don't really care. So let's force every name to match the 'pycountry' names. This will make it easier if we want to use that package later.

In [76]:
import pycountry as pc
def country_code(country):
    """Return the three-letter (``alpha_3``) pycountry code for a country name.

    A few spellings are first rewritten to the form the lookup expects. The
    name is then tried against pycountry's ``name``, ``common_name`` and
    ``official_name`` fields, in that order.

    Args:
        country: Country name as a string.

    Returns:
        The ``alpha_3`` attribute of the first matching pycountry entry.

    Raises:
        LookupError: If none of the three fields matches. The offending name
            is printed first so it shows up in the cell output.
    """
    country = country \
        .replace("Macedonia","Macedonia, Republic of") \
        .replace("South Korea","Korea, Republic of") \
        .replace("Korea, Rep.","Korea, Republic of") \
        .replace("Venezuela, RB","Venezuela")
    match = None
    for field in ('name', 'common_name', 'official_name'):
        try:
            match = pc.countries.get(**{field: country})
        except KeyError:
            # Some pycountry versions raise on a miss; others return None.
            match = None
        if match is not None:
            break
    if match is None:
        print ("Whooopsie ...\n" + country)
        # Fail with a clear error instead of an AttributeError on a
        # placeholder value further down.
        raise LookupError(
            "no pycountry entry found for country name: %r" % (country,))
    return match.alpha_3
# Swap every country name for its three-letter code.
gun_toters['Country'] = gun_toters['Country'].apply(country_code)

gun_toters.style

Unnamed: 0,Country,Deaths,Guns,Year
34,JPN,0.06,0.6,2008
60,KOR,0.08,1.1,2011
54,ROU,0.14,0.7,2012
53,QAT,0.15,19.2,2004
56,SGP,0.16,0.5,2006
68,GBR,0.23,6.6,2011
5,BLR,0.23,7.3,2009
67,UKR,0.24,6.6,2009
51,POL,0.26,1.3,2011
30,IND,0.28,4.2,2014


Now we can get that juicy World bank data.

In [77]:
from pandas_datareader import wb
# World Bank indicator codes to download.
indicators = [
    'NY.GDP.PCAP.KD',     # GDP per capita
    'TX.VAL.TECH.MF.ZS',  # Percent of export that is High-tech
]

# Fetch the indicators for the countries in gun_toters and turn the index
# levels into ordinary columns.
wb_values = wb.download(indicator=indicators,
                        country=gun_toters.Country).reset_index()
wb_values.columns = ['Country', 'Year', 'GDP', 'High-tech']

# The download comes back with country names; convert them to the same
# three-letter codes used in gun_toters.
wb_values['Country'] = wb_values['Country'].apply(country_code)

wb_values.style


Unnamed: 0,Country,GDP,High-tech
0,ARG,7924.69,7.81438
1,AUS,47656.8,13.3332
2,AUT,43754.7,15.1465
3,AZE,2488.28,2.79403
4,BEL,42198.1,8.78761
5,BGR,5276.0,4.42279
6,BLR,3772.01,3.10189
7,BOL,1678.05,8.67921
8,BRA,9238.32,12.1306
9,BRB,15205.8,17.3035


We don't really care to have the granularity of years though. So for each country we take the mean of the values. We also need the country code to match up with the other table, so we group by it and keep it as a regular 'Country' column to join on later. Averaging also gets rid of the per-year 'Year' column, which avoids a collision with the 'Year' column in the other data set.

In [None]:
# Average each indicator over all downloaded years, giving one row per
# country. The indicator columns are selected explicitly: pandas 2.0 and
# later no longer drop non-numeric columns silently in mean(), so a
# non-numeric 'Year' column would otherwise raise a TypeError.
wb_values = (
    wb_values
    .groupby('Country')[['GDP', 'High-tech']]
    .mean()
    .reset_index()
)

wb_values.style

That seems fine. But some of the stuff we want to do will fail on missing values. So we should see if there is anything that could be a problem.

In [78]:
# Count the missing values in each column.
wb_values.isnull().sum()

Country      0
GDP          0
High-tech    1
dtype: int64

In [79]:
# Select the rows that have no 'High-tech' value.
high_tech_is_missing = wb_values['High-tech'].isnull()
missing_value = wb_values.loc[high_tech_is_missing]
missing_value

Unnamed: 0,Country,GDP,High-tech
43,MNE,5215.925249,


Ok ... Montenegro. Not a huge country in tech, from what I can tell. Let's find the countries in the same GDP range and assign something similar. Say the mean of that range.

I have no idea if it's reasonable, but plus or minus 1000 in GDP would give a range to work with ...

In [80]:
# Take the GDP of the row with the missing value by position rather than by
# the hard-coded row label 43, so the cell keeps working when the set or
# order of countries changes.
min_gdp = missing_value.GDP.iloc[0] - 1000
max_gdp = missing_value.GDP.iloc[0] + 1000
# Countries whose GDP lies strictly inside the +/- 1000 window.
MNE_tech_range = wb_values[(wb_values.GDP > min_gdp) & (wb_values.GDP < max_gdp)]
MNE_tech_range

Unnamed: 0,Country,GDP,High-tech
5,BGR,5276.003003,4.422789
13,COL,5132.672316,5.766862
34,JAM,4903.662668,0.086861
43,MNE,5215.925249,
48,PAN,5810.007791,1.53908
58,SRB,4353.453377,3.749469


In [81]:
# Mean 'High-tech' share of the countries in the GDP window.
MNE_tech_range_average = MNE_tech_range['High-tech'].mean()
MNE_tech_range_average

3.1130122357362118

In [82]:
# DataFrame.set_value was removed from pandas; .at is the label-based scalar
# setter. The row label comes from missing_value instead of a hard-coded 43.
wb_values.at[missing_value.index[0], 'High-tech'] = MNE_tech_range_average
wb_values.style

Unnamed: 0,Country,GDP,High-tech
0,ARG,7924.69,7.81438
1,AUS,47656.8,13.3332
2,AUT,43754.7,15.1465
3,AZE,2488.28,2.79403
4,BEL,42198.1,8.78761
5,BGR,5276.0,4.42279
6,BLR,3772.01,3.10189
7,BOL,1678.05,8.67921
8,BRA,9238.32,12.1306
9,BRB,15205.8,17.3035


### Joining it all up
Now we have the numbers from the World bank and Wikipedia. Both have the three letter country code as 'Country'.

In [83]:
# Join the Wikipedia and World Bank tables on the shared country code.
gun_toters = gun_toters.merge(wb_values, on='Country')
gun_toters.style

Unnamed: 0,Country,Deaths,Guns,Year,GDP,High-tech
0,JPN,0.06,0.6,2008,43603.1,23.8378
1,KOR,0.08,1.1,2011,17870.1,32.57
2,ROU,0.14,0.7,2012,6436.92,3.84505
3,QAT,0.15,19.2,2004,62518.3,0.00996628
4,SGP,0.16,0.5,2006,37790.3,56.7709
5,GBR,0.23,6.6,2011,38663.0,26.2167
6,BLR,0.23,7.3,2009,3772.01,3.10189
7,UKR,0.24,6.6,2009,2583.44,5.64661
8,POL,0.26,1.3,2011,9565.55,3.40018
9,IND,0.28,4.2,2014,908.143,5.91653


### Feather
Finally we dump the data to a feather file. This enables trivial, fast and compact data reads, writes and storage.

In [84]:
import feather
# Write the merged table to a feather file in the working directory.
path = 'gun_toters.feather'
feather.write_dataframe(gun_toters, path)