# Urbanization Data Analysis

In [7]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import plotly.plotly as py
import cufflinks as cf
import plotly.graph_objs as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr, spearmanr
%matplotlib inline

# Cufflinks defaults for the figures in this notebook: 'space' theme, offline mode.
cf.set_config_file(
    offline=True,
    theme='space',
)
# Do not truncate long cell contents when frames are displayed.
# None is the supported "no limit" value; the legacy -1 sentinel has been
# deprecated since pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)

## Data Importing

In [8]:
# Urbanization subject table, indexed by (Country Code, Year).
df = (
    pd.read_csv("../data/production/subject/Urbanization.csv")
    .set_index(["Country Code", "Year"])
)

# Data-dictionary rows for the columns of df, in df's column order.
# reindex() replaces the former .loc[df.columns]: list-based .loc raises
# KeyError on pandas >= 1.0 when any column has no dictionary entry, whereas
# reindex keeps such columns as all-NaN rows.
# NOTE(review): reindex requires unique "Code" values — confirm for this file.
dd = (
    pd.read_csv("../data/production/data_dictionary.csv")
    .set_index("Code")
    .reindex(df.columns)
)

# Columns treated as tourism indicators; the correlation cell sets these
# against the remaining columns of df.
tourism_columns = ['ST.INT.ARVL', 'ST.INT.XPND.MP.ZS', 'ST.INT.XPND.CD', 'ST.INT.DPRT',
       'ST.INT.RCPT.XP.ZS', 'ST.INT.RCPT.CD', 'Tourist Defecit', 'Tourism Net',
       'Tourist Avg Net', 'Population Estimate', 'ST.INT.ARVL.PER.CAPITA',
       'ST.INT.DPRT.PER.CAPITA']

In [9]:
# Display the data-dictionary entries selected for the columns of df.
dd

Unnamed: 0_level_0,Indicator Name
Code,Unnamed: 1_level_1
ST.INT.ARVL,"International tourism, number of arrivals"
ST.INT.XPND.MP.ZS,"International tourism, expenditures (% of total imports)"
ST.INT.XPND.CD,"International tourism, expenditures (current US$)"
ST.INT.DPRT,"International tourism, number of departures"
ST.INT.RCPT.XP.ZS,"International tourism, receipts (% of total exports)"
ST.INT.RCPT.CD,"International tourism, receipts (current US$)"
Tourist Defecit,The difference in outbound-inbound tourists for a country
Tourism Net,The difference in tourism recepts-expenditures
Tourist Avg Net,The average net income per tourist
Population Estimate,The UNPD estimated population for the country


## Correlations

In [11]:
# Correlation table with the tourism indicators as rows and every
# non-tourism column of df as columns.
corr = (
    df.corr()
    .loc[tourism_columns]
    .drop(tourism_columns, axis="columns")
)

Unnamed: 0,SP.URB.TOTL.IN.ZS,SP.URB.GROW,EN.URB.MCTY.TL.ZS,EN.URB.LCTY.UR.ZS
ST.INT.ARVL,0.231343,-0.154175,0.102926,-0.278758
ST.INT.XPND.MP.ZS,0.223903,0.026986,0.168162,0.221415
ST.INT.XPND.CD,0.214198,-0.110524,0.141262,-0.256581
ST.INT.DPRT,0.209745,-0.160091,0.136316,-0.208975
ST.INT.RCPT.XP.ZS,0.010882,-0.085271,-0.093987,0.523458
ST.INT.RCPT.CD,0.215581,-0.119828,0.157882,-0.216833
Tourist Defecit,-0.015239,0.03545,-0.060694,-0.036802
Tourism Net,-0.003246,-0.011115,0.021246,0.065531
Tourist Avg Net,0.292374,-0.068552,0.230165,0.07337
Population Estimate,-0.068803,0.051652,-0.017928,-0.266919


In [14]:
# Interactive heatmap of the tourism-vs-urbanization correlation table.
corr.iplot(
    kind='heatmap',
    title="Correlations between Tourism indicators and Urbanization",
    colorscale='-rdbu',
    zerolinecolor="white",
    margin=(150,150,150,50),
    dimensions=(640,500),
)

## Variable Distributions

In [15]:
def draw_histograms(dataframe, shape=(6, 3)):
    """Draw histograms of ``dataframe`` as a grid of subplots via ``iplot``.

    Parameters
    ----------
    dataframe
        Object exposing an ``iplot`` method; in this notebook, a pandas
        DataFrame with cufflinks loaded.
    shape : tuple of int, default (6, 3)
        Subplot grid forwarded to ``iplot`` as ``shape``. The default keeps
        the grid this notebook has always used; pass a different grid when
        the frame does not fit it.
        NOTE(review): presumably (rows, columns) — confirm against cufflinks.

    Returns
    -------
    Whatever ``dataframe.iplot`` returns.
    """
    return dataframe.iplot(kind='histogram', subplots=True, shape=shape)

# Histogram grid for the columns of df as loaded (before normalization).
draw_histograms(df)

### Normalize

In [16]:
# Mean-normalize each column: subtract the column mean, then divide by the
# column's range (max - min).
df_norm = (
    df.sub(df.mean(), axis="columns")
    .div(df.max() - df.min(), axis="columns")
)
draw_histograms(df_norm)