# This notebook is used for data wrangling.
### Its main purpose is to create a single curated dataset that can be used for exploratory data analysis.

In [10]:
# Third-party libraries for the notebook; the wrangling steps below rely on pandas.
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#------------GLOBALS-------------
# Area names whose rows are removed (via `isin`) from the merged frame and
# from the youth claimant data further down.
area_rows_to_drop = ['North East', 'North West', 'East Midlands', 'West Midlands',
                   'East', 'London', 'South East', 'South West', 'England', 'Wales',
                   'Scotland', 'Great Britain', 'Inner London', 'Outer London', 'City of London']

# Directory prefixes that my_csv() prepends to a file name
# (path=0 -> RAW_DATA_PATH, path=1 -> PROC_DATA_PATH).
# PROC_DATA_PATH is also where the final CSV is written.
RAW_DATA_PATH = "~/library/data/raw/"
PROC_DATA_PATH = '~/library/data/processed/'

# Columns removed from the business survival data right after it is loaded.
business_cols_to_drop = ['births','1_year_survival_number', '1_year_survival_rate', 
                         '2_year_survival_number', '2_year_survival_rate', '4_year_survival_number', 
                         '4_year_survival_rate' ,'5_year_survival_number', '5_year_survival_rate']
    
#------------FUNCTIONS-------------
def colToDrop(df):
    """Return the labels of the columns that should be dropped from *df*.

    The first two columns are always kept. From the third column onwards,
    every column whose label does not contain ``'Jan'`` is listed, so that a
    following ``df.drop(labels, axis=1)`` leaves only the first two columns
    and the columns whose label contains ``'Jan'``.

    The earlier implementation reassigned the loop variable
    (``index += 2``). That does not skip iterations of a ``for`` loop; it
    listed the column two positions ahead instead (creating duplicates, or
    listing a later 'Jan' column) and listed a 'Jan' column itself whenever
    it sat in the last two positions.

    Args:
        df: DataFrame whose column labels are inspected. It is not modified.

    Returns:
        list: Labels to drop, in column order and without duplicates.
    """
    unwanted = (label for label in df.columns[2:] if 'Jan' not in str(label))
    # dict.fromkeys removes repeated labels while keeping first-seen order.
    return list(dict.fromkeys(unwanted))


def replaceCol(df):
    """Build the rename mapping for a frame's column labels.

    The first label maps to ``'code'`` and the second to ``'area'``. Every
    label at an odd 0-based position from the fourth column onwards maps to
    ``'+/- Difference'``. Labels at even positions are left out of the
    mapping.

    Args:
        df: DataFrame with at least two columns.

    Returns:
        dict: ``{current_label: new_label}`` suitable for ``df.rename``.
    """
    labels = df.columns
    renames = {labels[0]: 'code', labels[1]: 'area'}
    # Positions 3, 5, 7, ... are the odd indices at or after 2.
    for position in range(3, len(labels), 2):
        renames[labels[position]] = '+/- Difference'
    return renames

def my_csv(file, path, encoding='utf-8', sep=',', *args, **kwargs):
    """Read a CSV file from one of the project data directories.

    Args:
        file: File name; it is appended to the directory prefix.
        path: ``0`` selects ``RAW_DATA_PATH`` and ``1`` selects
            ``PROC_DATA_PATH``. A string is used as the directory prefix
            exactly as given.
        encoding: Text encoding handed to ``pandas.read_csv``.
        sep: Field delimiter. It was previously accepted but ignored; a
            non-default value is now handed to ``pandas.read_csv``.
        *args: Extra positional arguments forwarded to ``pandas.read_csv``.
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed file.

    Raises:
        ValueError: If ``path`` is neither ``0``, ``1`` nor a string.
    """
    if isinstance(path, str):
        prefix = path
    elif path == 1:
        prefix = PROC_DATA_PATH
    elif path == 0:
        prefix = RAW_DATA_PATH
    else:
        # The old try/except could never fire, so this hint was never shown
        # and the failure surfaced later as a TypeError from ``path + file``.
        raise ValueError(
            "Pick either 1 for the processed data path or 0 for the raw data path."
        )

    read_options = dict(kwargs, encoding=encoding)
    # Forward ``sep`` only when the caller changed it and passed no extra
    # positional arguments, so existing calls reach read_csv with exactly
    # the arguments they did before.
    if sep != ',' and not args:
        read_options['sep'] = sep
    return pd.read_csv(prefix + file, *args, **read_options)

#### Here we get all datasets we are going to use in our analysis

In [48]:
# Files read from the raw directory (path=0 -> RAW_DATA_PATH). The
# na_values entries are the strings pandas should treat as missing.
df_1 = my_csv('business-survival-rates.csv', path=0, na_values=[':'])
df_2 = my_csv('Jobs and Job Density.csv', path=0) 
df_3 = my_csv('mb-unemployment-rates.csv', path=0, na_values = [' ', '-'])
# Files read from the processed directory (path=1 -> PROC_DATA_PATH).
df_4 = my_csv('processed_nvq4-qualifications-overtime.csv', path=1)
df_5 = my_csv('processed_youth-claimants-rate.csv', path=1)
df_6 = my_csv('processed_no-qualifications-overtime.csv', path=1)
# NOTE(review): df_7 is not referenced again in the cells visible here.
df_7 = my_csv('processed_qualifications-average-2014.csv', path=1)

#### Below, for each dataset, we drop columns, melt the dataframe, and rename columns so that all datasets can be merged on the same keys

In [49]:
# Remove, in place, the columns listed in business_cols_to_drop.
df_1.drop(business_cols_to_drop, inplace=True, axis=1)

In [50]:
# Discard every row that has at least one missing value.
df_1.dropna(inplace=True, axis=0, how='any')
df_3.dropna(inplace=True, axis=0, how='any')
# Rename the first two columns to 'code'/'area' and the columns at odd
# 0-based positions to '+/- Difference' (see replaceCol).
mapping = replaceCol(df_3)
df_3.rename(columns=mapping, inplace=True)
# Renumber the rows from 0 after the dropna above removed some of them.
df_3 = df_3.reset_index(drop=True)
# Combine the business survival and jobs data on the shared key columns.
df = pd.merge(df_1, df_2, on=['code','area', 'year'])

In [51]:
# Drop, in place, the columns that colToDrop() selects for removal.
df_3.drop(colToDrop(df_3), inplace=True, axis=1)

In [52]:
# Wide -> long: keep 'code' and 'area' as identifiers; each remaining column
# becomes rows holding its label in 'Date' and its cell value in 'Value'.
df_3 = df_3.melt(id_vars=["code", "area"], 
        var_name="Date", 
        value_name="Value")

In [53]:
df_3

Unnamed: 0,code,area,Date,Value
0,E09000002,Barking and Dagenham,Jan 2004-Dec 2004,8.3
1,E09000003,Barnet,Jan 2004-Dec 2004,6.3
2,E09000004,Bexley,Jan 2004-Dec 2004,4.1
3,E09000005,Brent,Jan 2004-Dec 2004,9.1
4,E09000006,Bromley,Jan 2004-Dec 2004,4.7
5,E09000007,Camden,Jan 2004-Dec 2004,7.6
6,E09000008,Croydon,Jan 2004-Dec 2004,6.2
7,E09000009,Ealing,Jan 2004-Dec 2004,6.5
8,E09000010,Enfield,Jan 2004-Dec 2004,6.6
9,E09000011,Greenwich,Jan 2004-Dec 2004,8.0


#### Lose the months and get only the year

In [18]:
def _first_number(label):
    """Return the first whitespace-separated all-digit token of *label* as an int."""
    numeric_tokens = [int(token) for token in str.split(label) if token.isdigit()]
    return numeric_tokens[0]


# Replace each period label (e.g. 'Jan 2004-Dec 2004') by the number found in it.
df_3['Date'] = df_3['Date'].apply(_first_number)

In [19]:
# Use the same 'year' key name as the merged frame and give the value
# column a descriptive name.
df_3.rename(columns={'Date':'year', 'Value':'unemployment_rate'}, inplace=True)

In [20]:
# Add the unemployment rate; no `how` is given, so pandas' default join applies.
df = pd.merge(df, df_3, on=['code','area', 'year'])

In [21]:
# Make sure the rate column is stored as floating point.
df['unemployment_rate'] = df['unemployment_rate'].astype('float')

In [22]:

# Keep only the rows whose area is NOT listed in area_rows_to_drop.
df = df[~df['area'].isin(area_rows_to_drop)]

In [23]:
df

Unnamed: 0,code,area,year,3_year_survival_number,3_year_survival_rate,number_of_jobs,job_density,unemployment_rate
0,E09000002,Barking and Dagenham,2004,290.0,60.4,53000,0.51,8.3
1,E09000003,Barnet,2004,1385.0,56.9,135000,0.63,6.3
2,E09000004,Bexley,2004,565.0,60.1,85000,0.61,4.1
3,E09000005,Brent,2004,995.0,55.3,119000,0.65,9.1
4,E09000006,Bromley,2004,1015.0,63.8,126000,0.67,4.7
5,E09000007,Camden,2004,1810.0,63.7,278000,1.82,7.6
6,E09000008,Croydon,2004,1000.0,61.3,153000,0.69,6.2
7,E09000009,Ealing,2004,1210.0,57.2,136000,0.63,6.5
8,E09000010,Enfield,2004,855.0,61.1,110000,0.59,6.6
9,E09000011,Greenwich,2004,590.0,60.5,77000,0.50,8.0


In [29]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
df_4.drop(['Unnamed: 0'], axis=1, inplace=True)

In [30]:
# Discard rows containing any missing value.
df_4.dropna(how='any', inplace=True)

In [31]:
# Lower-case the key column names so they match the merged frame.
df_4.rename(columns={'Code':'code', 'Area':'area'}, inplace=True)

In [32]:
# Wide -> long: each non-identifier column becomes rows with its label in
# 'year' and its cell value in 'NVQ4_rate'.
df_4 = df_4.melt(id_vars=["code", "area"], 
        var_name="year", 
        value_name="NVQ4_rate")

In [33]:
# The melted labels are converted to int64 so 'year' can be used as a merge key.
df_4['year'] = df_4['year'].astype('int64')

In [34]:
# Join on area name and year only. Both frames also have a 'code' column,
# so the result carries 'code_x' and 'code_y'.
df = pd.merge(df, df_4, on=['area', 'year'])

In [35]:
# Keep the left-hand frame's code column under its original name.
df.rename(columns={'code_x':'code'}, inplace=True)

In [36]:
# Discard the duplicate code column contributed by df_4.
df.drop(['code_y'], axis=1, inplace=True)

In [37]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
df_5.drop(['Unnamed: 0'], axis=1, inplace=True)

In [38]:
# Discard rows containing any missing value.
df_5.dropna(how='any', inplace=True)

In [39]:
# Lower-case the key column names so they match the merged frame.
df_5.rename(columns={'Code':'code', 'Area':'area'}, inplace=True)

In [40]:
# Keep only the rows whose area is NOT listed in area_rows_to_drop.
df_5 = df_5[~df_5['area'].isin(area_rows_to_drop)]

In [41]:
# Wide -> long: each non-identifier column becomes rows with its label in
# 'year' and its cell value in 'youth_unemployment_rate'.
df_5 = df_5.melt(id_vars=["code", "area"], 
        var_name="year", 
        value_name="youth_unemployment_rate")

In [42]:
# The melted labels are converted to int64 so 'year' can be used as a merge key.
df_5['year'] = df_5.loc[:,'year'].astype('int64')

In [43]:
df_5.dtypes

code                        object
area                        object
year                         int64
youth_unemployment_rate    float64
dtype: object

In [44]:
# Add the youth unemployment rate, matching on area, year and code.
df = pd.merge(df, df_5, on=['area', 'year', 'code'])

In [45]:
# Prepare the no-qualifications data in one chain: drop the 'Unnamed: 0'
# column, discard incomplete rows, align the key column names, reshape to
# one row per (code, area, year) and make 'year' an int64 merge key.
df_6 = (
    df_6.drop(columns=['Unnamed: 0'])
    .dropna(how='any')
    .rename(columns={'Code': 'code', 'Area': 'area'})
    .melt(id_vars=['code', 'area'], var_name='year', value_name='no_qualif_rate')
    .astype({'year': 'int64'})
)

# Join on area name and year only; the two 'code' columns come back as
# 'code_x' and 'code_y'.
df = pd.merge(df, df_6, on=['area', 'year'])

In [46]:
# Discard the duplicate code column contributed by df_6 and restore the
# left-hand frame's code column to its original name.
df.drop(['code_y'], axis=1, inplace=True)
df.rename(columns={'code_x':'code'}, inplace=True)

In [47]:
# Write the curated dataset to the processed data directory.
# NOTE(review): no index=False is passed, so the row index is written as
# well; presumably that is the origin of the 'Unnamed: 0' columns dropped
# from the processed inputs above - confirm before changing the format.
df.to_csv(PROC_DATA_PATH + 'borough-unemployment-and-business-data.csv')