In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import pearsonr

In [None]:
def load_data(data_dir='births_data', years=range(2002, 2016 + 1)):
    """Load the yearly regional births CSVs into one DataFrame.

    For every year a file named ``regions_pl_uro_<year>_00_2p.csv`` is read
    from ``data_dir``. The ``id`` and ``nieustalona`` columns are dropped
    (``nieustalona`` is presumably the "undetermined birth order" bucket --
    TODO confirm against the data source) and a ``year`` column is added.

    Parameters
    ----------
    data_dir : str
        Directory that holds the yearly CSV files.
    years : iterable of int
        Years to load; one file per year is expected.

    Returns
    -------
    pandas.DataFrame
        All years stacked with a fresh 0..n-1 index. The birth-order columns
        ``'1'`` .. ``'9'`` and ``'10 i dalsze'`` are cast to ``int`` and the
        ``region`` values have surrounding whitespace removed.
    """
    birth_order_cols = [str(n) for n in range(1, 10)] + ['10 i dalsze']

    # Collect the per-year frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and appending in a loop is quadratic anyway.
    yearly_frames = []
    for year in years:
        filepath = '%s/regions_pl_uro_%d_00_2p.csv' % (data_dir, year)
        curr_df = pd.read_csv(filepath).drop(columns=['id', 'nieustalona'])
        curr_df['year'] = year
        yearly_frames.append(curr_df)

    df = pd.concat(yearly_frames, ignore_index=True)

    # Fix column type for number of births
    df[birth_order_cols] = df[birth_order_cols].astype(int)

    # Remove whitespaces for regions
    df['region'] = df['region'].str.strip()

    return df


In [None]:
# Load every yearly CSV once; the analysis cells below all reuse births_df.
births_df = load_data()

In [None]:
# Preview the first rows of the combined frame.
births_df.head()

In [None]:
def make_1_vs_more_kids(df):
    """Collapse the birth-order columns into "first child" vs "any later child".

    The input frame is left untouched. The result keeps ``year`` and
    ``region`` and adds ``one_kid`` (the ``'1'`` column) and ``more_kids``
    (row-wise sum of ``'2'`` .. ``'9'`` and ``'10 i dalsze'``).
    """
    later_birth_cols = [str(order) for order in range(2, 10)] + ['10 i dalsze']

    result = df[['year', 'region']].copy()
    result['one_kid'] = df['1']
    # Sum on the raw array so the values are assigned positionally.
    result['more_kids'] = df[later_birth_cols].to_numpy().sum(axis=1)
    return result


### TASK 1 - Compute Pearson correlation

In [None]:
def compute_pearson(df):
    """Report the Pearson correlation between first-child and later-child births.

    The frame is first reduced with ``make_1_vs_more_kids``. The coefficient
    between ``one_kid`` and ``more_kids`` is then computed separately for each
    region (rounded to two decimals and drawn as an annotated bar chart) and
    once over all rows (printed).

    Parameters
    ----------
    df : pandas.DataFrame
        Births frame accepted by ``make_1_vs_more_kids``.

    Returns
    -------
    None
    """
    df = make_1_vs_more_kids(df)

    # For each region. groupby yields the groups in sorted key order, so the
    # bar order is stable between runs (iterating a set of names is not).
    regions = []
    coeffs_by_region = []
    for region, data in df.groupby('region'):
        coeff = pearsonr(data['one_kid'], data['more_kids'])
        regions.append(region)
        coeffs_by_region.append(np.round(coeff[0], 2))

    # For all regions
    coeff_all_regions = pearsonr(df['one_kid'], df['more_kids'])
    print('For all regions:', np.round(coeff_all_regions[0], 2))

    # Plotting
    fig = plt.figure(figsize=(15, 6))
    ax = fig.add_subplot(111)
    ax.set_title('Pearson 1 vs more kids for each region')
    # x/y must be passed by keyword: recent seaborn releases reject
    # positional data vectors.
    sns.barplot(x=regions, y=coeffs_by_region, ax=ax)
    ax.tick_params(axis='x', labelrotation=45)

    # Write each coefficient at the top of its bar.
    for position, value in enumerate(coeffs_by_region):
        ax.text(position, value, str(value), ha='center')

    
# Task 1: print the overall coefficient and draw the per-region bar chart.
compute_pearson(births_df)

### TASK 2 - Plot box plots of data

In [None]:
def plot_box_plots(df, program_year=2016):
    """Draw four box plots of total births (first child + later children).

    Panels, in a 2x2 grid:

    1. distribution over years, one box per region;
    2. one box over all region/year rows;
    3. distribution over regions, one box per year;
    4. per-region mean before ``program_year`` vs. per-region mean from
       ``program_year`` onwards ("Before" / "After" the 500+ programme).

    Parameters
    ----------
    df : pandas.DataFrame
        Births frame accepted by ``make_1_vs_more_kids``.
    program_year : int, optional
        First year counted as "After". Defaults to 2016, matching the
        previously hard-coded split on the last loaded year.

    Returns
    -------
    None
    """
    df = make_1_vs_more_kids(df)

    # Long-form frame: one row per (year, region) with the total number of
    # births. seaborn consumes this directly through data=/x=/y=, which avoids
    # passing lists of arrays as the y vector.
    totals = df[['year', 'region']].copy()
    totals['kids'] = df['one_kid'] + df['more_kids']

    fig = plt.figure(figsize=(20, 15))

    # For each region (sorted so the box order is stable between runs)
    ax = fig.add_subplot(221)
    sns.boxplot(data=totals, x='region', y='kids',
                order=sorted(totals['region'].unique()), ax=ax)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_title('Births per year, by region')

    # For all regions
    ax2 = fig.add_subplot(222)
    sns.boxplot(data=totals.assign(scope='All regions'),
                x='scope', y='kids', ax=ax2)
    ax2.set_xlabel('')
    ax2.set_title('Births per region and year, all regions')

    # Each year. Both axes are numeric, so state the orientation explicitly.
    ax3 = fig.add_subplot(223)
    sns.boxplot(data=totals, x='year', y='kids', orient='v', ax=ax3)
    ax3.set_title('Births per region, by year')

    # Before and after 500+: average each region within its period, so the
    # result does not depend on rows being in the same region order each year.
    period = np.where(totals['year'] >= program_year, 'After', 'Before')
    per_region = (
        totals.assign(period=period)
        .groupby(['period', 'region'], as_index=False)['kids']
        .mean()
    )
    ax4 = fig.add_subplot(224)
    # Pass ax explicitly instead of relying on the implicit current axes.
    sns.boxplot(data=per_region, x='period', y='kids',
                order=['Before', 'After'], ax=ax4)
    ax4.set_xlabel('')
    ax4.set_title('Mean births per region, before vs. after 500+')
    
    
    
# Task 2: draw the box plots for the loaded births data.
plot_box_plots(births_df)

### Additional plots

In [None]:
def make_additional_plots(df, program_year=2016):
    """Draw two extra plots: births over time and before/after 500+ by birth order.

    Top panel: one line per region of the ``total`` column against ``year``.
    Bottom panel: grouped bars per birth-order column (``'1'`` .. ``'9'``,
    ``'10 i dalsze'``) comparing the mean yearly country-wide count before
    ``program_year`` with the mean yearly count from ``program_year`` onwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``year``, ``region``, ``total`` and the birth-order columns.
    program_year : int, optional
        First year counted as "After". Defaults to 2016.

    Returns
    -------
    None
    """
    fig = plt.figure(figsize=(15, 20))
    # NOTE: this changes the global seaborn style for later figures as well.
    sns.set_style("darkgrid")
    ax = fig.add_subplot(211)

    # groupby iterates regions in sorted order, so the legend is stable.
    for region, data in df.groupby('region'):
        # Take x from the data itself rather than from a hard-coded year
        # list, and sort so the line is drawn chronologically.
        data = data.sort_values('year')
        ax.plot(data['year'], data['total'], label=region,
                linestyle='--', marker='o')

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_title('Total births per region')

    ax2 = fig.add_subplot(212)

    nb_kids = [str(i) for i in range(1, 10)] + ['10 i dalsze']

    # Country-wide count per year, then averaged within each period. This
    # replaces the hard-coded "/ 14" with the number of years actually present.
    yearly = df.groupby('year')[nb_kids].sum()
    before = yearly[yearly.index < program_year].mean()
    after = yearly[yearly.index >= program_year].mean()

    # Build the long-form frame in one go (DataFrame.append was removed in
    # pandas 2.0).
    ba_df = pd.concat(
        [
            pd.DataFrame({'nb_kids': nb_kids,
                          'count': before.to_numpy(),
                          'meaning': 'Before'}),
            pd.DataFrame({'nb_kids': nb_kids,
                          'count': after.to_numpy(),
                          'meaning': 'After'}),
        ],
        ignore_index=True,
    )

    sns.barplot(x='nb_kids', y='count', hue='meaning', data=ba_df, ax=ax2)
    ax2.set_title('Mean yearly births by birth order, before vs. after 500+')
    ax2.legend()
    
# Draw the additional plots for the loaded births data.
make_additional_plots(births_df)