## Main

In [6]:
import argparse


# Country names that main() accepts as a value for the --country option.
COUNTRIES = [
    'Belgium', 'Lithuania', 'Portugal', 'Bulgaria',
    'Spain', 'Luxembourg', 'Romania', 'Czechia',
    'France', 'Hungary', 'Slovenia', 'Denmark',
    'Croatia', 'Malta', 'Slovakia', 'Germany',
    'Italy', 'Netherlands', 'Finland', 'Estonia',
    'Cyprus', 'Austria', 'Sweden', 'Ireland',
    'Latvia', 'Poland', 'Great Britain', 'Greece',
]


def argument_parser(argv=None):
    """Parse the command-line options of the pipeline.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what a
            zero-argument call has always done. Passing an explicit list
            makes the parser usable from other code and from tests.

    Returns:
        argparse.Namespace: has the attributes ``country`` and ``votes``;
        each one is ``None`` when the corresponding option was not given.
    """
    parser = argparse.ArgumentParser(description='Obtain a full or country focused scope')

    parser.add_argument("-c", "--country", help="Create output for country X", type=str)
    parser.add_argument("-v", "--votes", help="Print table with In Favour/Against counts", type=str)

    return parser.parse_args(argv)


def main():
    """Run the interactive export pipeline.

    1. Export the Challenge 1 table to ``data/results/<country>.csv`` for a
       single country, or to ``data/results/all.csv`` when ``--country All``
       is given.
    2. Ask on stdin whether to print and export the bonus 1 table
       (``data/results/votes.csv``).
    3. Ask on stdin whether to print and export the bonus 2 table
       (``data/results/top_skills.csv``).

    Any answer other than ``yes`` or ``no`` to a prompt skips that step
    silently.

    Raises:
        ValueError: if ``--country`` is neither a member of ``COUNTRIES`` nor
            the literal ``'All'`` (this includes the option being omitted).
    """
    print('Running pipeline...')

    # Parse the command line once and keep the value. Interpolating the whole
    # Namespace into the file name produced paths such as
    # "Namespace(country='Spain', votes=None).csv".
    country = argument_parser().country

    # 1/3 - Export the Challenge 1 result to csv using parsed arguments

    if country in COUNTRIES:
        table = export_challenge()
        country_rows = table.loc[table['Country'] == country]
        country_rows.to_csv(f'data/results/{country}.csv', index=False)
    elif country == 'All':
        table = export_challenge()
        table.to_csv('data/results/all.csv')
    else:
        raise ValueError("Please chose a valid country from the list, or alternatively, type 'All'")

    print('Table exported! Check your output folder ')

    # 2/3 - Print and export bonus 1 table to csv

    b1 = input('Would you also like an excel table with bonus 1? (yes/no)')
    if b1 == 'yes':
        bonus_1_table = export_bonus_1()
        print(bonus_1_table)
        bonus_1_table.to_csv('data/results/votes.csv')
        print('table has been exported, Check your folder!')
    elif b1 == 'no':
        print('Whatever... Your loss!')

    # 3/3 - Print and export bonus 2 table to csv

    b2 = input('How about top skills table for bonus 2? (yes/no)')
    if b2 == 'yes':
        bonus_2_table = export_bonus_2()
        bonus_2_table.to_csv('data/results/top_skills.csv')
        print(bonus_2_table)
        print('table has been exported, Check your folder!')
    elif b2 == 'no':
        print('Oh... what a waste of my time then')

    print('Script complete, thanks for using this awesome tool')


if __name__ == '__main__':
    # NOTE(review): main() does not read `arguments`; it calls
    # argument_parser() itself. This call only parses the command line once
    # up front, before the pipeline starts printing.
    arguments = argument_parser()
    main()


Unnamed: 0,country_code,normalized_job_title,gender
0,Austria,Unemployed,male
1,Austria,Unemployed,male
2,Austria,Unemployed,female
3,Austria,Unemployed,female
4,Austria,Unemployed,male
...,...,...,...
9644,Italy,data capture clerk,male
9645,Poland,data capture clerk,male
9646,Poland,data capture clerk,male
9647,Portugal,data capture clerk,male


## Acquisition

In [3]:
from sqlalchemy import create_engine
import pandas as pd
import requests

def acquire():
    """Build the merged poll / job-skills / country-name table.

    The function performs three acquisitions and then combines them:

    1. Reads the poll respondents from the SQLite database by joining
       ``country_info``, ``career_info``, ``personal_info`` and ``poll_info``
       on ``uuid``.
    2. Requests the related-skills endpoint of the dataatwork API once per
       distinct ``normalized_job_code`` and keeps the first listed skill of
       each job as ``top_skill``.
    3. Scrapes the first table of the Eurostat country-code glossary to map
       country acronyms to country names.

    The poll rows are outer-merged with the API results, every null in the
    merged frame is filled with ``'Unemployed'``, country acronyms are
    replaced by names, and ``'Unemployed'`` in ``dem_education_level`` is
    turned into ``'no'``.

    Returns:
        pandas.DataFrame: the merged table described above.
    """

    # 1/3 Connection to the db

    sqlitedb_rel_path = '../data/processed/raw_data_project_m1.db'
    conn_str = f'sqlite:///{sqlitedb_rel_path}'
    engine = create_engine(conn_str)

    # Use dBeaver to cleanup a few fields i.e. gender, then make a DataFrame:

    sql_query = """ 
    SELECT country_info.uuid,
           country_info.country_code,
           career_info.dem_education_level,
           career_info.normalized_job_code, 
           personal_info.age, 
           personal_info.gender,
           poll_info.question_bbi_2016wave4_basicincome_vote,
           poll_info.question_bbi_2016wave4_basicincome_argumentsagainst,
       poll_info.question_bbi_2016wave4_basicincome_argumentsfor 
    FROM country_info
    JOIN career_info ON country_info.uuid = career_info.uuid
    JOIN personal_info ON personal_info.uuid = career_info.uuid
    JOIN poll_info ON poll_info.uuid = personal_info.uuid
    """

    # The query is run exactly once; the result feeds both the API lookups
    # and the final merge.
    poll_db = pd.read_sql_query(sql_query, engine)

    # 2/3 Extract job skills and description table using APIs
    # Make a list with the distinct job ids, and use it to get the related
    # skills from the website (open skills api)

    poll_job_code = poll_db['normalized_job_code'].unique().tolist()

    url_list = [f'http://api.dataatwork.org/v1/jobs/{job_code}/related_skills'
                for job_code in poll_job_code]

    # Make a DataFrame with the collection of results (one row per request).

    api_skills = pd.DataFrame([requests.get(url).json() for url in url_list])

    # Remove the responses that carry no skills

    api_skills.drop(index=api_skills[api_skills.skills.isnull()].index, inplace=True)

    # Take the first skill of each list; according to the original author's
    # note it is always the one with the highest 'importance' value.

    def get_max(x):
        top_skill = pd.json_normalize(x).iloc[0]['skill_name']
        return top_skill

    api_skills['top_skill'] = api_skills['skills'].apply(get_max)

    # 3/3 Get country names with web scraping

    url = 'https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:Country_codes'
    html = requests.get(url).content

    # Make the soup

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # The first table on the page is the one holding the country codes

    table = soup.find_all('table')[0]

    # Keep only the cell texts (country names and acronyms)

    items = [x.text for x in table.find_all('td')]

    # Split the cells into two lists: cells starting with '(' are acronyms
    # (the leading '(' and the last two characters are stripped), every other
    # cell is a country name (its last character is stripped).

    countries = []
    acronyms = []
    for i in items:
        if i.startswith('('):
            acronyms.append(i[1:-2])
        else:
            countries.append(i[:-1])

    raw_dict = {'countries': countries, 'acronyms': acronyms}

    # Reshape into {acronym: country}, the form needed by DataFrame.replace.
    # Going through a DataFrame also raises if the two lists differ in length.

    raw_dict_df = pd.DataFrame(raw_dict)
    new_dict = raw_dict_df.set_index('acronyms').to_dict()
    countries_dict = new_dict['countries']

    # Adjust a couple of keys: add 'GB' and 'GR', and drop the 'EL' entry.

    countries_dict['GB'] = 'Great Britain'
    countries_dict['GR'] = 'Greece'
    # pop() instead of del: do not crash if the scraped table has no 'EL' row.
    countries_dict.pop('EL', None)

    # Merge poll file with api job list

    poll_api_merge = poll_db.merge(api_skills, left_on='normalized_job_code', right_on='job_uuid', how='outer')

    # Fill every null with 'Unemployed', then map acronyms to country names

    poll_api_merge_countries = poll_api_merge.fillna('Unemployed').replace({"country_code": countries_dict})

    # Small correction to the education level column, where nulls were populated with Unemployed

    poll_api_merge_countries['dem_education_level'] = poll_api_merge_countries['dem_education_level'].str.replace(
        'Unemployed', 'no')

    return poll_api_merge_countries

In [4]:
# Run the whole acquisition (database read, API requests, web scrape) and
# show the resulting DataFrame as the cell output.
acquired = acquire()
acquired

Unnamed: 0,uuid,country_code,dem_education_level,normalized_job_code,age,gender,question_bbi_2016wave4_basicincome_vote,question_bbi_2016wave4_basicincome_argumentsagainst,question_bbi_2016wave4_basicincome_argumentsfor,error,job_uuid,job_title,normalized_job_title,skills,top_skill
0,f6e7ee00-deac-0133-4de8-0a81e8b09a82,Austria,no,Unemployed,61 years old,male,I would not vote,None of the above,None of the above,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed
1,83127080-da3d-0133-c74f-0a81e8b09a82,Austria,no,Unemployed,32 years old,male,I would not vote,Foreigners might come to my country and take a...,It creates more equality of opportunity,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed
2,b50dbb80-da53-0133-8956-0a81e8b09a82,Austria,medium,Unemployed,26 years old,female,I would probably vote for it,Foreigners might come to my country and take a...,It reduces anxiety about financing basic needs,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed
3,9949c4c0-da5f-0133-c832-0a81e8b09a82,Austria,no,Unemployed,22 years old,female,I would vote for it,Foreigners might come to my country and take a...,"It increases solidarity, because it is funded ...",Unemployed,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed
4,69f1f400-dc5f-0133-ad9b-0a81e8b09a82,Austria,medium,Unemployed,58 years old,male,I would probably vote for it,Foreigners might come to my country and take a...,It increases appreciation for household work a...,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed,Unemployed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9644,1cb1aac0-d94c-0133-8baa-0a81e8b09a82,Italy,high,4cee16550636e292b8d136486fce943b,58 years old,male,I would probably vote for it,Only the people who need it most should get so...,It reduces anxiety about financing basic needs...,Unemployed,4cee16550636e292b8d136486fce943b,Data Capture Clerk,data capture clerk,[{'skill_uuid': '162cb662c7ef4dc9e86a2b5e88cbc...,clerical
9645,c8c33390-da69-0133-063a-0a81e8b09a82,Poland,medium,4cee16550636e292b8d136486fce943b,44 years old,male,I would vote for it,It might encourage people to stop working | It...,It reduces anxiety about financing basic needs,Unemployed,4cee16550636e292b8d136486fce943b,Data Capture Clerk,data capture clerk,[{'skill_uuid': '162cb662c7ef4dc9e86a2b5e88cbc...,clerical
9646,d27d24d0-d9b1-0133-03d4-0a81e8b09a82,Poland,high,4cee16550636e292b8d136486fce943b,24 years old,male,I would vote for it,None of the above,It reduces anxiety about financing basic needs...,Unemployed,4cee16550636e292b8d136486fce943b,Data Capture Clerk,data capture clerk,[{'skill_uuid': '162cb662c7ef4dc9e86a2b5e88cbc...,clerical
9647,529f3080-d99a-0133-1b7b-0a81e8b09a82,Portugal,no,4cee16550636e292b8d136486fce943b,40 years old,male,I would probably vote for it,It is impossible to finance,It encourages financial independence and self-...,Unemployed,4cee16550636e292b8d136486fce943b,Data Capture Clerk,data capture clerk,[{'skill_uuid': '162cb662c7ef4dc9e86a2b5e88cbc...,clerical


## Wrangling 

In [6]:
from p_acquisition.m_acquisition import acquire
# Executed when this code is loaded: challenge_1_table, bonus_1_table and
# bonus_2_table read this module-level DataFrame rather than taking it as a
# parameter. The spelling "aquired" is kept because they reference this name.
aquired_result = acquire()


def challenge_1_table():
    """Count respondents per country, job title and gender.

    Reads the module-level ``aquired_result`` DataFrame.

    Returns:
        pandas.DataFrame: columns ``Country``, ``Job Title``, ``Gender`` and
        ``Quantity``, one row per distinct combination of the first three.
    """
    # Keep only the three columns the table is built from
    columns = ['country_code', 'normalized_job_title', 'gender']
    final_raw = aquired_result[columns]

    # Rename headers
    col_dict = {'country_code': 'Country',
                'gender': 'Gender',
                'normalized_job_title': 'Job Title'}
    final = final_raw.rename(columns=col_dict)

    # Group by country, job and gender, and add a column with respective count
    final_grouped = final.value_counts(['Country', 'Job Title', 'Gender']).reset_index(name='Quantity')

    return final_grouped


def bonus_1_table():
    """Split the respondents by how they would vote on basic income.

    Reads the module-level ``aquired_result`` DataFrame. Rows whose vote is
    in neither list below appear in neither output.

    Returns:
        tuple: ``(infavour, against)``, two DataFrames holding the rows whose
        vote is in the "for" list and in the "against" list respectively.
    """
    vote_column = aquired_result['question_bbi_2016wave4_basicincome_vote']

    supporters_mask = vote_column.isin(['I would probably vote for it', 'I would vote for it'])
    opponents_mask = vote_column.isin(['I would vote against it', 'I would probably vote against it'])

    return aquired_result[supporters_mask], aquired_result[opponents_mask]


def bonus_2_table():
    """Return the education-level and top-skill columns of ``aquired_result``."""
    return aquired_result[['dem_education_level', 'top_skill']]

Unnamed: 0,Country,Job Title,Gender,Quantity
0,France,computer or data processing systems consultant,male,13
1,Germany,oracle database administrator oracle dba,male,13
2,Germany,data architect,male,12
3,Germany,geographic information systems data specialist...,male,11
4,France,director of data operations,male,11
...,...,...,...,...
2298,Great Britain,data typist,male,1
2299,Poland,medical data analyst,female,1
2300,Poland,log data technician,male,1
2301,Poland,lidar data analyst,female,1


## Analysis

In [None]:
import pandas as pd

from p_wrangling.m_wrangling import challenge_1_table
from p_wrangling.m_wrangling import bonus_1_table
from p_wrangling.m_wrangling import bonus_2_table


def challenge_1_analysis():
    """Add a ``Percentage`` column to the Challenge 1 table.

    Each row's percentage is its ``Quantity`` divided by the sum of all
    quantities, times 100, rounded to two decimals and rendered as a string
    ending in ``'%'``.

    Returns:
        pandas.DataFrame: the table from ``challenge_1_table`` with the extra
        ``Percentage`` column.
    """
    table = challenge_1_table()

    share = table['Quantity'] / table['Quantity'].sum() * 100
    table['Percentage'] = share.round(2).astype(str) + '%'

    return table


def _count_arguments_in_unique_answers(frame, column):
    """Sum the number of ' | '-separated arguments over the distinct answers of *column*."""
    return sum(len(answer.split(' | ')) for answer in frame[column].unique())


def bonus_1_analysis():
    """Build the 2x2 table of argument counts by vote.

    For each group returned by ``bonus_1_table`` (in favour, against) and
    each argument column (pro, con), the distinct answer strings are split on
    ``' | '`` and the resulting pieces are counted.

    NOTE(review): the count runs over *distinct answer strings*. An identical
    answer given by several respondents is counted once, while one argument
    appearing in several different combinations is counted once per
    combination. Confirm this is the intended metric.

    Returns:
        pandas.DataFrame: rows ``'In Favour'`` and ``'Against'``, columns
        ``'Number of Pro Arguments'`` and ``'Number of Con Arguments'``.
    """
    infavour, against = bonus_1_table()

    pro_column = 'question_bbi_2016wave4_basicincome_argumentsfor'
    con_column = 'question_bbi_2016wave4_basicincome_argumentsagainst'

    # Each list is ordered [in favour, against] to line up with the index below.
    counts = {
        'Number of Pro Arguments': [_count_arguments_in_unique_answers(infavour, pro_column),
                                    _count_arguments_in_unique_answers(against, pro_column)],
        'Number of Con Arguments': [_count_arguments_in_unique_answers(infavour, con_column),
                                    _count_arguments_in_unique_answers(against, con_column)],
    }

    bonus_1_df = pd.DataFrame(counts,
                              index=['In Favour', 'Against'],
                              columns=['Number of Pro Arguments', 'Number of Con Arguments'])

    return bonus_1_df


def bonus_2_analysis():
    """List the ten most frequent top skills for each education level.

    The rows from ``bonus_2_table`` are grouped by ``dem_education_level``;
    within each group the ``top_skill`` values are tallied and the ten with
    the highest counts are kept, most frequent first.

    Returns:
        pandas.DataFrame: columns ``dem_education_level`` and
        ``top_10_skills`` (a list of skill names per row).
    """
    from collections import Counter

    def ten_most_frequent(skills):
        # Tally the skills, then order the names by descending frequency.
        frequency = dict(Counter(skills))
        return sorted(frequency, key=frequency.get, reverse=True)[:10]

    # One row per education level, with all its skills collected in a list
    per_level = bonus_2_table().groupby('dem_education_level').agg(list).reset_index()

    per_level['top_10_skills'] = per_level['top_skill'].apply(ten_most_frequent)

    return per_level[['dem_education_level', 'top_10_skills']]

## Reporting

In [None]:
from p_analysis.m_analysis import challenge_1_analysis
from p_analysis.m_analysis import bonus_1_analysis
from p_analysis.m_analysis import bonus_2_analysis


def export_challenge():
    """Return the Challenge 1 table produced by ``challenge_1_analysis``."""
    return challenge_1_analysis()


def export_bonus_1():
    """Return the bonus 1 table produced by ``bonus_1_analysis``."""
    return bonus_1_analysis()


def export_bonus_2():
    """Return the bonus 2 table produced by ``bonus_2_analysis``."""
    return bonus_2_analysis()
