## Main

In [6]:
import argparse
from p_reporting.m_reporting import export_csv

# Country names accepted by the --country option (checked in main()).
COUNTRIES = [
    'Belgium',
    'Lithuania',
    'Portugal',
    'Bulgaria',
    'Spain',
    'Luxembourg',
    'Romania',
    'Czechia',
    'France',
    'Hungary',
    'Slovenia',
    'Denmark',
    'Croatia',
    'Malta',
    'Slovakia',
    'Germany',
    'Italy',
    'Netherlands',
    'Finland',
    'Estonia',
    'Cyprus',
    'Austria',
    'Sweden',
    'Ireland',
    'Latvia',
    'Poland',
    'Great Britain',
    'Greece',
]


def argument_parser():
    """Parse the command line and return the resulting namespace.

    A single optional string option is recognised, ``-c`` / ``--country``;
    it is exposed as ``.country`` and is ``None`` when not supplied.
    """
    cli = argparse.ArgumentParser(description='Obtain a full or country focused scope')

    # The only supported option: which country to create output for.
    cli.add_argument("-c", "--country", help="Create output for country X", type=str)

    return cli.parse_args()


def main(arguments):
    """Run the pipeline and write the report for one country or for all of them.

    Args:
        arguments: parsed command-line namespace; only ``arguments.country``
            is read. It must be one of COUNTRIES or the literal 'All'.

    Raises:
        ValueError: if ``arguments.country`` is neither in COUNTRIES nor 'All'
            (this includes the case where no country was given).

    Side effects:
        Writes ``data/results/<country>.csv`` or ``data/results/all.csv``.
    """
    print('Running pipeline...')

    # Use the namespace that was passed in instead of re-parsing the command line.
    country = arguments.country

    # Reject a bad choice before the pipeline is run.
    if country != 'All' and country not in COUNTRIES:
        raise ValueError("Please chose a valid country from the list, or alternatively, type 'All'")

    table = export_csv()

    if country == 'All':
        # index=False keeps this file consistent with the per-country output.
        table.to_csv('data/results/all.csv', index=False)
    else:
        # The file is named after the country itself, not the whole namespace.
        table.loc[table['Country'] == country].to_csv(f'data/results/{country}.csv', index=False)

    print('Pipeline complete! Check your output folder ')


# Script entry point: parse the command line once and hand the result to main().
if __name__ == '__main__':
    arguments = argument_parser()
    main(arguments)


Unnamed: 0,country_code,normalized_job_title,gender
0,Austria,Unemployed,male
1,Austria,Unemployed,male
2,Austria,Unemployed,female
3,Austria,Unemployed,female
4,Austria,Unemployed,male
...,...,...,...
9644,Italy,data capture clerk,male
9645,Poland,data capture clerk,male
9646,Poland,data capture clerk,male
9647,Portugal,data capture clerk,male


## Acquisition

In [7]:
def acquire():
    """Build the raw poll table: database rows joined with job skills and country names.

    Steps:
        1. Query the SQLite database for the country, career, personal and
           poll columns, joined on ``uuid``.
        2. For every distinct ``normalized_job_code``, request its related
           skills from the dataatwork API and keep the first listed skill
           as ``top_skill``.
        3. Scrape the Eurostat country-code glossary page to build an
           acronym -> country-name mapping.
        4. Merge the poll rows with the API results and replace the values
           of ``country_code`` using that mapping.

    Returns:
        pandas.DataFrame: the merged table, with missing values filled with
        the string 'Unemployed'.
    """
    # Imports are kept function-local, as in the original cell.
    from sqlalchemy import create_engine
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    # 1/3 Connection to the db
    sqlitedb_rel_path = '../data/processed/raw_data_project_m1.db'
    conn_str = f'sqlite:///{sqlitedb_rel_path}'
    engine = create_engine(conn_str)

    # Author's note: a few fields (e.g. gender) were cleaned up in dBeaver
    # before this query is used to make a DataFrame.
    sql_query = """ 
    SELECT country_info.uuid,
           country_info.country_code,
           career_info.dem_education_level,
           career_info.normalized_job_code, 
           personal_info.age, 
           personal_info.gender,
           poll_info.question_bbi_2016wave4_basicincome_vote,
           poll_info.question_bbi_2016wave4_basicincome_argumentsagainst,
           poll_info.question_bbi_2016wave4_basicincome_argumentsfor 
    FROM country_info
    JOIN career_info ON country_info.uuid = career_info.uuid
    JOIN personal_info ON personal_info.uuid = career_info.uuid
    JOIN poll_info ON poll_info.uuid = personal_info.uuid
    """

    # The query is executed once; the original ran it twice with identical results.
    poll_db = pd.read_sql_query(sql_query, engine)
    poll_job_code = poll_db['normalized_job_code'].unique().tolist()

    # 2/3 Extraction of the job description table using the API:
    # one request per distinct job code.
    url_list = [
        f'http://api.dataatwork.org/v1/jobs/{job_code}/related_skills'
        for job_code in poll_job_code
    ]
    responses = [requests.get(url).json() for url in url_list]
    api_skills = pd.DataFrame(responses)

    # Drop responses whose 'skills' value is null. A new frame is built
    # (with .copy()) so the column assignment below does not write to a view.
    api_skills = api_skills[api_skills['skills'].notnull()].copy()

    def get_max(x):
        # Author's note: the first skill is always the one with the highest
        # 'importance' value, so the first row is taken.
        top_skill = pd.json_normalize(x).iloc[0]['skill_name']
        return top_skill

    api_skills['top_skill'] = api_skills['skills'].apply(get_max)

    # 3/3 Scrape the country-code glossary
    url = 'https://ec.europa.eu/eurostat/statistics-explained/index.php/Glossary:Country_codes'
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')

    # Only the first table on the page is used.
    codes_table = soup.find_all('table')[0]
    items = [cell.text for cell in codes_table.find_all('td')]

    # Cells starting with '(' are treated as acronyms, all others as country names.
    # The slices strip the opening parenthesis and the trailing characters of each cell.
    countries = []
    acronyms = []
    for item in items:
        if item.startswith('('):
            acronyms.append(item[1:-2])
        else:
            countries.append(item[:-1])

    # Build the {acronym: country} mapping in the shape DataFrame.replace expects.
    raw_dict_df = pd.DataFrame({'countries': countries, 'acronyms': acronyms})
    countries_dict = raw_dict_df.set_index('acronyms')['countries'].to_dict()

    # Adjust the keys that differ from the scraped table: add 'GB' and 'GR',
    # and remove 'EL'. pop() with a default does not fail if 'EL' is absent.
    countries_dict['GB'] = 'Great Britain'
    countries_dict['GR'] = 'Greece'
    countries_dict.pop('EL', None)

    # Merge the poll rows with the API job list.
    poll_api_merge = poll_db.merge(api_skills, left_on='normalized_job_code', right_on='job_uuid')

    # Fill missing values, then map country codes to names via countries_dict.
    poll_api_merge_countries = poll_api_merge.fillna('Unemployed').replace({"country_code": countries_dict})

    return poll_api_merge_countries


In [8]:
# Run the acquisition step and keep the result in the notebook-global `table`,
# which make_table() and skills_table() below read.
table = acquire()
table

ValueError: Wrong number of items passed 9, placement implies 1

## Wrangling 

In [6]:
def make_table(source=None):
    """Count respondents per country, job title and gender.

    Args:
        source: DataFrame holding at least the columns 'country_code',
            'normalized_job_title' and 'gender'. When omitted, the
            notebook-global ``table`` is used, as in the original cell.

    Returns:
        pandas.DataFrame with columns 'Country', 'Job Title', 'Gender' and
        'Quantity', one row per distinct combination, ordered by descending
        'Quantity' (the order produced by ``value_counts``).
    """
    if source is None:
        # Fall back to the frame produced by the acquisition cell.
        source = table

    # Keep only the columns that are actually counted.
    columns = ['country_code', 'normalized_job_title', 'gender']

    # Rename headers to their report labels.
    col_dict = {'country_code': 'Country', 'gender': 'Gender', 'normalized_job_title': 'Job Title'}
    final = source[columns].rename(columns=col_dict)

    # Group by country, job and gender, and add a column with the respective count.
    final_grouped = final.value_counts(['Country', 'Job Title', 'Gender']).reset_index(name='Quantity')

    return final_grouped

# Build the grouped counts and display them.
one = make_table()
one

Unnamed: 0,Country,Job Title,Gender,Quantity
0,France,computer or data processing systems consultant,male,13
1,Germany,oracle database administrator oracle dba,male,13
2,Germany,data architect,male,12
3,Germany,geographic information systems data specialist...,male,11
4,France,director of data operations,male,11
...,...,...,...,...
2298,Great Britain,data typist,male,1
2299,Poland,medical data analyst,female,1
2300,Poland,log data technician,male,1
2301,Poland,lidar data analyst,female,1


In [25]:
def skills_table(source=None):
    """Return the education-level and top-skill columns of the poll table.

    Args:
        source: DataFrame holding at least the columns 'dem_education_level'
            and 'top_skill'. When omitted, the notebook-global ``table`` is
            used, as in the original cell.

    Returns:
        pandas.DataFrame restricted to those two columns, in that order.
    """
    if source is None:
        # Fall back to the frame produced by the acquisition cell.
        source = table

    columns = ['dem_education_level', 'top_skill']
    return source[columns]

# Extract the education/skill columns; `aaa` is the input of the groupby below.
aaa = skills_table()
aaa

Unnamed: 0,dem_education_level,top_skill
0,high,computers and electronics
1,high,computers and electronics
2,high,computers and electronics
3,medium,computers and electronics
4,high,computers and electronics
...,...,...
5105,high,clerical
5106,medium,clerical
5107,high,clerical
5108,no,clerical


In [40]:
# Collect all top_skill values into one list per education level.
bbb = aaa.groupby('dem_education_level').agg(list).reset_index()

In [41]:
from collections import Counter

def counter(x):
    """Tally how many times each element of ``x`` occurs."""
    tally = Counter(x)
    return tally

# Replace each list of skills with a Counter mapping skill -> occurrences.
# NOTE: this overwrites `top_skill` in place. If the column already holds plain
# strings (after the get_top step has run), re-running this cell counts the
# characters of each string instead of skills.
bbb['top_skill'] = bbb['top_skill'].apply(counter)

bbb





Unnamed: 0,dem_education_level,top_skill
0,Unemployed,"{'computers and electronics': 174, 'informatio..."
1,high,"{'computers and electronics': 1659, 'informati..."
2,low,"{'computers and electronics': 513, 'informatio..."
3,medium,"{'computers and electronics': 1458, 'informati..."
4,no,"{'information ordering': 3, 'computers and ele..."


In [46]:

# Inspect the skill tally of the row at position 3; column position 1 is `top_skill`.
# A single .iloc[row, col] lookup replaces the deprecated integer key on a
# label-indexed Series.
counter(bbb.iloc[3, 1])

Counter({'computers and electronics': 1458,
         'information ordering': 72,
         'clerical': 70,
         'geography': 119,
         'mathematical reasoning': 35,
         'english language': 16,
         'economics and accounting': 22,
         'production and processing': 11,
         'active listening': 2,
         'critical thinking': 27,
         'engineering and technology': 30,
         'reading comprehension': 4,
         'physics': 12,
         'administration and management': 12,
         'near vision': 6,
         'mechanical': 3,
         'customer and personal service': 4,
         'biology': 2,
         'telecommunications': 7})

In [39]:
def get_top(x):
    """Return the key of mapping ``x`` that has the largest value.

    When several keys share the largest value, the one met first while
    iterating over ``x`` is returned.
    """
    return max(x, key=lambda name: x.get(name))

# Collapse each per-level tally to its single most frequent skill.
# NOTE: overwrites `top_skill` in place, and get_top calls `.get` on each value,
# so the column must hold the Counters built in the counter step when this runs.
bbb['top_skill'] = bbb['top_skill'].apply(get_top)

bbb

Unnamed: 0,dem_education_level,top_skill
0,Unemployed,computers and electronics
1,high,computers and electronics
2,low,computers and electronics
3,medium,computers and electronics
4,no,computers and electronics


## Analysis

In [None]:
def analyse():
    """Append a 'Percentage' column to the grouped table from the wrangling step.

    Each row's percentage is its 'Quantity' as a share of the summed 'Quantity'
    column, times 100, rounded to two decimals and stored as text with a
    trailing '%'.

    Returns:
        The table returned by make_table(), with the extra 'Percentage' column.
    """
    # Bring in the grouped table from m_wrangling.
    from p_wrangling.m_wrangling import make_table

    grouped = make_table()

    overall = grouped['Quantity'].sum()
    share = ((grouped['Quantity'] / overall) * 100).round(2)
    grouped['Percentage'] = share.astype(str) + '%'

    return grouped

## Reporting

In [None]:

def export_csv():
    """Return the analysed table produced by ``analyse()``.

    Despite its name, this function writes no file itself; it only hands
    the table back to the caller.
    """
    # Bring in the analysis step from m_analysis.
    from p_analysis.m_analysis import analyse

    return analyse()