## Create a Spark session

In [14]:
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Environment PySpark reads when it starts up. The values are specific to the
# machine this notebook was written on; adjust them for another installation.
os.environ.update({
    "SPARK_HOME": "/usr/local/spark",
    "PYSPARK_PYTHON": "/home/pigidser/anaconda3/bin/python3",
    "PYSPARK_DRIVER_PYTHON": "python3",
    "PYSPARK_SUBMIT_ARGS": "pyspark-shell",
})

In [15]:
# Master URL handed to the session builder.
master = "local"

# Reuse the active session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .master(master)
    .appName("spark_test")
    .getOrCreate()
)

## RAW --> STAGE

In [262]:
! rm data/stage/*.*

In [263]:
# transform json-files (capital, continent, currency, iso3, names, phone) to csv

import json
import csv
import subprocess

def transform_json_to_csv(file_name):
    """Convert ``data/raw/<file_name>.json`` to ``data/stage/<file_name>.csv``.

    The JSON document is read as a single object and every key/value pair is
    written as one two-column row, in document order and without a header row.

    Rows go through ``csv.writer`` so that a value containing the delimiter, a
    quote or a line break is quoted instead of silently producing a malformed
    row. A ``None`` value is written as an empty field.

    Args:
        file_name: Base name of the file, without directory or extension.

    Raises:
        FileNotFoundError: If the raw file or the stage directory is missing.
        json.JSONDecodeError: If the raw file is not valid JSON.
    """
    with open("data/raw/" + file_name + ".json", "r", encoding="utf-8") as fp:
        data_dict = json.load(fp)

    # newline="" lets the csv module control line endings; "\n" keeps the
    # same row terminator the stage files have always had.
    with open("data/stage/" + file_name + ".csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerows(data_dict.items())

# Base names of the raw JSON files that are converted to CSV in the stage folder.
files_to_process = [
    'capital',
    'continent',
    'currency',
    'iso3',
    'names',
    'phone',
]

for file in files_to_process:
    transform_json_to_csv(file)

In [307]:
# countries_of_the_world.csv uses a comma both as the decimal separator inside quoted numbers and as the field delimiter --> replace the decimal comma with a point
# also collapse runs of whitespace into a single space

import re

def _clean_csv_lines(source_path, target_path, substitutions):
    """Copy a text file line by line, applying regex substitutions to each line.

    Args:
        source_path: File to read.
        target_path: File to write (overwritten if it exists).
        substitutions: Iterable of ``(pattern, replacement)`` pairs, applied to
            every line in the given order with ``re.sub``.
    """
    # Both files are closed by the ``with`` statement; no explicit close needed.
    with open(source_path, "r") as f_input, open(target_path, "w") as f_output:
        for result in f_input:
            for pattern, replacement in substitutions:
                result = re.sub(pattern, replacement, result)
            # The whitespace collapse turns the line's own newline into a
            # space; strip it so the last field is not padded (and an empty
            # last field stays empty) before the newline is written back.
            f_output.write(result.rstrip() + "\n")


_clean_csv_lines(
    "data/raw/countries_of_the_world.csv",
    "data/stage/country.csv",
    [
        # quoted decimal with a comma, optionally negative: "12,5" / "-4,93" -> 12.5 / -4.93
        (r'"(-?\d+),(\d+)"', r'\1.\2'),
        (r'\s+', r' '),      # replace multi-space with a single space
        (r'\s,', r','),      # no whitespace before a delimiter
        (r'\s+",', r'",'),   # no whitespace before a closing quote
        (r'"\s+', r'"'),     # no whitespace after a quote
    ],
)

_clean_csv_lines(
    "data/raw/nobel_laureates.csv",
    "data/stage/nobel_laureates.csv",
    [
        (r'[\"]+', '"'),     # collapse repeated double quotes into one
        (r'\s+', r' '),      # replace multi-space with a single space
        (r'\s,', r','),      # no whitespace before a delimiter
    ],
)

<function TextIOWrapper.close()>

In [265]:
# worldcitiespop.csv is not transformed: copy it unchanged from the raw folder to the stage folder
! cp data/raw/worldcitiespop.csv data/stage/

Now we have the following CSV files in the stage folder:
- capital
- continent
- currency
- iso3
- names
- phone
- country
- nobel_laureates
- worldcitiespop

## Load data to Spark

In [438]:
def _read_stage_csv(file_name, **reader_options):
    """Read ``data/stage/<file_name>`` with ``spark.read.csv``, forwarding the options."""
    return spark.read.csv("data/stage/" + file_name, **reader_options)


# Two-column lookup files: no header row, so the schema is given explicitly.
capital = _read_stage_csv("capital.csv", schema="code string, capital string", header=False)
continent = _read_stage_csv("continent.csv", schema="code string, continent string", header=False)
currency = _read_stage_csv("currency.csv", schema="code string, currency string", header=False)
iso3 = _read_stage_csv("iso3.csv", schema="code string, iso3 string", header=False)
names = _read_stage_csv("names.csv", schema="code string, name string", header=False)
phone = _read_stage_csv("phone.csv", schema="code string, phone string", header=False)

# Files whose first row holds the column names.
countries = _read_stage_csv("country.csv", header=True)
laureates = _read_stage_csv("nobel_laureates.csv", header=True)
worldcitiespop = _read_stage_csv("worldcitiespop.csv", header=True)

In [439]:
import re

def standartize_column_name(rename_df):
    """Return ``rename_df`` with every column renamed to a normalised form.

    A name is lower-cased and trimmed; each run of dots, whitespace,
    parentheses, slashes, ``$`` and ``%`` becomes an underscore; repeated
    underscores are collapsed; and an underscore sitting at a word boundary
    (for example a trailing one, as in ``abcdefgh_``) is removed.

    Args:
        rename_df: Object exposing ``columns`` and ``withColumnRenamed``.

    Returns:
        The result of applying ``withColumnRenamed`` once per original column.
    """
    def _normalise(raw_name):
        cleaned = raw_name.lower().strip()
        cleaned = re.sub(r'[\.\s\(\)\/\$\%]+', '_', cleaned)
        cleaned = re.sub(r'[\_]+', '_', cleaned)
        return re.sub(r'(\S+)_\b', r'\1', cleaned)

    renamed = rename_df
    # Iterate over the names of the frame as it was passed in.
    for original_name in rename_df.columns:
        renamed = renamed.withColumnRenamed(original_name, _normalise(original_name))
    return renamed

In [None]:
# Normalise the column names of the frames that were loaded with header=True.
# country_properties is deliberately not handled here: it is first assigned in
# the "Data Mart preparation" cell below (so referencing it here raises
# NameError on a fresh kernel), and its columns come from the explicit
# lower-case schemas used when the lookup files were loaded.
countries = standartize_column_name(countries)
laureates = standartize_column_name(laureates)
worldcitiespop = standartize_column_name(worldcitiespop)

## Data Mart preparation

In [441]:
# Chain the lookup frames on their shared 'code' column; joining by column
# name (rather than by an expression) keeps a single 'code' column in the result.
country_properties = (
    capital
    .join(continent, 'code')
    .join(currency, 'code')
    .join(iso3, 'code')
    .join(names, 'code')
    .join(phone, 'code')
)

# Attach the properties to every country row by matching the country name
# (left join), then drop the now redundant 'name' column.
countries = (
    countries
    .join(
        country_properties.select('code', 'capital', 'continent', 'currency', 'iso3', 'name', 'phone'),
        on=countries['country'] == country_properties['name'],
        how='left',
    )
    .drop("name")
)

# Person/organisation attributes taken from the laureates frame.
nobel_laureates = laureates.select(
    'laureate_id', 'category', 'laureate_type', 'full_name',
    'birth_date', 'birth_city', 'birth_country', 'sex',
    'organization_name', 'organization_city', 'organization_country',
    'death_date', 'death_city', 'death_country',
)

# Prize attributes, linked back to the laureate through laureate_id.
nobel_prizes = laureates.select(
    "year", "prize", "motivation", "prize_share", "laureate_id",
)

cities = worldcitiespop.drop("region")

# Preview one row of each prepared table.
for _frame in (nobel_prizes, nobel_laureates, countries, cities):
    _frame.show(1)

## Dataset preparation

In [None]:
# Combine prize and laureate attributes on laureate_id.
# NOTE(review): both inputs are plain selections from `laureates`, so a
# laureate_id that appears on several rows is matched against each of them and
# the join multiplies those rows - confirm this is intended.
nobel_prizes_dataset = nobel_prizes.join(nobel_laureates, on='laureate_id')

# The countries dataset is the prepared countries table as-is.
countries_dataset = countries

# Preview one row of each dataset.
for _dataset in (nobel_prizes_dataset, countries_dataset):
    _dataset.show(1)

In [461]:
# Print a single row of `countries` to inspect its columns.
countries.show(1)


+-------+------------------+----------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+--------+---------------+------+-----+-----+-------+---------+---------+-----------+--------+-------+----+---------+---------+--------+----+-----+
|country|            region|population|area_sq_mi|pop_density_per_sq_mi|coastline_coast_area_ratio|net_migration|infant_mortality_per_1000_births|gdp_per_capita|literacy|phones_per_1000|arable|crops|other|climate|birthrate|deathrate|agriculture|industry|service|code|  capital|continent|currency|iso3|phone|
+-------+------------------+----------+----------+---------------------+--------------------------+-------------+--------------------------------+--------------+--------+---------------+------+-----+-----+-------+---------+---------+-----------+--------+-------+----+---------+---------+--------+----+-----+
|   Chad|SUB-SAHARAN AFRICA|   9944201|   1284000|                  7.7|    