<a href="https://colab.research.google.com/github/nhatduong01/SQL_on_BigQuery/blob/main/Basis/World_Bank_International_Education.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Working with the `international_education` table in the `world_bank_intl_education` dataset

In [1]:
# Run the Colab authentication flow for the current user before any
# BigQuery client is created further down in the notebook.
from google.colab import auth
auth.authenticate_user()
# Only reached once authenticate_user() has returned without raising.
print('Authenticated')

Authenticated


In [2]:
# Set up the project environment.
# The project ID is spelled in lowercase so that it matches the `project=`
# argument passed to every `client.query(...)` call in this notebook;
# Google Cloud project IDs only contain lowercase letters, digits and hyphens.
import os

os.environ["GCLOUD_PROJECT"] = "bigquerysqlexercises"

In [4]:
from google.cloud import bigquery
import pandas as pd

# Client used for every BigQuery call in this notebook. No project is passed
# here, so the library resolves the default project on its own.
client = bigquery.Client()

# Reference to the public World Bank education dataset. It is built directly
# with DatasetReference because Client.dataset() is deprecated in
# google-cloud-bigquery; the resulting object still offers .table(...).
data_ref = bigquery.DatasetReference(project="bigquery-public-data",
                                     dataset_id="world_bank_intl_education")
dataset = client.get_dataset(data_ref)

# Materialise the listing into a list so it can be inspected again later.
tables = list(client.list_tables(dataset))

# Print the ID of every table found in the dataset.
for table in tables:
    print(table.table_id)

country_series_definitions
country_summary
international_education
series_summary


In [5]:
# Fetch the table: build a reference to `international_education` inside the
# dataset reference created earlier, then ask the client for the table object.
table_ref = data_ref.table('international_education')

table = client.get_table(table_ref)
# Print table schema (last expression of the cell, so the notebook displays it)
table.schema

[SchemaField('country_name', 'STRING', 'NULLABLE', '', ()),
 SchemaField('country_code', 'STRING', 'NULLABLE', '', ()),
 SchemaField('indicator_name', 'STRING', 'NULLABLE', '', ()),
 SchemaField('indicator_code', 'STRING', 'NULLABLE', '', ()),
 SchemaField('value', 'FLOAT', 'NULLABLE', '', ()),
 SchemaField('year', 'INTEGER', 'NULLABLE', '', ())]

In [None]:
# Preview the table: request at most 10 rows and show them as a DataFrame.
preview_rows = client.list_rows(table, max_results=10)
preview_rows.to_dataframe()

Unnamed: 0,country_name,country_code,indicator_name,indicator_code,value,year
0,Djibouti,DJI,"Enrolment in secondary vocational, female (num...",SE.SEC.ENRL.VO.FE,2217.0,2016
1,Djibouti,DJI,"Gross graduation ratio from primary education,...",SE.PRM.CMPL.ZS,46.081841,2016
2,Dominica,DMA,Population of the official age for the last gr...,SP.PRM.GRAD.MA,470.0,2016
3,Pakistan,PAK,Internet users (per 100 people),IT.NET.USER.P2,15.514558,2016
4,Liberia,LBR,SABER: (Tertiary Education) Policy Goal 4: Fin...,SABER.TER.GOAL4,1.0,2017
5,Libya,LBY,Population growth (annual %),SP.POP.GROW,0.930675,2016
6,Suriname,SUR,Population of the official age for pre-primary...,SP.PRE.TOTL.FE.IN,9249.0,2016
7,Suriname,SUR,SABER: (Education Management Information Syste...,SABER.EMIS.GOAL1.LVL2,2.0,2016
8,Tajikistan,TJK,"Enrolment in primary education, female (number)",SE.PRM.ENRL.FE,346823.0,2016
9,Tajikistan,TJK,GDP per capita (constant 2005 US$),NY.GDP.PCAP.KD,968.385603,2016


### 1) Government Expenditures on Education
Which countries spend the largest fraction of GDP on education?
The `SE.XPD.TOTL.GD.ZS` indicator in the table gives government expenditure on education as a percentage of GDP. We need a query that returns the `AVG()` of this value for each country over 2010–2017.

In [6]:
# Average value of the SE.XPD.TOTL.GD.ZS indicator per country over the years
# 2010-2017 (both bounds included), sorted from the highest average down.
query = """
        SELECT country_name, AVG(value) AS Amount
        FROM `bigquery-public-data.world_bank_intl_education.international_education`
        WHERE indicator_code = 'SE.XPD.TOTL.GD.ZS'
          AND year BETWEEN 2010 AND 2017
        GROUP BY country_name
        ORDER BY Amount DESC
        """
# The query is run under the notebook's own project; the table itself lives
# in the `bigquery-public-data` project named inside the SQL.
query_job = client.query(query=query, project='bigquerysqlexercises')

GDP_expen = query_job.to_dataframe()
# End on the bare expression so the notebook renders the DataFrame as a table
# instead of the plain-text dump that print() would produce.
GDP_expen

                 country_name     Amount
0                        Cuba  12.837270
1       Micronesia, Fed. Sts.  12.467750
2             Solomon Islands  10.001080
3                     Moldova   8.372153
4                     Namibia   8.349610
..                        ...        ...
152                  Cambodia   1.706404
153        West Bank and Gaza   1.503760
154               South Sudan   1.409726
155                    Monaco   1.409606
156  Central African Republic   1.214010

[157 rows x 2 columns]


In [None]:
# Look up Vietnam's row in the ranking; the row index shows its position.
is_vietnam = GDP_expen['country_name'] == 'Vietnam'
GDP_expen.loc[is_vietnam]

Unnamed: 0,country_name,Amount
49,Vietnam,5.282447


Collect all the information about Vietnam in the data

In [None]:
# Pull every row of the table whose country_name is 'Vietnam' into a DataFrame.
query2 = """
         SELECT *
         FROM `bigquery-public-data.world_bank_intl_education.international_education`
         WHERE country_name = 'Vietnam'"""

# Submit the query under the notebook's project, then download the result.
query_job2 = client.query(query2, project='bigquerysqlexercises')
Vietnam_info = query_job2.to_dataframe()

In [15]:
# Display the Vietnam rows fetched by the previous query.
Vietnam_info

Unnamed: 0,country_name,country_code,indicator_name,indicator_code,value,year
0,Vietnam,VNM,Official entrance age to pre-primary education...,UIS.THAGE.0,3.000000e+00,2016
1,Vietnam,VNM,"GDP, PPP (constant 2011 international $)",NY.GDP.MKTP.PP.KD,5.520591e+11,2016
2,Vietnam,VNM,"Population, ages 0-14, male",SP.POP.0014.MA.IN,1.117732e+07,2016
3,Vietnam,VNM,Population of the official entrance age to sec...,UIS.SAP.23.GPV.G1.F,6.437930e+05,2016
4,Vietnam,VNM,GNI (current US$),NY.GNP.MKTP.CD,1.969146e+11,2016
...,...,...,...,...,...,...
23300,Vietnam,VNM,Wittgenstein Projection: Percentage of the pop...,PRJ.ATT.1519.2.FE,4.200000e-01,2045
23301,Vietnam,VNM,Wittgenstein Projection: Population age 15-19 ...,PRJ.POP.1519.4.MF,1.191000e+01,2045
23302,Vietnam,VNM,Wittgenstein Projection: Percentage of the pop...,PRJ.ATT.2064.NED.MF,2.000000e-02,2045
23303,Vietnam,VNM,Wittgenstein Projection: Population age 25-29 ...,PRJ.POP.2529.NED.MA,1.484000e+01,2045


### 2) Most interesting code
Return the most used `indicator_code` values of the dataset in 2016.  
_Only codes used at least 200 times are kept._

In [13]:
# Count how often each (indicator_name, indicator_code) pair occurs in 2016,
# keep the pairs counted at least 200 times, most frequent first.
query3 = """
         SELECT indicator_name, indicator_code, COUNT(indicator_name) AS Times_Used
         FROM `bigquery-public-data.world_bank_intl_education.international_education`
         WHERE year = 2016
         GROUP BY indicator_name, indicator_code
         HAVING Times_Used >= 200
         ORDER BY Times_Used DESC """

# Submit the query under the notebook's project.
query_job3 = client.query(query3, project='bigquerysqlexercises')

# Download the result; the trailing bare name makes the notebook render it.
most_used = query_job3.to_dataframe()
most_used

Unnamed: 0,indicator_name,indicator_code,Times_Used
0,"Population, total",SP.POP.TOTL,232
1,Population growth (annual %),SP.POP.GROW,232
2,Internet users (per 100 people),IT.NET.USER.P2,223
3,"Population, ages 0-14 (% of total)",SP.POP.0014.TO.ZS,213
4,"Population, male",SP.POP.TOTL.MA.IN,213
5,"Population, ages 15-64 (% of total)",SP.POP.1564.TO.ZS,213
6,"Population, ages 15-64, total",SP.POP.1564.TO,213
7,"Population, female",SP.POP.TOTL.FE.IN,213
8,"Population, ages 15-64, male",SP.POP.1564.MA.IN,213
9,"Mortality rate, under-5 (per 1,000)",SH.DYN.MORT,213


In [19]:
# Restrict the Vietnam rows to those whose `year` column equals 2020.
rows_for_2020 = Vietnam_info['year'] == 2020
Vietnam_info[rows_for_2020]

Unnamed: 0,country_name,country_code,indicator_name,indicator_code,value,year
10829,Vietnam,VNM,Wittgenstein Projection: Percentage of the pop...,PRJ.ATT.1519.1.MA,0.18,2020
10830,Vietnam,VNM,Wittgenstein Projection: Population age 15-19 ...,PRJ.POP.1519.4.MF,10.23,2020
10831,Vietnam,VNM,Wittgenstein Projection: Population age 20-24 ...,PRJ.POP.2024.4.FE,354.13,2020
10832,Vietnam,VNM,Wittgenstein Projection: Percentage of the pop...,PRJ.ATT.60UP.1.MF,0.27,2020
10833,Vietnam,VNM,Wittgenstein Projection: Population age 20-24 ...,PRJ.POP.2024.NED.FE,76.71,2020
...,...,...,...,...,...,...
22573,Vietnam,VNM,Wittgenstein Projection: Percentage of the pop...,PRJ.ATT.2039.3.MF,0.21,2020
22574,Vietnam,VNM,Wittgenstein Projection: Percentage of the pop...,PRJ.ATT.25UP.NED.FE,0.06,2020
22575,Vietnam,VNM,Wittgenstein Projection: Percentage of the pop...,PRJ.ATT.4064.2.MA,0.34,2020
22576,Vietnam,VNM,Wittgenstein Projection: Population age 20-24 ...,PRJ.POP.2024.NED.MF,131.00,2020
