In [1]:
import findspark
findspark.init("/home/ubuntu/spark-2.1.1-bin-hadoop2.7")
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BDAS-Linda').getOrCreate()

from functools import reduce
from pyspark.sql import DataFrame


In [3]:
# import Maori language in education datasets
filename1 = "./Maori-Language-Learning-Student-Numbers-by-Ethnicity-2004-2008.csv"
filename2 = "./Maori-Language-Learning-Student-Numbers-by-Ethnicity-2009-2013.csv"
filename3 = "./Maori-Language-Learning-Student-Numbers-by-Ethnicity-2014-2018.csv"
filename4 = "./Maori-Language-Learning-Student-Numbers-by-Ethnicity-2019.csv"
MLLSN1=spark.read.csv(filename1, inferSchema=True, header="true")
MLLSN2=spark.read.csv(filename2, inferSchema=True, header="true")
MLLSN3=spark.read.csv(filename3, inferSchema=True, header="true")
MLLSN4=spark.read.csv(filename4, inferSchema=True, header="true")

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

MLLSN=unionAll(MLLSN1,MLLSN2,MLLSN3,MLLSN4)
MLLSN.columns

['Students Number',
 'Year: As at 1 July',
 'Student: Ethnicity',
 'Student: Year level',
 'Student: Year level (Grouped)',
 'Student: Maori Language Immersion Level',
 'Student: Maori Language in Education level',
 'School: ID',
 'School: Maori Language Descriptor',
 'School: Medium',
 'School: Highest Maori Language Immersion Level',
 'School: Name',
 'School: Sector',
 'School: Type',
 'School: Authority',
 'School: Decile',
 'School: Gender',
 'School: Affiliation Type',
 'School: Affiliation',
 'School: Definition',
 'School: Kura Type',
 'Region: Education Region',
 'Region: Regional Council',
 'Region: Territorial Authority',
 'Region: TA with Auckland wards',
 'Region: TA with Auckland local boards',
 'Region: MOE Local office',
 'Region: Census area unit',
 'Region: Ward']

In [4]:
# import population data
filename = "./Subnational-ethnic-population-projections-2013base.csv"
POP=spark.read.csv(filename, inferSchema=True, header="true")
POP.columns

['TA with Auckland Local Board',
 'Year at 30 June',
 'Euro-all-ages',
 'Euro-5-9',
 'Euro-10-14',
 'Euro-15-19',
 'Maori-all-ages',
 'Maori-5-9',
 'Maori-10-14',
 'Maori-15-19',
 'Asian-all-ages',
 'Asian-5-9',
 'Asian-10-14',
 'Asian-15-19',
 'Pacific-all-ages',
 'Pacific-5-9',
 'Pacific-10-14',
 'Pacific-15-19',
 'Total-all-ages',
 'Total-5-9',
 'Total-10-14',
 'Total-15-19']

In [5]:
MLLSN.count()

624768

In [6]:
MLLSN.printSchema()

root
 |-- Students Number: integer (nullable = true)
 |-- Year: As at 1 July: integer (nullable = true)
 |-- Student: Ethnicity: string (nullable = true)
 |-- Student: Year level: string (nullable = true)
 |-- Student: Year level (Grouped): string (nullable = true)
 |-- Student: Maori Language Immersion Level: string (nullable = true)
 |-- Student: Maori Language in Education level: string (nullable = true)
 |-- School: ID: integer (nullable = true)
 |-- School: Maori Language Descriptor: string (nullable = true)
 |-- School: Medium: string (nullable = true)
 |-- School: Highest Maori Language Immersion Level: string (nullable = true)
 |-- School: Name: string (nullable = true)
 |-- School: Sector: string (nullable = true)
 |-- School: Type: string (nullable = true)
 |-- School: Authority: string (nullable = true)
 |-- School: Decile: string (nullable = true)
 |-- School: Gender: string (nullable = true)
 |-- School: Affiliation Type: string (nullable = true)
 |-- School: Affiliation: 

In [7]:
POP.count()

528

In [8]:
POP.printSchema()

root
 |-- TA with Auckland Local Board: string (nullable = true)
 |-- Year at 30 June: integer (nullable = true)
 |-- Euro-all-ages: integer (nullable = true)
 |-- Euro-5-9: integer (nullable = true)
 |-- Euro-10-14: integer (nullable = true)
 |-- Euro-15-19: integer (nullable = true)
 |-- Maori-all-ages: integer (nullable = true)
 |-- Maori-5-9: integer (nullable = true)
 |-- Maori-10-14: integer (nullable = true)
 |-- Maori-15-19: integer (nullable = true)
 |-- Asian-all-ages: integer (nullable = true)
 |-- Asian-5-9: integer (nullable = true)
 |-- Asian-10-14: integer (nullable = true)
 |-- Asian-15-19: integer (nullable = true)
 |-- Pacific-all-ages: integer (nullable = true)
 |-- Pacific-5-9: integer (nullable = true)
 |-- Pacific-10-14: integer (nullable = true)
 |-- Pacific-15-19: integer (nullable = true)
 |-- Total-all-ages: integer (nullable = true)
 |-- Total-5-9: integer (nullable = true)
 |-- Total-10-14: integer (nullable = true)
 |-- Total-15-19: integer (nullable = true

In [9]:
MLLSN.show()

+---------------+------------------+------------------+-------------------+-----------------------------+---------------------------------------+------------------------------------------+----------+---------------------------------+--------------------+----------------------------------------------+--------------------+--------------+--------------------+--------------------+--------------+--------------+------------------------+-------------------+------------------+-----------------+------------------------+------------------------+-----------------------------+------------------------------+-------------------------------------+------------------------+------------------------+--------------------+
|Students Number|Year: As at 1 July|Student: Ethnicity|Student: Year level|Student: Year level (Grouped)|Student: Maori Language Immersion Level|Student: Maori Language in Education level|School: ID|School: Maori Language Descriptor|      School: Medium|School: Highest Maori Language Imm

In [10]:
#check unique values in the key attributes
MLLSN.select("Student: Ethnicity").distinct().show()
MLLSN.select("Student: Year level (Grouped)").distinct().show()

+------------------+
|Student: Ethnicity|
+------------------+
|             Maori|
|         Non Maori|
+------------------+

+-----------------------------+
|Student: Year level (Grouped)|
+-----------------------------+
|          Secondary (Year 9+)|
|           Primary (Year 1-8)|
+-----------------------------+



In [11]:
MLLSN.select("Student: Maori Language Immersion Level").distinct().show()
MLLSN.select("Student: Maori Language in Education level").distinct().show()

+---------------------------------------+
|Student: Maori Language Immersion Level|
+---------------------------------------+
|                   Level 4(b): At le...|
|                   Level 4(a): up to...|
|                   Level 5: Less tha...|
|                        Level 3: 31-50%|
|                   No Maori language...|
|                       Level 1: 81-100%|
|                    Level 6: Taha Maori|
|                        Level 2: 51-80%|
+---------------------------------------+

+------------------------------------------+
|Student: Maori Language in Education level|
+------------------------------------------+
|                      Maori Language in...|
|                      No Maori language...|
|                              Maori medium|
+------------------------------------------+



In [12]:
MLLSN.select('School: Type').distinct().show()
MLLSN.select('School: Sector').distinct().show()

+--------------------+
|        School: Type|
+--------------------+
|Kura Teina - Primary|
|Restricted Compos...|
|      Special School|
|Correspondence Sc...|
|        Full Primary|
|        Intermediate|
|    Teen Parent Unit|
|Secondary (Year 9...|
|Kura Teina - Comp...|
|        Contributing|
|Secondary (Year 7...|
|Composite (Year 1...|
+--------------------+

+--------------+
|School: Sector|
+--------------+
|     Secondary|
|       Special|
|     Composite|
|       Primary|
+--------------+



In [13]:
POP.show()

+----------------------------+---------------+-------------+--------+----------+----------+--------------+---------+-----------+-----------+--------------+---------+-----------+-----------+----------------+-----------+-------------+-------------+--------------+---------+-----------+-----------+
|TA with Auckland Local Board|Year at 30 June|Euro-all-ages|Euro-5-9|Euro-10-14|Euro-15-19|Maori-all-ages|Maori-5-9|Maori-10-14|Maori-15-19|Asian-all-ages|Asian-5-9|Asian-10-14|Asian-15-19|Pacific-all-ages|Pacific-5-9|Pacific-10-14|Pacific-15-19|Total-all-ages|Total-5-9|Total-10-14|Total-15-19|
+----------------------------+---------------+-------------+--------+----------+----------+--------------+---------+-----------+-----------+--------------+---------+-----------+-----------+----------------+-----------+-------------+-------------+--------------+---------+-----------+-----------+
|          Far North district|           2013|        40800|    2750|      2770|      2420|         27100|     2

In [14]:
POP.select('Year at 30 June').distinct().show()

+---------------+
|Year at 30 June|
+---------------+
|           2018|
|           2023|
|           2013|
|           2038|
|           2033|
|           2028|
+---------------+



In [15]:
from pyspark.sql.functions import isnan, when, count, col

MLLSN.select([count(when(isnan(c),c)).alias(c) for c in MLLSN.columns]).show()

+---------------+------------------+------------------+-------------------+-----------------------------+---------------------------------------+------------------------------------------+----------+---------------------------------+--------------+----------------------------------------------+------------+--------------+------------+-----------------+--------------+--------------+------------------------+-------------------+------------------+-----------------+------------------------+------------------------+-----------------------------+------------------------------+-------------------------------------+------------------------+------------------------+------------+
|Students Number|Year: As at 1 July|Student: Ethnicity|Student: Year level|Student: Year level (Grouped)|Student: Maori Language Immersion Level|Student: Maori Language in Education level|School: ID|School: Maori Language Descriptor|School: Medium|School: Highest Maori Language Immersion Level|School: Name|School: Secto

In [16]:
MLLSN.filter(MLLSN['Students Number']<=0).show()

+---------------+------------------+------------------+-------------------+-----------------------------+---------------------------------------+------------------------------------------+----------+---------------------------------+--------------------+----------------------------------------------+--------------------+--------------+--------------------+--------------------+--------------+---------------+------------------------+-------------------+--------------------+--------------------+------------------------+------------------------+-----------------------------+------------------------------+-------------------------------------+------------------------+------------------------+--------------------+
|Students Number|Year: As at 1 July|Student: Ethnicity|Student: Year level|Student: Year level (Grouped)|Student: Maori Language Immersion Level|Student: Maori Language in Education level|School: ID|School: Maori Language Descriptor|      School: Medium|School: Highest Maori Langua

In [17]:
POP.select([count(when(isnan(c),c)).alias(c) for c in POP.columns]).show()

+----------------------------+---------------+-------------+--------+----------+----------+--------------+---------+-----------+-----------+--------------+---------+-----------+-----------+----------------+-----------+-------------+-------------+--------------+---------+-----------+-----------+
|TA with Auckland Local Board|Year at 30 June|Euro-all-ages|Euro-5-9|Euro-10-14|Euro-15-19|Maori-all-ages|Maori-5-9|Maori-10-14|Maori-15-19|Asian-all-ages|Asian-5-9|Asian-10-14|Asian-15-19|Pacific-all-ages|Pacific-5-9|Pacific-10-14|Pacific-15-19|Total-all-ages|Total-5-9|Total-10-14|Total-15-19|
+----------------------------+---------------+-------------+--------+----------+----------+--------------+---------+-----------+-----------+--------------+---------+-----------+-----------+----------------+-----------+-------------+-------------+--------------+---------+-----------+-----------+
|                           0|              0|            0|       0|         0|         0|             0|      

In [18]:
# exclude irrelevant attributes in Māori language data 
cols=MLLSN.columns
cols.remove('School: Name')
cols.remove('School: Gender')
cols.remove('School: Affiliation Type')
cols.remove('School: Affiliation')
cols.remove('School: Definition')
cols.remove('Region: Census area unit')

In [19]:
# exclude related attributes in Māori language data
cols.remove('Student: Year level')
cols.remove('School: Type')
cols.remove('Region: Regional Council')
cols.remove('Region: Education Region')
cols.remove('Region: Territorial Authority')
cols.remove('Region: Ward')
cols.remove('Region: TA with Auckland wards')
cols.remove('Region: MOE Local office')
MLLSN2=MLLSN[cols]

In [20]:
MLLSN2.dtypes

[('Students Number', 'int'),
 ('Year: As at 1 July', 'int'),
 ('Student: Ethnicity', 'string'),
 ('Student: Year level (Grouped)', 'string'),
 ('Student: Maori Language Immersion Level', 'string'),
 ('Student: Maori Language in Education level', 'string'),
 ('School: ID', 'int'),
 ('School: Maori Language Descriptor', 'string'),
 ('School: Medium', 'string'),
 ('School: Highest Maori Language Immersion Level', 'string'),
 ('School: Sector', 'string'),
 ('School: Authority', 'string'),
 ('School: Decile', 'string'),
 ('School: Kura Type', 'string'),
 ('Region: TA with Auckland local boards', 'string')]

In [21]:
# exclude irrelevant attributes in population data
cols2=POP.columns
cols2.remove('Euro-all-ages')
cols2.remove('Euro-5-9')
cols2.remove('Euro-10-14')
cols2.remove('Euro-15-19')
cols2.remove('Asian-all-ages')
cols2.remove('Asian-5-9')
cols2.remove('Asian-10-14')
cols2.remove('Asian-15-19')
cols2.remove('Pacific-all-ages')
cols2.remove('Pacific-5-9')
cols2.remove('Pacific-10-14')
cols2.remove('Pacific-15-19')
POP2=POP[cols2]

In [23]:
POP2.dtypes

[('TA with Auckland Local Board', 'string'),
 ('Year at 30 June', 'int'),
 ('Maori-all-ages', 'int'),
 ('Maori-5-9', 'int'),
 ('Maori-10-14', 'int'),
 ('Maori-15-19', 'int'),
 ('Total-all-ages', 'int'),
 ('Total-5-9', 'int'),
 ('Total-10-14', 'int'),
 ('Total-15-19', 'int')]

In [24]:
MLLSN2=MLLSN2.filter(MLLSN2["Students Number"]>0)

In [25]:
MLLSN2.count()

624710

In [26]:
MLLSN2.filter(MLLSN2["Students Number"]<=0).show()

+---------------+------------------+------------------+-----------------------------+---------------------------------------+------------------------------------------+----------+---------------------------------+--------------+----------------------------------------------+--------------+-----------------+--------------+-----------------+-------------------------------------+
|Students Number|Year: As at 1 July|Student: Ethnicity|Student: Year level (Grouped)|Student: Maori Language Immersion Level|Student: Maori Language in Education level|School: ID|School: Maori Language Descriptor|School: Medium|School: Highest Maori Language Immersion Level|School: Sector|School: Authority|School: Decile|School: Kura Type|Region: TA with Auckland local boards|
+---------------+------------------+------------------+-----------------------------+---------------------------------------+------------------------------------------+----------+---------------------------------+--------------+------------

In [34]:
# change data type for Students Number column 
from pyspark.sql.types import StringType

MLLSN2 = MLLSN2.withColumn("School: ID", MLLSN2["School: ID"].cast(StringType()))
MLLSN2 = MLLSN2.withColumn("Year: As at 1 July", MLLSN2["Year: As at 1 July"].cast(StringType()))


In [36]:
MLLSN2.dtypes

[('Students Number', 'int'),
 ('Year: As at 1 July', 'string'),
 ('Student: Ethnicity', 'string'),
 ('Student: Year level (Grouped)', 'string'),
 ('Student: Maori Language Immersion Level', 'string'),
 ('Student: Maori Language in Education level', 'string'),
 ('School: ID', 'string'),
 ('School: Maori Language Descriptor', 'string'),
 ('School: Medium', 'string'),
 ('School: Highest Maori Language Immersion Level', 'string'),
 ('School: Sector', 'string'),
 ('School: Authority', 'string'),
 ('School: Decile', 'string'),
 ('School: Kura Type', 'string'),
 ('Region: TA with Auckland local boards', 'string')]

In [44]:
# aggregation by Student: Year level (Grouped) in the School level
GMLLSN2 = MLLSN2.groupby('School: ID','Year: As at 1 July','Student: Year level (Grouped)','Student: Ethnicity', 
                         'Student: Maori Language Immersion Level','Student: Maori Language in Education level',
                         'School: Highest Maori Language Immersion Level','School: Kura Type','School: Authority',
                         'School: Maori Language Descriptor','School: Medium','School: Decile','School: Sector',
                         'Region: TA with Auckland local boards').sum('Students Number')

In [45]:
GMLLSN2.count()

126408

In [52]:
GMLLSN2.head(1)

[Row(School: ID='497', Year: As at 1 July='2004', Student: Year level (Grouped)='Primary (Year 1-8)', Student: Ethnicity='Maori', Student: Maori Language Immersion Level='Level 1: 81-100%', Student: Maori Language in Education level='Maori medium', School: Highest Maori Language Immersion Level='Level 1: 81-100%', School: Kura Type='Designated Character (Section 156)', School: Authority='State: Not integrated', School: Maori Language Descriptor='Maori medium school', School: Medium='Maori medium', School: Decile='Decile 2', School: Sector='Composite', Region: TA with Auckland local boards='Taupo District', sum(Students Number)=216)]