**This notebook contains a script that determines whether students are eligible for the SETA NQF level 5 Qualification**

**Import libraries**

In [24]:
import pandas as pd
from fuzzywuzzy import process

**Import data into dataframe**

In [52]:
#import cohort data
cohort_df = pd.read_csv("cohort_2020.csv")

#import emis national data 
national_df = pd.read_excel('national.xlsx')

#import national area codes
areacode_df = pd.read_csv('temp_area.csv')

#import city to province map
province_df = pd.read_csv("new_province.csv")

**Merge EMIS number and Areacode**

In [53]:
#subset emis and areacode dataset
national_df = national_df[['NatEmis', 'Suburb', 'Institution_Name']]
areacode_df = areacode_df[['STATSSA_Area_Code', 'Description']]

#convert suburb and description to string
national_df['Suburb'] = national_df['Suburb'].astype(str)
areacode_df['Description'] = areacode_df['Description'].astype(str)

#Strip suburb and discription whitespaces
national_df['Suburb'] = national_df['Suburb'].str.strip()
areacode_df['Description'] = areacode_df['Description'].str.strip()

#Convert suburb and discription to lowercase
national_df['Suburb'] = national_df['Suburb'].str.lower()
areacode_df['Description'] = areacode_df['Description'].str.lower()

#merge emis national dataset with national areacodes
merged_codes_df = national_df.merge(areacode_df, how='left', left_on='Suburb', right_on='Description')

#prep search string for fuzzywuzzy operation
strOptions = [elem for elem in areacode_df['Description']]
strOptions = set(strOptions)
strOptions = list(strOptions)

#prep areas for fuzzywuzzy search
areas = merged_codes_df[merged_codes_df['STATSSA_Area_Code'].isnull()]['Suburb']
areas = set(areas)

#perform fuzzywuzzy search
for area in areas:
    highest = process.extractOne(area, strOptions)
    merged_codes_df.loc[merged_codes_df['Suburb'] == area, 'Description'] = highest[0]
    merged_codes_df.loc[merged_codes_df['Suburb'] == area, 'STATSSA_Area_Code'] = areacode_df.loc[areacode_df['Description'] == highest[0],'STATSSA_Area_Code']

merged_codes_df.head(40)



Unnamed: 0,NatEmis,Suburb,Institution_Name,STATSSA_Area_Code,Description
0,100000029,observatory,GAIA WALDORF SCHOOL,2011-199041024,observatory
1,100000029,observatory,GAIA WALDORF SCHOOL,2011-798015072,observatory
2,100000030,wynberg,DARUN NA'IM ACADEMY,2011-199041054,wynberg
3,100000030,wynberg,DARUN NA'IM ACADEMY,2011-798013053,wynberg
4,100000031,melkbosch strand,ATLANTIC BEACH COLLEGE,,strand sp
5,100000036,knysna central,KNYSNA CHRISTIAN MISSION SCHOOL,2011-180009009,knysna central
6,100000037,wallacedene,"EAGLE'S NEST MINISTRIES, SA",2011-199018019,wallacedene
7,100000038,sunvalley,SILVERMINE ACADEMY,,sun valley
8,100000054,klein zevenwacht,ACADEMY PRIVATE SCHOOL,,zevenwacht
9,100000055,claremont,CLAREMONT HIGH SCHOOL,2011-798016089,claremont


In [54]:
merged_codes_df.head(40)

Unnamed: 0,NatEmis,Suburb,Institution_Name,STATSSA_Area_Code,Description
0,100000029,observatory,GAIA WALDORF SCHOOL,2011-199041024,observatory
1,100000029,observatory,GAIA WALDORF SCHOOL,2011-798015072,observatory
2,100000030,wynberg,DARUN NA'IM ACADEMY,2011-199041054,wynberg
3,100000030,wynberg,DARUN NA'IM ACADEMY,2011-798013053,wynberg
4,100000031,melkbosch strand,ATLANTIC BEACH COLLEGE,,strand sp
5,100000036,knysna central,KNYSNA CHRISTIAN MISSION SCHOOL,2011-180009009,knysna central
6,100000037,wallacedene,"EAGLE'S NEST MINISTRIES, SA",2011-199018019,wallacedene
7,100000038,sunvalley,SILVERMINE ACADEMY,,sun valley
8,100000054,klein zevenwacht,ACADEMY PRIVATE SCHOOL,,zevenwacht
9,100000055,claremont,CLAREMONT HIGH SCHOOL,2011-798016089,claremont


**List columns in cohort data**

In [268]:
##Create a subset of cohort dataframe
cohort_df = cohort_df[["Firstname","Surname", "ID Number/ Passport Number", "Birth Date", "Gender", "Disability", "Ethnicity", "Nationality", "Residential Status (Citizen / Permanent Resident/Asylum Seeker/Work Permit/Study Permit)", "Home Language", "City", "Passed Grade 12 Y/N", "Municipality", "Name of High School", "Physical Address while studying", "Home Address", "Year Completed Grade 12"]]
#national_df = national_df[["NatEmis", "Province", "Institution_Name", "DistrictMunicipalityName", "Local MunicipalityName"]]
#cohort_df.head()
province_df.head()
#national_df.head()

Unnamed: 0,City,Province
0,Aan de Doorns,Western Cape
1,Aberdeen,Eastern Cape
2,Aberfeldy,Free State
3,Abbotsdale,Western Cape
4,Acornhoek,Mpumalanga


**Drop rows with nan in Municipalities**

In [186]:
national_df = national_df[national_df["Local MunicipalityName"].notna()]
national_df = national_df[national_df["Institution_Name"].notna()]
national_df.head()

Unnamed: 0,NatEmis,Province,Institution_Name,DistrictMunicipalityName,Local MunicipalityName
0,100000029,WC,GAIA WALDORF SCHOOL,City of Cape Town Metropolitan Municipality,City of Cape Town Metropolitan Municipality
1,100000030,WC,DARUN NA'IM ACADEMY,City of Cape Town Metropolitan Municipality,City of Cape Town Metropolitan Municipality
2,100000031,WC,ATLANTIC BEACH COLLEGE,City of Cape Town Metropolitan Municipality,City of Cape Town Metropolitan Municipality
3,100000036,WC,KNYSNA CHRISTIAN MISSION SCHOOL,Eden,Knysna Local Municipality
4,100000037,WC,"EAGLE'S NEST MINISTRIES, SA",City of Cape Town Metropolitan Municipality,City of Cape Town Metropolitan Municipality


**Create New ID Columns**

In [187]:
##Create new IDNumber column
cohort_df["IDNumber"] = [f"{id_num}".split("\\")[0].strip() for id_num in cohort_df["ID Number/ Passport Number"]]

#Create new ID Type column
cohort_df["ID Type"] = ["National ID" if (len(f"{id_num}") == 13 or len(f"{id_num}") == 12) else "Passport Number" for id_num in cohort_df["IDNumber"]]

**Add Province Column**

In [269]:
cohort_df["City2"] = cohort_df["Home Address"].str.split(",").str[-2]
province_df["City"] = province_df["City"].str.lower().str.strip()
cohort_df["City2"] = cohort_df["City2"].astype(str)
cohort_df["City2"] = cohort_df["City2"].str.lower().str.strip()
cohort_df["Province"] = [province_df[province_df["City"] == city].Province.values for city in cohort_df["City2"]]
cohort_df["Province"] = cohort_df["Province"].astype(str)
for char in ["[", "]", "'"]:
    cohort_df["Province"] = cohort_df["Province"].str.replace(char,"") 
cohort_df[["City2","Province"]].head()



  cohort_df["Province"] = cohort_df["Province"].str.replace(char,"")


Unnamed: 0,City2,Province
0,pretoria,Gauteng
1,johannesburg,Gauteng
2,johannesburg,Gauteng
3,,
4,mamelodi east,


**Split Adressess into separate columns**

In [1]:
#cohort_df["Physical Address while studying"] = cohort_df["Physical Address while studying"].str.split(",")


#Split physical address into postal code
cohort_df["Physical Post Code"] = cohort_df["Physical Address while studying"].str.split(",").str[-1]

#Split physical address into city
cohort_df["Physical Address 3"] = cohort_df["Physical Address while studying"].str.split(",").str[-2]

#Split physical address into area
cohort_df["Physical Address 2"] = cohort_df["Physical Address while studying"].str.split(",").str[-3]

#Split physical address into street
cohort_df["Physical Address 1"] = cohort_df["Physical Address while studying"].str.split(",").str[0]

#Create physical province
cohort_df["Physical Province"] = cohort_df["Province"]

cohort_df[["Physical Address 1", "Physical Address 2", "Physical Address 3", "Physical Post Code", "Physical Province"]]

NameError: name 'cohort_df' is not defined

**Split Postal Address into separate columns**

In [274]:
#Split physical address into postal code
cohort_df["Postal Post Code"] = cohort_df["Home Address"].str.split(",").str[-1]

#Split physical address into city
cohort_df["Postal Address 3"] = cohort_df["Home Address"].str.split(",").str[-2]

#Split physical address into area
cohort_df["Postal Address 2"] = cohort_df["Home Address"].str.split(",").str[-3]

#Split physical address into street
cohort_df["Postal Address 1"] = cohort_df["Home Address"].str.split(",").str[0]

#Create physical province
cohort_df["Postal Province"] = cohort_df["Province"]

cohort_df[["Postal Address 1", "Postal Address 2", "Postal Address 3", "Postal Post Code", "Postal Province"]]

Unnamed: 0,Postal Address 1,Postal Address 2,Postal Address 3,Postal Post Code,Postal Province
0,38144 Extension 18,Mamelodi East,Pretoria,0122,Gauteng
1,66 8th Avenue,Bezuidenhout Valley,Johannesburg,2094,Gauteng
2,84 Fox Street,Marshalltown,Johannesburg,2001,Gauteng
3,,,,,
4,38144 Extension 18,Mnisi Avenue,Mamelodi East,0122,
...,...,...,...,...,...
306,6 Garden Drive,6 Garden Drive,Sunningdale,Parklands,
307,F509 Mankayi Crescent,F-section,Khayelitsha,7785,Western Cape
308,69 Dorothy Rd,Amalinda,East London,5247,Eastern Cape
309,131 Campground Road,131 Campground Road,Newlands,7700,


**Create EMIS columns**

In [278]:
national_df["Institution_Name"] = national_df["Institution_Name"].str.lower()
cohort_df["Name of High School"] = cohort_df["Name of High School"].astype(str)
cohort_df["Name of High School"] = cohort_df["Name of High School"].str.lower()

cohort_df["Emis Number"] = [national_df[national_df["Institution_Name"].str.find(school) != -1].NatEmis.values for school in cohort_df["Name of High School"]]
cohort_df["Emis Number"] = [elem[0] if len(elem) > 0 else "" for elem in cohort_df["Emis Number"]]
cohort_df["Emis Number"].head(30)

0              
1     700130641
2              
3              
4              
5              
6              
7     700160390
8     200200750
9              
10    700160390
11    600101147
12    700260471
13    800005942
14             
15    200200797
16    700211235
17             
18             
19             
20             
21             
22             
23             
24    700330175
25             
26    700120840
27    700162016
28    700270116
29             
Name: Emis Number, dtype: object

In [279]:
cohort_df.head()

Unnamed: 0,Firstname,Surname,ID Number/ Passport Number,Birth Date,Gender,Disability,Ethnicity,Nationality,Residential Status (Citizen / Permanent Resident/Asylum Seeker/Work Permit/Study Permit),Home Language,...,Physical Address 3,Physical Address 2,Physical Address 1,Physical Province,Postal Post Code,Postal Address 3,Postal Address 2,Postal Address 1,Postal Province,Emis Number
0,Athenkosi,Dyoli,9509215919086,1995-09-21,male,no,black,South African,Citizen,isiXhosa,...,Pretoria,Mamelodi East,8144 Extension 18,[],122.0,Pretoria,Mamelodi East,38144 Extension 18,Gauteng,
1,Abigail,Hlalele,5020136080,2000-05-02,Female,no,black,South African,Citizen,English,...,Johannesburg,Bezuidenhout valley,66 8th Avenue,[],2094.0,Johannesburg,Bezuidenhout Valley,66 8th Avenue,Gauteng,700130641.0
2,Arthur,Jenkins,3255794087,2000-03-25,male,no,white,South African,Citizen,Afrikaans,...,Johannesburg,Marshalltown,84 Fox Street,[],2001.0,Johannesburg,Marshalltown,84 Fox Street,Gauteng,
3,Amon,Munyai,108255738081,2001-08-25,male,no,black,South African,Citizen,,...,,,,[],,,,,,
4,Majane,Thotse,9704115450080,1997-04-11,male,no,black,South African,Citizen,Sepedi [also known as Northern Sotho / Sesotho...,...,Boksburg,Windmill Park Extension 12,5456 Kgaka Cresent,[Gauteng],122.0,Mamelodi East,Mnisi Avenue,38144 Extension 18,,
