**This notebook contains a script that determines whether students are eligible for the SETA NQF level 5 Qualification**

**Import Libraries**

In [148]:
import pandas as pd
import numpy as np
import gspread
import gspread_dataframe as gd
from fuzzywuzzy import process

**Authenticate Gspread**

In [149]:
gc = gspread.oauth()

**Create Worksheets**

In [150]:
#cohort 2020 data worksheet
cohort_2020_data_worksheet = gc.open("Cohort 2020 Data").worksheet('Cohort 2020 Data')

**Import Data into Dataframes**

In [151]:
#import cohort data
cohort_df = gd.get_as_dataframe(cohort_2020_data_worksheet)

#import emis national data 
national_df = pd.read_excel('national.xlsx')

#import national area codes
areacode_df = pd.read_csv('temp_area.csv')

#import city to province map
province_df = pd.read_csv("new_province.csv")

#import municipality data
municipality_df = pd.read_csv("municipality.csv")

**Select Only Active Students Without the Qualification**

In [152]:
#strip all columns
all_columns = ['Active / Not active/Early absorption','Name of Highest Qualification Achieved']
for column in all_columns:
    cohort_df[column] = cohort_df[column].astype(str)
    cohort_df[column] = cohort_df[column].str.strip()

cohort_df = cohort_df[cohort_df['Active / Not active/Early absorption'] == 'Active']
cohort_df = cohort_df[~cohort_df['Name of Highest Qualification Achieved'].str.contains('Comp')]

**Subset Datasets**

In [153]:
#subset emis and areacode dataset
national_df = national_df[['NatEmis', 'Suburb', 'Institution_Name']]
areacode_df = areacode_df[['STATSSA_Area_Code', 'Description']]

#subset cohort data
cohort_df = cohort_df[["Firstname","Surname", "ID Number/ Passport Number", "Birth Date", "Gender", "Disability", "Ethnicity", "Nationality", "Residential Status (Citizen / Permanent Resident/Asylum Seeker/Work Permit/Study Permit)", "Home Language", "City", "Passed Grade 12 Y/N", "Municipality", "Name of High School", "Physical Address while studying", "Home Address", "Year Completed Grade 12"]]


**Drop NAN Rows**

In [154]:
#drop rows with null value in institution column
national_df = national_df[national_df["Institution_Name"].notna()]


**Create ID Column**

In [155]:
#create new idnumber column
cohort_df["IDNumber"] = [f"{id_num}".split("\\")[0].strip() for id_num in cohort_df["ID Number/ Passport Number"]]

#create new id type column
cohort_df["ID Type"] = ["National ID" if (len(f"{id_num}") == 13 or len(f"{id_num}") == 12) else "Passport Number" for id_num in cohort_df["IDNumber"]]

**Add Province**

In [156]:
#create second city column
cohort_df["City2"] = cohort_df["Home Address"].str.split(",").str[-2]

#convert second city column to type string
cohort_df["City2"] = cohort_df["City2"].astype(str)

#strip columns whitespaces
cohort_df["City2"] = cohort_df["City2"].str.strip()
province_df["City"] = province_df["City"].str.strip()

#set columns to lowercase
cohort_df["City2"] = cohort_df["City2"].str.lower()
province_df["City"] = province_df["City"].str.lower()

#prep search strings for fuzzywuzzy operation
strOptions = [elem for elem in province_df['City']]
strOptions = set(strOptions)
strOptions = list(strOptions)

#prep areas for fuzzywuzzy search
areas = [elem for elem in cohort_df["City2"]]
areas = set(areas)

#perform fuzzywuzzy matching
for area in areas:
    highest = process.extractOne(area, strOptions)
    cohort_df.loc[cohort_df['City2'] == area, 'Province'] = province_df.loc[province_df['City'] == highest[0], 'Province'].values[0]



**Split Physical Address**

In [157]:
#Split physical address into postal code
cohort_df["Physical Post Code"] = cohort_df["Physical Address while studying"].str.split(",").str[-1]

#Split physical address into city
cohort_df["Physical Address 3"] = cohort_df["Physical Address while studying"].str.split(",").str[-2]

#Split physical address into area
cohort_df["Physical Address 2"] = cohort_df["Physical Address while studying"].str.split(",").str[-3]

#Split physical address into street
cohort_df["Physical Address 1"] = cohort_df["Physical Address while studying"].str.split(",").str[0]

#Create physical province
cohort_df["Physical Province"] = cohort_df["Province"]

cohort_df[["Physical Address 1", "Physical Address 2", "Physical Address 3", "Physical Post Code", "Physical Province"]].head()

Unnamed: 0,Physical Address 1,Physical Address 2,Physical Address 3,Physical Post Code,Physical Province
1,66 8th Avenue,Bezuidenhout valley,Johannesburg,2094,Gauteng
2,84 Fox Street,Marshalltown,Johannesburg,2001,Gauteng
4,5456 Kgaka Cresent,Windmill Park Extension 12,Boksburg,1459,Gauteng
5,33 Kabul Crescent,Cosmo City,Randburg,2194,Gauteng
6,Glencairn Trafalgar,Corner Market & Eloff Street,Johannesburg,2000,KwaZulu-Natal


**Split Postal Address**

In [158]:
#Split physical address into postal code
cohort_df["Postal Post Code"] = cohort_df["Home Address"].str.split(",").str[-1]

#Split physical address into city
cohort_df["Postal Address 3"] = cohort_df["Home Address"].str.split(",").str[-2]

#Split physical address into area
cohort_df["Postal Address 2"] = cohort_df["Home Address"].str.split(",").str[-3]

#Split physical address into street
cohort_df["Postal Address 1"] = cohort_df["Home Address"].str.split(",").str[0]

#Create postal province
cohort_df["Postal Province"] = cohort_df["Province"]

cohort_df[["Postal Address 1", "Postal Address 2", "Postal Address 3", "Postal Post Code", "Postal Province"]].head()

Unnamed: 0,Postal Address 1,Postal Address 2,Postal Address 3,Postal Post Code,Postal Province
1,66 8th Avenue,Bezuidenhout Valley,Johannesburg,2094,Gauteng
2,84 Fox Street,Marshalltown,Johannesburg,2001,Gauteng
4,38144 Extension 18,Mnisi Avenue,Mamelodi East,122,Gauteng
5,33 Kabul Crescent,Cosmo City,Randburg,2194,Gauteng
6,(46) B626 Asikhulume Lane,Durban,Umlazi,4066,KwaZulu-Natal


**Prepare EMIS and AREA Code Datasets**

In [159]:
#subset emis and areacode dataset
national_df = national_df[['NatEmis', 'Suburb', 'Institution_Name']]
areacode_df = areacode_df[['STATSSA_Area_Code', 'Description']]

#convert suburb and description to string
national_df['Suburb'] = national_df['Suburb'].astype(str)
national_df['Institution_Name'] = national_df['Institution_Name'].astype(str)
areacode_df['Description'] = areacode_df['Description'].astype(str)
cohort_df['Name of High School'] = cohort_df['Name of High School'].astype(str)

#Strip suburb and discription whitespaces
national_df['Suburb'] = national_df['Suburb'].str.strip()
areacode_df['Description'] = areacode_df['Description'].str.strip()

#Convert suburb and discription to lowercase
national_df['Suburb'] = national_df['Suburb'].str.lower()
areacode_df['Description'] = areacode_df['Description'].str.lower()



**Create EMIS column**

In [160]:
#prep search strings for fuzzywuzzy operation
strOptions = [elem for elem in national_df['Institution_Name']]
strOptions = set(strOptions)
strOptions = list(strOptions)

#prep areas for fuzzywuzzy search
areas = [elem for elem in cohort_df["Name of High School"]]
areas = set(areas)

#perform fuzzywuzzy matching
for area in areas:
    highest = process.extractOne(area, strOptions)
    cohort_df.loc[cohort_df['Name of High School'] == area, 'EMIS'] = national_df.loc[national_df['Institution_Name'] == highest[0], 'NatEmis'].values[0]
    cohort_df.loc[cohort_df['Name of High School'] == area, 'Area'] = national_df.loc[national_df['Institution_Name'] == highest[0], 'Suburb'].values[0]


    
cohort_df[["EMIS", "Area"]].head()

Unnamed: 0,EMIS,Area
1,700130641.0,
2,700220178.0,pretoria
4,700350371.0,nigel
5,930350040.0,extension
6,200501377.0,ntabankulu


**Format Emis Number**

In [161]:
cohort_df['EMIS'] = cohort_df['EMIS'].astype(int)
#eligible_df['EMIS'] = eligible_df['EMIS'].astype(str)

**Create AreaCode column**

In [162]:
#prep search strings for fuzzywuzzy operation
strOptions = [elem for elem in areacode_df['Description']]
strOptions = set(strOptions)
strOptions = list(strOptions)

#prep areas for fuzzywuzzy search
areas = [elem for elem in cohort_df["Area"]]
areas = set(areas)

#perform fuzzywuzzy matching
for area in areas:
    highest = process.extractOne(area, strOptions)
    cohort_df.loc[cohort_df['Area'] == area, 'AreaCode'] = areacode_df.loc[areacode_df['Description'] == highest[0], 'STATSSA_Area_Code'].values[0]
    
cohort_df[['EMIS', 'AreaCode']].head()
    



Unnamed: 0,EMIS,AreaCode
1,700130641,2011-286025002
2,700220178,2011-799035104
4,700350371,2011-797028011
5,930350040,2011-370004002
6,200501377,2011-581108001


**Reformat Municipality**

In [163]:
#convert second city column to type string
cohort_df["Municipality"] = cohort_df["Municipality"].astype(str)
municipality_df["municipality"] = municipality_df["municipality"].astype(str)

#strip columns whitespaces
cohort_df["Municipality"] = cohort_df["Municipality"].str.strip()
municipality_df["municipality"] = municipality_df["municipality"].str.strip()

#strip columns whitespaces
cohort_df["Municipality"] = cohort_df["Municipality"].str.lower()
municipality_df["municipality"] = municipality_df["municipality"].str.lower()

#prep search strings for fuzzywuzzy operation
strOptions = [elem for elem in municipality_df['municipality']]
strOptions = set(strOptions)
strOptions = list(strOptions)

#prep areas for fuzzywuzzy search
areas = [elem for elem in cohort_df["Municipality"]]
areas = set(areas)

#perform fuzzywuzzy matching
for area in areas:
    highest = process.extractOne(area, strOptions)
    cohort_df.loc[cohort_df['Municipality'] == area, 'municipality2'] = highest[0]

cohort_df[["Municipality", "municipality2"]].head()

Unnamed: 0,Municipality,municipality2
1,johannesburg metro,2191 johannesburg metro bryanston west bryanston
2,johannesburg metro,2191 johannesburg metro bryanston west bryanston
4,ekurhuleni metro,1574 ekurhuleni metro endicott springs
5,johannesburg metro,2191 johannesburg metro bryanston west bryanston
6,johannesburg metro,2191 johannesburg metro bryanston west bryanston


**Create New Columns**

In [164]:
#set marital status
cohort_df['MaritalStatus'] = "Single"

#set getc qualification
cohort_df['GETC Qualification'] = "GRADE 12"

#set popi act
cohort_df['Agree Popi Act'] = "Yes"

cohort_df[['MaritalStatus', 'GETC Qualification', 'Agree Popi Act']].head()

Unnamed: 0,MaritalStatus,GETC Qualification,Agree Popi Act
1,Single,GRADE 12,Yes
2,Single,GRADE 12,Yes
4,Single,GRADE 12,Yes
5,Single,GRADE 12,Yes
6,Single,GRADE 12,Yes


**Restructure Cohort Dataset**

In [165]:
#reorder and subset cohort dataset columns
cohort_df = cohort_df[['Firstname', 'Surname', 'IDNumber', 'ID Type', 'Birth Date', 'Gender', 'Disability', 
         'Ethnicity', 'Nationality',
       'Residential Status (Citizen / Permanent Resident/Asylum Seeker/Work Permit/Study Permit)',
       'Home Language', 'MaritalStatus', 'Province', 'GETC Qualification', 'municipality2', 'Physical Address 1', 'Physical Address 2',
       'Physical Address 3', 'Physical Post Code',  'Physical Province',
       'Postal Address 1', 'Postal Address 2', 'Postal Address 3',
       'Postal Post Code', 'Postal Province', 'EMIS', 'Year Completed Grade 12', 'AreaCode', 'Agree Popi Act',
       'Passed Grade 12 Y/N']]

#rename cohort dataset columns
cohort_df.columns = ['Name', 'Surname', 'IDNumber', 'ID Type', 'DateOfBirth(yyyy/mm/dd)', 'Gender',
                     'DisabilityStatus', 'Equity', 'Nationality', 'ResidentialStatus', 'Language',
                     'MaritalStatus', 'Province', 'GETC Qualification', 'Municipality', 'Physical Address 1',
                     'Physical Address 2', 'Physical  Address 3', 'Physical Post Code', 'Physical Province',
                     'Postal Address 1', 'Postal Address 2', 'Postal  Address 3', 'Postal Post Code',
                     'Postal Province', 'Emis Number', 'Last School Year', 'Area Code', 'Agree Popi Act',
                     'Grade 12']

cohort_df.head()

Unnamed: 0,Name,Surname,IDNumber,ID Type,DateOfBirth(yyyy/mm/dd),Gender,DisabilityStatus,Equity,Nationality,ResidentialStatus,...,Postal Address 1,Postal Address 2,Postal Address 3,Postal Post Code,Postal Province,Emis Number,Last School Year,Area Code,Agree Popi Act,Grade 12
1,Abigail,Hlalele,5020136080,National ID,2000-05-02,Female,no,black,South African,Citizen,...,66 8th Avenue,Bezuidenhout Valley,Johannesburg,2094,Gauteng,700130641,2018,2011-286025002,Yes,Y
2,Arthur,Jenkins,3255794087,National ID,2000-03-25,male,no,white,South African,Citizen,...,84 Fox Street,Marshalltown,Johannesburg,2001,Gauteng,700220178,-,2011-799035104,Yes,N
4,Majane,Thotse,9704115450080,National ID,1997-04-11,male,no,black,South African,Citizen,...,38144 Extension 18,Mnisi Avenue,Mamelodi East,122,Gauteng,700350371,2015,2011-797028011,Yes,Y
5,Anza,Mugwabana,9406035830082,National ID,1994-06-03,male,no,black,South African,Citizen,...,33 Kabul Crescent,Cosmo City,Randburg,2194,Gauteng,930350040,2011,2011-370004002,Yes,Y
6,Andiswa,Nombela,9512140773081,National ID,1995-12-14,Female,no,black,South African,Citizen,...,(46) B626 Asikhulume Lane,Durban,Umlazi,4066,KwaZulu-Natal,200501377,2013,2011-581108001,Yes,Y


**Format Date of Biirth**

In [166]:
#convert dob to datetime formatt
cohort_df['DateOfBirth(yyyy/mm/dd)'] = pd.to_datetime(cohort_df['DateOfBirth(yyyy/mm/dd)'])

#convert dob cooulm to yy/mm/dd formatt
cohort_df['DateOfBirth(yyyy/mm/dd)'] = cohort_df['DateOfBirth(yyyy/mm/dd)'].dt.strftime('%Y/%d/%m')

cohort_df['DateOfBirth(yyyy/mm/dd)']

1      2000/02/05
2      2000/25/03
4      1997/11/04
5      1994/03/06
6      1995/14/12
          ...    
305    2000/13/11
307    1993/01/04
308    1991/15/08
309    1996/10/08
310    2001/05/08
Name: DateOfBirth(yyyy/mm/dd), Length: 265, dtype: object

**Format Language**

In [167]:
#strip equity column whitespaces
cohort_df['Language'] = cohort_df['Language'].str.strip()

#convert equity column to lowercase
cohort_df['Language'] = cohort_df['Language'].str.lower()

#reformat equity column
cohort_df['Language'] = cohort_df['Language'].replace({'isixhosa': 'isiXhosa', 'english': 'English',
                             'sepedi [also known as northern sotho / sesotho sa lebowa]': 'sePedi', 'tshivenda': 'tshiVenda',
                             'isizulu': 'isiZulu', 'setswana': 'seTswana', 'afrikaans': 'Afrikaans', 'xitsonga': 'xiTsonga',
                                'sepedi': 'sePedi', 'sesotho': 'seSotho', 'isindebele': 'isiNdebele','siswati': 'siSwati',  'other': 'English'})

cohort_df['Language'].unique()


array(['English', 'Afrikaans', 'sePedi', 'tshiVenda', 'isiZulu',
       'seTswana', 'xiTsonga', 'seSotho', 'isiNdebele', 'siSwati',
       'isiXhosa'], dtype=object)

**Format Equity**

In [168]:
#strip equity column whitespaces
cohort_df['Equity'] = cohort_df['Equity'].str.strip()

#convert equity column to lowercase
cohort_df['Equity'] = cohort_df['Equity'].str.lower()

#reformat equity column
cohort_df['Equity'] = cohort_df['Equity'].replace({'black': 'Black: African', 'coloured': 'Black: Coloured',
                             'indian': 'Black: Indian/Asian', 'asian': 'Black: Indian/Asian',
                             'chinese': 'Black: Indian/Asian', 'white': 'White'})

cohort_df['Equity'].unique()

array(['Black: African', 'White', 'Black: Indian/Asian',
       'Black: Coloured'], dtype=object)

**Format Disability**

In [169]:
#strip disability column whitespaces
cohort_df['DisabilityStatus'] = cohort_df['DisabilityStatus'].astype(str)

#strip disability column whitespaces
cohort_df['DisabilityStatus'] = cohort_df['DisabilityStatus'].str.strip()

#convert disability column to lowercase
cohort_df['DisabilityStatus'] = cohort_df['DisabilityStatus'].str.lower()

#reformat disability column
cohort_df['DisabilityStatus'] = cohort_df['DisabilityStatus'].replace({'no': 'None', 'nan': 'None',
                             'yes': 'Disable but unspecified', 'yes - specwearer': 'None',
                             'n': 'None', '-': 'None', 'visual impairment - spec wearer': 'None',
                             'aspergers / autistic': 'Emotional (behav/psych)', 'patella alta':'Physical (move/stand etc)',
                             'y- specwearer': 'None', 'add': 'None', 'visual impairment - spec wearer -': 'None',
                             '\\n': 'None', 'y - specwearer':'None', 'none':'None', '':'None'})

cohort_df['DisabilityStatus'].unique()

array(['None', 'Disable but unspecified', 'Emotional (behav/psych)',
       'Physical (move/stand etc)'], dtype=object)

**Format Province**

In [170]:
#strip equity column whitespaces
cohort_df['Province'] = cohort_df['Province'].str.strip()

#convert equity column to lowercase
cohort_df['Province'] = cohort_df['Province'].str.lower()

#reformat equity column
cohort_df['Province'] = cohort_df['Province'].replace({'gauteng': 'Gauteng', 'kwazulu-natal': 'Kwazulu-Natal',
                             'mpumalanga': 'Mpumalanga', 'eastern cape': 'Eastern Cape',
                             'limpopo': 'Limpopo', 'northern cape': 'Northern Cape', 'free state': 'Free State', 'north west': 'North West',
                                'western cape': 'Western Cape'})

cohort_df['Province'].unique()

array(['Gauteng', 'Kwazulu-Natal', 'Western Cape', 'Eastern Cape',
       'Limpopo', 'Northern Cape', 'Mpumalanga', 'Free State',
       'North West'], dtype=object)

**Format Nationality**

In [171]:
#strip equity column whitespaces
cohort_df['Nationality'] = cohort_df['Nationality'].str.strip()

#convert equity column to lowercase
cohort_df['Nationality'] = cohort_df['Nationality'].str.lower()

#reformat equity column
cohort_df['Nationality'] = cohort_df['Nationality'].replace({'south african': 'South African', 'dutch': 'European countries',
                             'nigerian': 'Rest of Africa', 'zimbabwean': 'Zimbabwe', 'congolese':'Rest of Africa'})

cohort_df['Nationality'].unique()

array(['South African', 'Zimbabwe', 'European countries',
       'Rest of Africa'], dtype=object)

**Format Physical Province**

In [172]:
#strip equity column whitespaces
cohort_df['Physical Province'] = cohort_df['Physical Province'].str.strip()

#convert equity column to lowercase
cohort_df['Physical Province'] = cohort_df['Physical Province'].str.lower()

#reformat equity column
cohort_df['Physical Province'] = cohort_df['Physical Province'].replace({'gauteng': 'Gauteng', 'kwazulu-natal': 'Kwazulu-Natal',
                             'mpumalanga': 'Mpumalanga', 'eastern cape': 'Eastern Cape',
                             'limpopo': 'Limpopo', 'northern cape': 'Northern Cape', 'free state': 'Free State', 'north west': 'North West',
                                'western cape': 'Western Cape'})

cohort_df['Physical Province'].unique()

array(['Gauteng', 'Kwazulu-Natal', 'Western Cape', 'Eastern Cape',
       'Limpopo', 'Northern Cape', 'Mpumalanga', 'Free State',
       'North West'], dtype=object)

**Format Postal Province**

In [173]:
#strip equity column whitespaces
cohort_df['Postal Province'] = cohort_df['Postal Province'].str.strip()

#convert equity column to lowercase
cohort_df['Postal Province'] = cohort_df['Postal Province'].str.lower()

#reformat equity column
cohort_df['Postal Province'] = cohort_df['Postal Province'].replace({'gauteng': 'Gauteng', 'kwazulu-natal': 'Kwazulu-Natal',
                             'mpumalanga': 'Mpumalanga', 'eastern cape': 'Eastern Cape',
                             'limpopo': 'Limpopo', 'northern cape': 'Northern Cape', 'free state': 'Free State', 'north west': 'North West',
                                'western cape': 'Western Cape'})

cohort_df['Postal Province'].unique()

array(['Gauteng', 'Kwazulu-Natal', 'Western Cape', 'Eastern Cape',
       'Limpopo', 'Northern Cape', 'Mpumalanga', 'Free State',
       'North West'], dtype=object)

**Split Into Eligible and Non-Eligble Datasets**

In [174]:
#incomplete datasets
incomplete_df = cohort_df[cohort_df[['Name', 'Surname', 'IDNumber', 'ID Type', 'DateOfBirth(yyyy/mm/dd)',
                                    'DisabilityStatus', 'Equity', 'Nationality', 'ResidentialStatus',
                                    'Grade 12']].isnull().any(axis=1)]

#eligible dataset
eligible_df = cohort_df.dropna(subset=['Name', 'Surname', 'IDNumber', 'ID Type', 'DateOfBirth(yyyy/mm/dd)',
                                    'DisabilityStatus', 'Equity', 'Nationality', 'ResidentialStatus',
                                    'Grade 12'])

#missing grade 12
no_grade_12_df = eligible_df[eligible_df['Last School Year'].isin(['-'])]

#foreign grade 12
#foreign_grade_12_df = eligible_df[eligible_df['Grade 12'].isin(['Y - Foreign'])]
                             

**Remove Incomplete Data from Eligible Dataset**

In [175]:
#remove students without grade 12 from the eligibility list
#eligible_df = eligible_df[~eligible_df['IDNumber'].isin(no_grade_12_df['IDNumber'])]

#remove students with foreign grade 12 certs from eligible list
#eligible_df = eligible_df[~eligible_df['IDNumber'].isin(foreign_grade_12_df['IDNumber'])]

**Combine Ineligible datasets**

In [176]:
#add issue column to incomplete dataset
incomplete_df['issue'] = "Incomplete Data."

#add issue column to no grade 12 dataset
no_grade_12_df['issue'] = "Did not complete grade 12."

#add issue column to foreign grade 12 dataset
#foreign_grade_12_df['issue'] = "Foreign grade 12 certificate."

#append dataframes
not_eligible_df = incomplete_df.append(no_grade_12_df)
#not_eligible_df = not_eligible_df.append(foreign_grade_12_df)

#subset not eligible dataset
#not_eligible_df = not_eligible_df[['issue', 'Name', 'Surname', 'IDNumber', 'ID Type', 'DateOfBirth(yyyy/mm/dd)',
#                                    'DisabilityStatus', 'Equity', 'Nationality', 'ResidentialStatus',
#                                    'Grade 12']]

not_eligible_df.head()

Unnamed: 0,Name,Surname,IDNumber,ID Type,DateOfBirth(yyyy/mm/dd),Gender,DisabilityStatus,Equity,Nationality,ResidentialStatus,...,Postal Address 2,Postal Address 3,Postal Post Code,Postal Province,Emis Number,Last School Year,Area Code,Agree Popi Act,Grade 12,issue
84,Lloyd,Ndhlovu,CN806411,Passport Number,1992/23/02,male,,Black: African,Zimbabwe,,...,MarshallTown,Johannesburg,2107,Gauteng,700232702,2009,2011-499023018,Yes,Y,Incomplete Data.
2,Arthur,Jenkins,0003255794087,National ID,2000/25/03,male,,White,South African,Citizen,...,Marshalltown,Johannesburg,2001,Gauteng,700220178,-,2011-799035104,Yes,N,Did not complete grade 12.
43,Happy,Nkanyane,9612315769086,National ID,1996/31/12,male,,Black: African,South African,Citizen,...,New Eersterus,Hammanskraal,401,Gauteng,600102293,-,2011-764002043,Yes,N,Did not complete grade 12.
45,Heidi,Steenkamp,9703210026084,National ID,1994/28/05,Female,,White,South African,Citizen,...,Marshall Town,Johannesburg,2107,Gauteng,700120576,-,2011-798015089,Yes,N,Did not complete grade 12.
92,Luke,Templeman,0007305118080,National ID,2000/30/07,male,Disable but unspecified,White,South African,Citizen,...,Johannesburg,CBD,2001,Eastern Cape,500440041,-,2011-286025002,Yes,N,Did not complete grade 12.


**Convert Postal Codes to Numeric**

In [177]:
#strip all columns
all_columns = ['Physical Post Code','Postal Post Code']
for column in all_columns:
    eligible_df[column] = eligible_df[column].astype(str)
    eligible_df[column] = eligible_df[column].str.strip()
    
    #update non numeric postal codes
    eligible_df.loc[~eligible_df[column].str.isnumeric(),column] = "2020"
    
#update non numeric postal codes
#eligible_df.loc[~eligible_df['Postal Post Code'].str.isnumeric(),'Postal Post Code'] = "2020"
#eligible_df.loc[~eligible_df['Physical Post Code'].str.isnumeric(),'Physical Post Code'] = "2020"

**Capitalize some Columns**

In [178]:
#strip all columns
all_columns = ['Gender','DisabilityStatus']
for column in all_columns:
    eligible_df[column] = eligible_df[column].astype(str)
    eligible_df[column] = eligible_df[column].str.strip()
    
    #capitalise columns
    eligible_df[column] = eligible_df[column].str.capitalize()
    

**Reformat ResidentialStatus**

In [179]:
#reformat disability column
eligible_df['ResidentialStatus'] = eligible_df['ResidentialStatus'].replace({'Citizen': 'South Africa'})
eligible_df['ResidentialStatus'].unique()

array(['South Africa', 'Permanent Resident'], dtype=object)

**Capitalise Municipality**

In [180]:
#strip all columns
all_columns = ['Municipality']
for column in all_columns:
    eligible_df[column] = eligible_df[column].astype(str)
    eligible_df[column] = eligible_df[column].str.strip()
    
    #capitalise columns
    eligible_df[column] = eligible_df[column].str.title()

**Drop Some columns**

In [181]:
eligible_df = eligible_df[['Name', 'Surname', 'IDNumber', 'ID Type', 'DateOfBirth(yyyy/mm/dd)',
       'Gender', 'DisabilityStatus', 'Equity', 'Nationality',
       'ResidentialStatus', 'Language', 'MaritalStatus', 'Province',
       'GETC Qualification', 'Municipality', 'Physical Address 1',
       'Physical Address 2', 'Physical  Address 3', 'Physical Post Code',
       'Physical Province', 'Postal Address 1', 'Postal Address 2',
       'Postal  Address 3', 'Postal Post Code', 'Postal Province',
       'Emis Number', 'Last School Year', 'Area Code', 'Agree Popi Act']]

**Write Datasets to Excel file**

In [None]:
#write eligible dataset to eligible file
eligible_df.to_excel("seta_eligible_second_upload.xlsx", index=False)

#write ineligible dataset to ineligible file
not_eligible_df.to_excel("ineligible.xlsx")

In [15]:
eligible_df = pd.read_excel("seta_eligible_second_upload.xlsx")
eligible_df['Name'] = eligible_df['Name'].astype(str).str.strip()
eligible_df['Surname'] = eligible_df['Surname'].astype(str).str.strip()

In [206]:
condition_surname = eligible_df['Surname'] == 'van der Merwe'
condition_name = eligible_df['Name'] == 'Rulof'

eligible_df[condition_name & condition_surname]

Unnamed: 0,Name,Surname,IDNumber,ID Type,DateOfBirth(yyyy/mm/dd),Gender,DisabilityStatus,Equity,Nationality,ResidentialStatus,...,Physical Province,Postal Address 1,Postal Address 2,Postal Address 3,Postal Post Code,Postal Province,Emis Number,Last School Year,Area Code,Agree Popi Act
282,Rulof,van der Merwe,2105279083,National ID,2000/02/10,Male,,White,South African,South Africa,...,Western Cape,1 Park Close,Table View,Bloubergstrand,7441,Western Cape,113310206,2018,2011-173003003,Yes


In [207]:
for column in eligible_df.columns:
    print(f"{column} : {eligible_df[condition_surname & condition_name][column].values[0]}")

Name : Rulof
Surname : van der Merwe
IDNumber : 0002105279083
ID Type : National ID
DateOfBirth(yyyy/mm/dd) : 2000/02/10
Gender : Male
DisabilityStatus : None
Equity : White
Nationality : South African
ResidentialStatus : South Africa
Language : Afrikaans
MaritalStatus : Single
Province : Western Cape
GETC Qualification : GRADE 12
Municipality : 7441 Cape Town Metro Bloubergstrand Milnerton
Physical Address 1 : 19 Park Close
Physical Address 2 : Table view
Physical  Address 3 : Bloubergstrand
Physical Post Code : 7441
Physical Province : Western Cape
Postal Address 1 : 1 Park Close
Postal Address 2 : Table View
Postal  Address 3 :  Bloubergstrand
Postal Post Code : 7441
Postal Province : Western Cape
Emis Number : 113310206
Last School Year : 2018
Area Code : 2011-173003003
Agree Popi Act : Yes


In [141]:
eligible_df = pd.read_excel("ineligible.xlsx")