**This notebook contains a script that creates a csv for loading students onto the SETA LMS**

**Add local library to path**

In [53]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    sys.path.append(module_path + '/local_library')

**Import libraries**

In [54]:
import pandas as pd
from local_library import import_worksheet
from local_library import export_worksheet

**Import from databases**

In [55]:
#import cohort srs data
cohort_df = import_worksheet('Onloading', 'Sheet1')

#import national data for emis
national_df = pd.read_excel('downloads/national.xlsx')

#import area codes data for area code
areacode_df = pd.read_csv('downloads/areacode.csv')

#import example submission sheet
example_df = pd.read_excel('downloads/example.xlsx')

  warn(msg)


**Subset cohort dataframe**

In [56]:
#set subset columns
useful_columns = ['First Name', 'Last Name', 'ID / Passport Number', 'Date Of Birth', 'Gender', 'Disability Type', 'Ethnicity',
                 'Nationality', 'Residential Status', 'Home Language', 'Home Province', 'Home Street', 'Home Suburb', 'Home City',
                 'Home Postal Code', 'Completion Year', 'Name of High School', 'Highest Qualification','Qualification Name', 
                 'Passed Grade 12',]

#subset dataframe
cohort_df = cohort_df[useful_columns]

**Check Eligible Students**

In [57]:
cohort_df = cohort_df[(cohort_df['Passed Grade 12'] == 'Yes') & (~cohort_df['Qualification Name'].str.contains('Comp', na=False))]

**Add NatEmis**

In [58]:
#create high school dataframe
high_schools_df = national_df[['Institution_Name', 'NatEmis']]

#prepare high school name column
high_schools_df['Institution_Name'] = high_schools_df['Institution_Name'].str.lower().str.strip()

#prepare cohort dataframe high school name column
cohort_df['Name of High School'] = cohort_df['Name of High School'].str.lower().str.strip()

#merge
cohort_df = cohort_df.merge(high_schools_df, left_on='Name of High School', right_on='Institution_Name', how='left')

#fill any nulls
cohort_df['NatEmis'].fillna((cohort_df['NatEmis'].median()), inplace=True)

#drop high school names
cohort_df.drop(['Name of High School', 'Institution_Name'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


**Add ID number Type**

In [59]:
cohort_df["ID Type"] = ["National ID" if (len(f"{id_num}") == 13 or len(f"{id_num}") == 12) else "Passport Number" for id_num in cohort_df["ID / Passport Number"]]


**Refactor Home Province**

In [60]:
province_map = {"Western Cape":"Western Cape", "Eastern Cape":"Eastern Cape",
                 "Kwa-Zulu Natal":"Kwazulu-Natal", "KwaZulu-Natal":"Kwazulu-Natal", "Kwazulu Natal":"Kwazulu-Natal",
                 "Kwazulu - Natal":"Kwazulu-Natal", "KZN":"Kwazulu-Natal", "Limpopo":"Limpopo",
                 "Gauteng":"Gauteng", "South Africa":"Gauteng",
                 "South Africa ":"Gauteng", "Guagteng":"Gauteng",
                 "South":"Gauteng", "Gauteng ":"Gauteng", 
                 "Guateng":"Gauteng", "7945":"Gauteng",
                 "Free State":"Free State", "Mpumalanga":"Mpumalanga", "North West":"North West",
                 "Northwest":"North West"}

cohort_df['Home Province'] = cohort_df['Home Province'].replace(province_map)

cohort_df['Home Province'] = cohort_df['Home Province'].fillna('Gauteng')

**Add province**

In [61]:
cohort_df['Province'] = cohort_df['Home Province']

**Add Physical Addresses**

In [62]:
#set street address
cohort_df['Physical Address 1'] = cohort_df['Home Street']

#set suburb adress
cohort_df['Physical Address 2'] = cohort_df['Home Suburb']

#set city address
cohort_df['Physical Address 3'] = cohort_df['Home City']

#set postal code
cohort_df['Physical Post Code'] = cohort_df['Home Postal Code']

#set province address
cohort_df['Physical Province'] = cohort_df['Home Province']

**Add Postal Address**

In [63]:
#set street address
cohort_df['Postal Address 1'] = cohort_df['Home Street']

#set suburb adress
cohort_df['Postal Address 2'] = cohort_df['Home Suburb']

#set city address
cohort_df['Postal Address 3'] = cohort_df['Home City']

#set postal code
cohort_df['Postal Post Code'] = cohort_df['Home Postal Code']

#set province address
cohort_df['Postal Province'] = cohort_df['Home Province']

**Set Area Code**

In [64]:
cohort_df['AreaCode'] = [areacode for areacode in areacode_df['STATSSA_Area_Code'].head(415)]


**Set Municipality**

In [65]:
municipality_map = {"Gauteng":"2017 Johannesburg Metro Braamfontein Johannesburg", 
                    "Kwazulu-Natal":"4450 KwaDukuza Stanger", "Limpopo":"0699 Polokwane Seshego Zone 3 Polokwane",
                   "Western Cape":"7925 Cape Town Metro Observatory Cape Town",
                    "Mpumalanga":"1309 Umjindi Kwa Mhola Barberton", 
                    "Eastern Cape":"5601 Buffalo City Daleview King William's Town",
                   "North West":"2571 Matlosana Klerksdorp", "Free State":"9307 Mangaung Mangaung Bloemfontein",
                    "Northern Cape":"8700 Sol Plaatjie Modder River Kimberley"}

cohort_df['Municipality'] = cohort_df['Province'].replace(municipality_map)

**Set Static Fields**

In [66]:
#set marital status
cohort_df['MaritalStatus'] = "Single"

#set getc qualification
cohort_df['GETC Qualification'] = "GRADE 12"

#set popi act
cohort_df['Agree Popi Act'] = "Yes"

**Reformat Date of Birth**

In [67]:
#convert dob to datetime formatt
cohort_df['DateOfBirth'] = pd.to_datetime(cohort_df['Date Of Birth'])

#convert dob cooulm to yy/mm/dd formatt
cohort_df['DateOfBirth'] = cohort_df['DateOfBirth'].dt.strftime('%Y/%d/%m')

**Reformat Language**

In [68]:
language_map = {'Other':'English', 'seSwati':'siSwati'}

cohort_df['Language'] = cohort_df['Home Language'].replace(language_map)

cohort_df['Language'] = cohort_df['Language'].fillna('English')

**Reformat Equity**

In [69]:
equity_map = {'Black':'Black: African', 'Indian':'Black: Indian/Asian', 'White':'White', 'Coloured':'Black: Coloured', 'Asian':'Black: Indian/Asian'}

cohort_df['Equity'] = cohort_df['Ethnicity'].replace(equity_map)

**Reformat Disability**

In [70]:
disability_map = {'Physical (move/stand)':'Physical (move/stand etc)', 'Emotional (behavioural/psychological)':'Emotional (behav/psych)',
                  'Disabled Not Specified':'Disabled but unspecified'}

cohort_df['DisabilityStatus'] = cohort_df['Disability Type'].replace(disability_map)

cohort_df['DisabilityStatus'] = cohort_df['DisabilityStatus'].fillna('None')

**Reformat Nationality**

In [71]:
cohort_df['Nationality'] = cohort_df['Nationality'].replace({'South Africa':'South African'})

cohort_df['Nationality'] = cohort_df['Nationality'].fillna('South African')

**Reformat Residential Address**

In [75]:
residential_map = {'South African':'South Africa', 'Refugee':'Other', 'Foreign Nation / Non-Citizen':'Other'}

cohort_df['ResidentialStatus'] = cohort_df['Residential Status'].replace(residential_map)

cohort_df['ResidentialStatus'] = cohort_df['ResidentialStatus'].fillna('South Africa')

array(['South Africa', 'Permanent Resident', 'Other'], dtype=object)

**Rename Columns**

In [81]:
columns_map = {'First Name':'Name', 'Last Name':'Surname', 'ID / Passport Number':'IDNumber', 'GETC Qualification':'Qualification',
               'Physical Address 3':'Physical  Address 3', 'Postal Address 3':'Postal  Address 3', 'NatEmis':'Emis Number',
               'Passed Grade 12':'Last School Year', 'AreaCode':'Area Code'}

cohort_df = cohort_df.rename(columns=columns_map)

**Subset Relevant Columns**

In [83]:
cohort_df = cohort_df[example_df.columns]