In [12]:
import pandas as pd
import numpy  as np

# Load the graduate-output dataset from its CSV file
graduates_df = pd.read_csv("o-bidang-2015.csv")

# Collect the distinct "Public University" codes found in the 'UA' column
ua_lst = list(graduates_df['UA'].unique())

# Re-index the dataframe by university (rebinding the name rather than
# mutating the frame in place)
graduates_df = graduates_df.set_index("UA")

In [13]:
# Print the (rows, columns) dimensions, then preview the first 5 rows
print((len(graduates_df.index), len(graduates_df.columns)))
graduates_df.head(n=5)

(200, 2)


Unnamed: 0_level_0,Bidang,Output 2015
UA,Unnamed: 1_level_1,Unnamed: 2_level_1
UM,"KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN",931
UM,KESIHATAN DAN KEBAJIKAN,806
UM,PENDIDIKAN,374
UM,PERKHIDMATAN,48
UM,PERTANIAN DAN VETERINAR,0


In [14]:
# Collect the distinct study majors found in the 'Bidang' column
bidang_lst = list(graduates_df['Bidang'].unique())

# Display the majors
bidang_lst

['KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN',
 'KESIHATAN DAN KEBAJIKAN',
 'PENDIDIKAN',
 'PERKHIDMATAN',
 'PERTANIAN DAN VETERINAR',
 'PROGRAM ASAS',
 'SAINS SOSIAL, PERNIAGAAN DAN PERUNDANGAN',
 'SAINS, MATEMATIK DAN KOMPUTER',
 'SASTERA DAN KEMANUSIAAN',
 'KESELURUHAN']

In [15]:
# Bahasa Malaysia -> English translation of the study-field names
trans_dict = {
    'KESELURUHAN': 'Total',
    'PENDIDIKAN': 'Education',
    'PERKHIDMATAN': 'Service',
    'PROGRAM ASAS': 'Basic Program',
    'KESIHATAN DAN KEBAJIKAN': 'Health and Welfare',
    'SASTERA DAN KEMANUSIAAN': 'Literature and Humanity',
    'PERTANIAN DAN VETERINAR': 'Agriculture and Veterinary',
    'SAINS, MATEMATIK DAN KOMPUTER': 'Science, Maths, and Computer',
    'SAINS SOSIAL, PERNIAGAAN DAN PERUNDANGAN': 'Social Science, Business, and Laws',
    'KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN': 'Engineering, Manufacturing and Construction',
}

# English field names, listed in the column order used for the processed dataframe
field_lst = [
    'Engineering, Manufacturing and Construction',
    'Health and Welfare',
    'Education',
    'Service',
    'Agriculture and Veterinary',
    'Basic Program',
    'Social Science, Business, and Laws',
    'Science, Maths, and Computer',
    'Literature and Humanity',
    'Total',
]

In [16]:
# Report how many universities were found, then list them
print(f"{len(ua_lst)}")
ua_lst

20


['UM',
 'UTHM',
 'UTeM',
 'UMT',
 'UMP',
 'UniSZA',
 'USM',
 'UNIMAP',
 'UPM',
 'USIM',
 'UiTM',
 'UTM',
 'UIAM',
 'UUM',
 'UMS',
 'UPSI',
 'UNIMAS',
 'UKM',
 'UMK',
 'UPNM']

In [17]:
# Hand-maintained lookup from university code to the state it is located in.
# There must be a better way to handle this but this way is quicker.
uni_dict = {
    'UM': "W.P. Kuala Lumpur",
    'UTHM': "Johor",
    'UTeM': "Melaka",
    'UMT': "Terengganu",
    'UMP': "Pahang",
    'UniSZA': "Terengganu",
    'USM': "Pulau Pinang",
    'UNIMAP': "Perlis",
    'UPM': "Selangor",
    'USIM': "Negeri Sembilan",
    'UiTM': "Selangor",
    'UTM': "Johor",
    'UIAM': "W.P. Kuala Lumpur",
    'UUM': "Kedah",
    'UMS': "Sabah",
    'UPSI': "Perak",
    'UNIMAS': "Sarawak",
    'UKM': "Selangor",
    'UMK': "Kelantan",
    'UPNM': "W.P. Kuala Lumpur",
}

# Print the number of universities that have a state assigned
print(len(uni_dict))

20


In [18]:
# Nested lookup: English field name -> {university: number of graduates}
new_dict = {}

# For every field, record each university's 'Output 2015' value,
# falling back to 0 when the university has no row for that field
for bidang in bidang_lst:
    current_field_df = graduates_df[graduates_df['Bidang'] == bidang]

    # Test membership against the index itself, looked up once per field,
    # instead of rebuilding a list of all index labels for every university
    universities_in_field = current_field_df.index

    number_grads = {}
    for university in ua_lst:
        if university in universities_in_field:
            value = current_field_df.loc[university, 'Output 2015']
        else:
            value = 0
        number_grads[university] = value

    # Store the per-university counts under the translated field name
    new_dict[trans_dict.get(bidang)] = number_grads

# Show the number of fields and one sample entry of the dictionary
print(len(new_dict))
print(new_dict.get('Literature and Humanity'))


10
{'UM': 974, 'UTHM': 0, 'UTeM': 0, 'UMT': 0, 'UMP': 4, 'UniSZA': 808, 'USM': 718, 'UNIMAP': 0, 'UPM': 763, 'USIM': 10, 'UiTM': 3397, 'UTM': 33, 'UIAM': 741, 'UUM': 69, 'UMS': 321, 'UPSI': 497, 'UNIMAS': 311, 'UKM': 612, 'UMK': 256, 'UPNM': 0}


In [19]:
# Build the per-university dataframe in a single constructor call instead of
# creating zero-filled columns and overwriting them cell by cell.
# Rows follow ua_lst and are indexed by "Public University";
# one column is created per field, in field_lst order.
processed_grad_df = pd.DataFrame(
    {
        field: [new_dict.get(field).get(university) for university in ua_lst]
        for field in field_lst
    },
    index=pd.Index(ua_lst, name="Public University"),
)

# Add the state of each university as the first column. The column is
# created directly from the state names rather than initialised to 0 and
# overwritten: writing strings into an integer column is an
# incompatible-dtype assignment that recent pandas versions deprecate.
processed_grad_df.insert(
    0, 'State', [uni_dict.get(university) for university in ua_lst]
)

# Show the newly processed dataframe
processed_grad_df

Unnamed: 0_level_0,State,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,Basic Program,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total
Public University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
UM,W.P. Kuala Lumpur,931,806,374,48,0,0,1345,1850,974,6328
UTHM,Johor,2799,0,846,0,0,0,277,440,0,4362
UTeM,Melaka,1782,0,0,0,0,0,220,426,0,2428
UMT,Terengganu,123,236,0,401,370,0,395,792,0,2317
UMP,Pahang,1544,0,0,70,0,0,190,314,4,2122
UniSZA,Terengganu,75,200,343,0,79,0,763,132,808,2400
USM,Pulau Pinang,1703,1229,575,58,0,0,1591,869,718,6743
UNIMAP,Perlis,2062,0,0,0,0,0,376,14,0,2452
UPM,Selangor,1269,313,895,93,1212,0,1231,1306,763,7082
USIM,Negeri Sembilan,11,183,3,0,0,0,382,304,10,893


In [20]:
# Group rows by State since information about universities is not needed;
# the numeric columns of universities in the same state are summed
grouped_grad_df = processed_grad_df.groupby('State').sum()

# Drop the column 'Basic Program' since there are no values in it.
# The axis is named with the `columns=` keyword: passing it positionally
# (`drop(label, 1)`) is rejected by pandas 2.0 and later.
grouped_grad_df = grouped_grad_df.drop(columns='Basic Program')

# Drop any row that contains NaN values
preprocessed_grad_df = grouped_grad_df.dropna()

# Show the shape and contents of the preprocessed dataframe
print(preprocessed_grad_df.shape)
preprocessed_grad_df

(14, 9)


Unnamed: 0_level_0,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Johor,6723,0,1589,101,0,1167,1746,33,11359
Kedah,0,98,959,205,0,4856,522,69,6709
Kelantan,97,50,0,149,115,275,120,256,1062
Melaka,1782,0,0,0,0,220,426,0,2428
Negeri Sembilan,11,183,3,0,0,382,304,10,893
Pahang,1544,0,0,70,0,190,314,4,2122
Perak,0,445,9871,1,0,90,903,497,11807
Perlis,2062,0,0,0,0,376,14,0,2452
Pulau Pinang,1703,1229,575,58,0,1591,869,718,6743
Sabah,609,161,149,173,113,1877,661,321,4064


In [22]:
# Give every row the value 2015 under the column 'Year' so that the data
# won't be confused with data from another year.
# `assign` returns a new frame, so the frame produced by `dropna()` is not
# modified in place (in-place column assignment on such a frame can raise
# SettingWithCopyWarning); the name is rebound so it still carries 'Year'.
preprocessed_grad_df = preprocessed_grad_df.assign(Year=2015)

# Keep an independent copy under a year-specific name to avoid confusion
# in the dataset integration stage
preprocessed_grad_2015_df = preprocessed_grad_df.copy()

# Show the dataframe
preprocessed_grad_2015_df

Unnamed: 0_level_0,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total,Year
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Johor,6723,0,1589,101,0,1167,1746,33,11359,2015
Kedah,0,98,959,205,0,4856,522,69,6709,2015
Kelantan,97,50,0,149,115,275,120,256,1062,2015
Melaka,1782,0,0,0,0,220,426,0,2428,2015
Negeri Sembilan,11,183,3,0,0,382,304,10,893,2015
Pahang,1544,0,0,70,0,190,314,4,2122,2015
Perak,0,445,9871,1,0,90,903,497,11807,2015
Perlis,2062,0,0,0,0,376,14,0,2452,2015
Pulau Pinang,1703,1229,575,58,0,1591,869,718,6743,2015
Sabah,609,161,149,173,113,1877,661,321,4064,2015
