In [5]:
import pandas as pd
import numpy  as np

# Load the graduate output table from its CSV file
graduates_df = pd.read_csv("o-bidang-2013.csv")

# Collect every distinct university code found in the 'UA' column,
# in order of first appearance
distinct_universities = graduates_df['UA'].unique()
ua_lst = distinct_universities.tolist()

# Re-index the table by university code (rebinds the name rather than
# mutating the frame in place)
graduates_df = graduates_df.set_index("UA")

In [6]:
# Report the (rows, columns) dimensions, then preview the first 5 rows
row_count, column_count = graduates_df.shape
print((row_count, column_count))
graduates_df.head(5)

(151, 2)


Unnamed: 0_level_0,BIDANG,Output2013
UA,Unnamed: 1_level_1,Unnamed: 2_level_1
UM,"KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN",914
UM,KESIHATAN DAN KEBAJIKAN,742
UM,PENDIDIKAN,466
UM,PERKHIDMATAN,72
UM,PROGRAM ASAS,0


In [13]:
# Collect every distinct study major found in the 'BIDANG' column,
# in order of first appearance
distinct_majors = graduates_df['BIDANG'].unique()
bidang_lst = distinct_majors.tolist()

# Display the collected majors
bidang_lst

['KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN',
 'KESIHATAN DAN KEBAJIKAN',
 'PENDIDIKAN',
 'PERKHIDMATAN',
 'PROGRAM ASAS',
 'SAINS SOSIAL, PERNIAGAAN DAN PERUNDANGAN',
 'SAINS, MATEMATIK DAN KOMPUTER',
 'SASTERA DAN KEMANUSIAAN',
 'KESELURUHAN',
 'PERTANIAN DAN VETERINAR']

In [14]:
# Bahasa Malaysia field name -> English field name
trans_dict = {
    'KESELURUHAN': 'Total',
    'PENDIDIKAN': 'Education',
    'PERKHIDMATAN': 'Service',
    'PROGRAM ASAS': 'Basic Program',
    'KESIHATAN DAN KEBAJIKAN': 'Health and Welfare',
    'SASTERA DAN KEMANUSIAAN': 'Literature and Humanity',
    'PERTANIAN DAN VETERINAR': 'Agriculture and Veterinary',
    'SAINS, MATEMATIK DAN KOMPUTER': 'Science, Maths, and Computer',
    'SAINS SOSIAL, PERNIAGAAN DAN PERUNDANGAN': 'Social Science, Business, and Laws',
    'KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN': 'Engineering, Manufacturing and Construction',
}

# English field names in the column order used for the processed dataframe;
# every entry must match one of the translated values above
field_lst = [
    'Engineering, Manufacturing and Construction',
    'Health and Welfare',
    'Education',
    'Service',
    'Agriculture and Veterinary',
    'Basic Program',
    'Social Science, Business, and Laws',
    'Science, Maths, and Computer',
    'Literature and Humanity',
    'Total',
]

In [15]:
# Report how many universities were found, then list their codes
university_count = len(ua_lst)
print(university_count)
ua_lst

20


['UM',
 'UTHM',
 'UTeM',
 'UMT',
 'UMP',
 'UniSZA',
 'USM',
 'UNIMAP',
 'UPM',
 'USIM',
 'UiTM',
 'UTM',
 'UIAM',
 'UUM',
 'UMS',
 'UPSI',
 'UNIMAS',
 'UKM',
 'UMK',
 'UPNM']

In [16]:
# University code -> state where the university is located.
# Entered by hand for quick manipulation; a lookup table loaded from a
# file would be a cleaner long-term solution.
uni_dict = {
    'UM': "W.P. Kuala Lumpur",
    'UTHM': "Johor",
    'UTeM': "Melaka",
    'UMT': "Terengganu",
    'UMP': "Pahang",
    'UniSZA': "Terengganu",
    'USM': "Pulau Pinang",
    'UNIMAP': "Perlis",
    'UPM': "Selangor",
    'USIM': "Negeri Sembilan",
    'UiTM': "Selangor",
    'UTM': "Johor",
    'UIAM': "W.P. Kuala Lumpur",
    'UUM': "Kedah",
    'UMS': "Sabah",
    'UPSI': "Perak",
    'UNIMAS': "Sarawak",
    'UKM': "Selangor",
    'UMK': "Kelantan",
    'UPNM': "W.P. Kuala Lumpur",
}

# Sanity check: number of universities that have a state assigned
print(len(uni_dict))

20


In [17]:
# Nested mapping: English field name -> {university code -> 'Output2013' value}
new_dict = {}

for bidang in bidang_lst:
    # Rows belonging to this field only; the frame's index holds the
    # university codes
    field_rows = graduates_df[graduates_df['BIDANG'] == bidang]
    universities_present = field_rows.index.tolist()

    # A university with no row for this field is recorded with 0
    new_dict[trans_dict.get(bidang)] = {
        university: (
            field_rows.loc[university, 'Output2013']
            if university in universities_present
            else 0
        )
        for university in ua_lst
    }

# Show the number of fields collected and one sample entry
print(len(new_dict))
print(new_dict.get('Literature and Humanity'))


10
{'UM': 605, 'UTHM': 0, 'UTeM': 0, 'UMT': 11, 'UMP': 0, 'UniSZA': 423, 'USM': 692, 'UNIMAP': 0, 'UPM': 860, 'USIM': 737, 'UiTM': 4203, 'UTM': 39, 'UIAM': 1109, 'UUM': 51, 'UMS': 286, 'UPSI': 70, 'UNIMAS': 278, 'UKM': 981, 'UMK': 147, 'UPNM': 0}


In [18]:
# Build the university-by-field table. Every column is created directly with
# its final values instead of being pre-filled with the integer 0 and then
# overwritten cell by cell: writing the state names (strings) into an
# integer column is an incompatible-dtype assignment that recent pandas
# versions warn about and plan to reject.
processed_grad_df = pd.DataFrame(index=pd.Index(ua_lst, name="Public University"))

# State in which each university is located (None when a code is missing
# from uni_dict)
processed_grad_df['State'] = [uni_dict.get(university) for university in ua_lst]

# One column per field, in field_lst order, holding the value recorded for
# each university in new_dict
for field in field_lst:
    grads_by_university = new_dict.get(field)
    processed_grad_df[field] = [
        grads_by_university.get(university) for university in ua_lst
    ]

# Show the newly processed dataframe
processed_grad_df

Unnamed: 0_level_0,State,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,Basic Program,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total
Public University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
UM,W.P. Kuala Lumpur,914,742,466,72,0,0,1115,1612,605,5526
UTHM,Johor,2262,0,764,0,0,0,168,332,0,3526
UTeM,Melaka,1488,0,0,0,0,0,175,129,0,1792
UMT,Terengganu,116,204,0,254,317,0,386,680,11,1968
UMP,Pahang,1317,0,0,27,0,0,116,265,0,1725
UniSZA,Terengganu,56,105,206,0,69,0,507,70,423,1436
USM,Pulau Pinang,1576,1034,813,41,0,0,1410,793,692,6359
UNIMAP,Perlis,1555,0,0,0,0,0,315,0,0,1870
UPM,Selangor,1406,298,938,61,1545,35,1377,1205,860,7725
USIM,Negeri Sembilan,87,102,68,0,0,0,452,53,737,1499


In [19]:
# Group rows by State since information about universities is not needed
grouped_grad_df = processed_grad_df.groupby('State').sum()

# Drop the 'Basic Program' column. It is passed by keyword because the
# positional form drop(label, 1) is no longer accepted by pandas 2.x.
# NOTE(review): the column is not completely empty -- in the rows displayed
# for processed_grad_df, UPM reports 35 -- and 'Total' appears to still
# include those graduates after the column is removed; confirm this is intended.
grouped_grad_df = grouped_grad_df.drop(columns='Basic Program')

# Drop any row that contains NaN values
preprocessed_grad_df = grouped_grad_df.dropna()

# Show the preprocessed df
print(preprocessed_grad_df.shape)
preprocessed_grad_df

(14, 9)


Unnamed: 0_level_0,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Johor,5773,0,1780,71,0,679,1227,39,9569
Kedah,0,106,876,280,0,5558,529,51,7400
Kelantan,66,22,0,64,74,78,84,147,535
Melaka,1488,0,0,0,0,175,129,0,1792
Negeri Sembilan,87,102,68,0,0,452,53,737,1499
Pahang,1317,0,0,27,0,116,265,0,1725
Perak,0,397,5456,0,0,14,0,70,5937
Perlis,1555,0,0,0,0,315,0,0,1870
Pulau Pinang,1576,1034,813,41,0,1410,793,692,6359
Sabah,478,181,164,156,123,1836,539,286,3763


In [21]:
# Give every row the value 2013 under the column 'Year' so that the data
# won't be confused with data from another year. `assign` returns a new
# frame instead of writing into the result of `dropna()`, which avoids a
# possible SettingWithCopyWarning on some pandas versions and keeps the
# cell safe to re-run.
preprocessed_grad_df = preprocessed_grad_df.assign(Year=2013)

# Keep an independent, year-specific copy under its own name to avoid
# confusion in the dataset integration stage
preprocessed_grad_2013_df = preprocessed_grad_df.copy()

# Show the dataframe
preprocessed_grad_2013_df

Unnamed: 0_level_0,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total,Year
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Johor,5773,0,1780,71,0,679,1227,39,9569,2013
Kedah,0,106,876,280,0,5558,529,51,7400,2013
Kelantan,66,22,0,64,74,78,84,147,535,2013
Melaka,1488,0,0,0,0,175,129,0,1792,2013
Negeri Sembilan,87,102,68,0,0,452,53,737,1499,2013
Pahang,1317,0,0,27,0,116,265,0,1725,2013
Perak,0,397,5456,0,0,14,0,70,5937,2013
Perlis,1555,0,0,0,0,315,0,0,1870,2013
Pulau Pinang,1576,1034,813,41,0,1410,793,692,6359,2013
Sabah,478,181,164,156,123,1836,539,286,3763,2013
