In [81]:
import pandas as pd
import numpy  as np

# Load the 2014 graduate-output dataset from CSV
graduates_df = pd.read_csv("o-bidang-2014.csv")

# Collect the distinct public universities (column 'UA') in first-seen order;
# this must happen before 'UA' is moved out of the columns below
ua_lst = graduates_df['UA'].drop_duplicates().tolist()

# Re-index the dataframe by university
graduates_df = graduates_df.set_index("UA")

In [82]:
# Report the dataframe's (rows, columns) shape, then preview its first five rows
print(graduates_df.shape)
graduates_df.head(5)

(153, 2)


Unnamed: 0_level_0,Bidang,Output2014
UA,Unnamed: 1_level_1,Unnamed: 2_level_1
UM,"KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN",986.0
UM,KESIHATAN DAN KEBAJIKAN,750.0
UM,PENDIDIKAN,458.0
UM,PERKHIDMATAN,54.0
UM,PROGRAM ASAS,


In [83]:
# Collect the distinct fields of study (column 'Bidang') in first-seen order
bidang_lst = graduates_df['Bidang'].drop_duplicates().tolist()

# Display the fields of study
bidang_lst

['KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN',
 'KESIHATAN DAN KEBAJIKAN',
 'PENDIDIKAN',
 'PERKHIDMATAN',
 'PROGRAM ASAS',
 'SAINS SOSIAL, PERNIAGAAN DAN PERUNDANGAN',
 'SAINS, MATEMATIK DAN KOMPUTER',
 'SASTERA DAN KEMANUSIAAN',
 'KESELURUHAN',
 'PERTANIAN DAN VETERINAR']

In [84]:
# Bahasa Malaysia -> English lookup for the field-of-study labels
trans_dict = {
    'KESELURUHAN': 'Total',
    'PENDIDIKAN': 'Education',
    'PERKHIDMATAN': 'Service',
    'PROGRAM ASAS': 'Basic Program',
    'KESIHATAN DAN KEBAJIKAN': 'Health and Welfare',
    'SASTERA DAN KEMANUSIAAN': 'Literature and Humanity',
    'PERTANIAN DAN VETERINAR': 'Agriculture and Veterinary',
    'SAINS, MATEMATIK DAN KOMPUTER': 'Science, Maths, and Computer',
    'SAINS SOSIAL, PERNIAGAAN DAN PERUNDANGAN': 'Social Science, Business, and Laws',
    'KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN': 'Engineering, Manufacturing and Construction',
}

# English field names in a fixed order, with 'Total' last.
# Derived from trans_dict so the two can never drift apart.
field_lst = [
    trans_dict[malay_name]
    for malay_name in (
        'KEJURUTERAAN, PEMBUATAN DAN PEMBINAAN',
        'KESIHATAN DAN KEBAJIKAN',
        'PENDIDIKAN',
        'PERKHIDMATAN',
        'PERTANIAN DAN VETERINAR',
        'PROGRAM ASAS',
        'SAINS SOSIAL, PERNIAGAAN DAN PERUNDANGAN',
        'SAINS, MATEMATIK DAN KOMPUTER',
        'SASTERA DAN KEMANUSIAAN',
        'KESELURUHAN',
    )
]

In [85]:
# Print how many distinct universities were found, then display the list itself
print(len(ua_lst))
ua_lst

20


['UM',
 'UTHM',
 'UTeM',
 'UMT',
 'UMP',
 'UniSZA',
 'USM',
 'UNIMAP',
 'UPM',
 'USIM',
 'UiTM',
 'UTM',
 'UIAM',
 'UUM',
 'UMS',
 'UPSI',
 'UNIMAS',
 'UKM',
 'UMK',
 'UPNM']

In [86]:
# Hand-maintained lookup: university abbreviation -> Malaysian state (or federal
# territory) where it is located. Hard-coded for speed; a proper reference table
# would be a better long-term source.
uni_dict = {
    'UM': "W.P. Kuala Lumpur",
    'UTHM': "Johor",
    'UTeM': "Melaka",
    'UMT': "Terengganu",
    'UMP': "Pahang",
    'UniSZA': "Terengganu",
    'USM': "Pulau Pinang",
    'UNIMAP': "Perlis",
    'UPM': "Selangor",
    'USIM': "Negeri Sembilan",
    'UiTM': "Selangor",
    'UTM': "Johor",
    'UIAM': "W.P. Kuala Lumpur",
    'UUM': "Kedah",
    'UMS': "Sabah",
    'UPSI': "Perak",
    'UNIMAS': "Sarawak",
    'UKM': "Selangor",
    'UMK': "Kelantan",
    'UPNM': "W.P. Kuala Lumpur",
}

# Sanity check: number of universities that have a state assigned
print(len(uni_dict))

20


In [87]:
# Nested lookup: English field name -> {university -> 'Output2014' value}
new_dict = {}

# For every field of study, record each university's 'Output2014' value.
# A university with no row for the field gets 0; a row whose value is NaN
# keeps its NaN.
for bidang in bidang_lst:
    # Fail at the source if a field has no translation, instead of silently
    # filing it under the key None and breaking a later lookup.
    if bidang not in trans_dict:
        raise KeyError(f"No English translation defined for field {bidang!r}")

    # Rows belonging to the current field (the dataframe is indexed by university)
    current_field_df = graduates_df[graduates_df['Bidang'] == bidang]

    # Build the membership set once per field rather than once per university
    universities_present = set(current_field_df.index)

    number_grads = {}
    for university in ua_lst:
        if university in universities_present:
            value = current_field_df.loc[university, 'Output2014']
        else:
            value = 0
        number_grads[university] = value
    new_dict[trans_dict[bidang]] = number_grads

# Show the number of fields and one sample entry
print(len(new_dict))
print(new_dict.get('Literature and Humanity'))


10
{'UM': 1020.0, 'UTHM': 0, 'UTeM': 0, 'UMT': nan, 'UMP': 2.0, 'UniSZA': 719.0, 'USM': 714.0, 'UNIMAP': nan, 'UPM': 403.0, 'USIM': 847.0, 'UiTM': 4497.0, 'UTM': 62.0, 'UIAM': 1655.0, 'UUM': 84.0, 'UMS': 303.0, 'UPSI': 266.0, 'UNIMAS': 299.0, 'UKM': 915.0, 'UMK': 130.0, 'UPNM': 0}


In [88]:
# Build the per-university table in a single step: one row per university
# (index 'Public University'), one column per English field name.
# Constructing the frame directly from the nested dict avoids pre-filling the
# columns with integer 0 and then overwriting them cell by cell with strings
# and floats; that incompatible-dtype upcast is deprecated since pandas 2.1.
processed_grad_df = pd.DataFrame(
    {field: new_dict[field] for field in field_lst},
    index=pd.Index(ua_lst, name="Public University"),
).astype(float)  # counts may contain NaN, so keep every field column as float

# Put the state of each university in the first column.
# .get() leaves None for any university missing from uni_dict.
processed_grad_df.insert(
    0, 'State', [uni_dict.get(university) for university in ua_lst]
)

# Show the newly processed dataframe
processed_grad_df

Unnamed: 0_level_0,State,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,Basic Program,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total
Public University,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
UM,W.P. Kuala Lumpur,986.0,750.0,458.0,54.0,0.0,,1432.0,1934.0,1020.0,6634.0
UTHM,Johor,2827.0,0.0,170.0,0.0,0.0,0.0,204.0,266.0,0.0,3467.0
UTeM,Melaka,1603.0,0.0,0.0,0.0,0.0,0.0,224.0,161.0,0.0,1988.0
UMT,Terengganu,117.0,220.0,0.0,167.0,375.0,0.0,378.0,747.0,,2004.0
UMP,Pahang,1473.0,0.0,0.0,36.0,0.0,0.0,175.0,362.0,2.0,2048.0
UniSZA,Terengganu,63.0,144.0,66.0,0.0,55.0,0.0,810.0,93.0,719.0,1950.0
USM,Pulau Pinang,1577.0,1143.0,498.0,54.0,0.0,,1569.0,826.0,714.0,6381.0
UNIMAP,Perlis,1473.0,,0.0,0.0,,,325.0,4.0,,1802.0
UPM,Selangor,1269.0,283.0,748.0,76.0,1438.0,0.0,1289.0,1150.0,403.0,6656.0
USIM,Negeri Sembilan,166.0,269.0,7.0,0.0,0.0,0.0,792.0,370.0,847.0,2451.0


In [89]:
# Aggregate the university rows into one row per state by summing every field
# column; per-university detail is not needed from here on.
# Note: sum() skips NaN by default, so a missing count contributes 0 to its state.
grouped_grad_df = processed_grad_df.groupby('State').sum()

# Drop the 'Basic Program' column (the rows displayed above show only 0 or NaN in it).
# Use the `columns=` keyword: passing the axis positionally, as in
# drop('Basic Program', 1), was removed in pandas 2.0 and raises TypeError.
grouped_grad_df = grouped_grad_df.drop(columns='Basic Program')

# Drop any row that still contains NaN values. Take an explicit copy so that
# columns added later are written to an independent frame.
preprocessed_grad_df = grouped_grad_df.dropna().copy()

# Show the preprocessed dataframe
print(preprocessed_grad_df.shape)
preprocessed_grad_df

(14, 9)


Unnamed: 0_level_0,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Johor,6641.0,0.0,896.0,114.0,0.0,1117.0,1701.0,62.0,10531.0
Kedah,0.0,77.0,345.0,212.0,0.0,5417.0,467.0,84.0,6602.0
Kelantan,76.0,33.0,0.0,64.0,92.0,101.0,74.0,130.0,570.0
Melaka,1603.0,0.0,0.0,0.0,0.0,224.0,161.0,0.0,1988.0
Negeri Sembilan,166.0,269.0,7.0,0.0,0.0,792.0,370.0,847.0,2451.0
Pahang,1473.0,0.0,0.0,36.0,0.0,175.0,362.0,2.0,2048.0
Perak,0.0,131.0,3603.0,0.0,0.0,20.0,306.0,266.0,4326.0
Perlis,1473.0,0.0,0.0,0.0,0.0,325.0,4.0,0.0,1802.0
Pulau Pinang,1577.0,1143.0,498.0,54.0,0.0,1569.0,826.0,714.0,6381.0
Sabah,480.0,176.0,193.0,172.0,172.0,1909.0,580.0,303.0,3985.0


In [91]:
# Tag every row with the reporting year so that this data cannot be confused
# with data from another year.
preprocessed_grad_df['Year'] = 2014

# Keep an independent, year-specific copy under an unambiguous name for the
# dataset integration stage
preprocessed_grad_2014_df = preprocessed_grad_df.copy(deep=True)

# Show the dataframe
preprocessed_grad_2014_df

Unnamed: 0_level_0,"Engineering, Manufacturing and Construction",Health and Welfare,Education,Service,Agriculture and Veterinary,"Social Science, Business, and Laws","Science, Maths, and Computer",Literature and Humanity,Total,Year
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Johor,6641.0,0.0,896.0,114.0,0.0,1117.0,1701.0,62.0,10531.0,2014
Kedah,0.0,77.0,345.0,212.0,0.0,5417.0,467.0,84.0,6602.0,2014
Kelantan,76.0,33.0,0.0,64.0,92.0,101.0,74.0,130.0,570.0,2014
Melaka,1603.0,0.0,0.0,0.0,0.0,224.0,161.0,0.0,1988.0,2014
Negeri Sembilan,166.0,269.0,7.0,0.0,0.0,792.0,370.0,847.0,2451.0,2014
Pahang,1473.0,0.0,0.0,36.0,0.0,175.0,362.0,2.0,2048.0,2014
Perak,0.0,131.0,3603.0,0.0,0.0,20.0,306.0,266.0,4326.0,2014
Perlis,1473.0,0.0,0.0,0.0,0.0,325.0,4.0,0.0,1802.0,2014
Pulau Pinang,1577.0,1143.0,498.0,54.0,0.0,1569.0,826.0,714.0,6381.0,2014
Sabah,480.0,176.0,193.0,172.0,172.0,1909.0,580.0,303.0,3985.0,2014
