# Processing of the data based on the EDA

In [40]:
# Imports
import pandas as pd
from sklearn.decomposition import PCA

In [41]:
# Read the initial data (semicolon-separated CSV)
data = pd.read_csv('data/data.csv', sep=";")

# Binarise the target: 'Graduate' -> 1, every other value -> 0. This merges the
# 'Enrolled' and 'Dropout' classes in order to balance the data.
# An explicit comparison + cast is used instead of Series.replace() with a
# str -> int mapping, because the implicit object -> int downcast performed by
# replace() is deprecated (FutureWarning since pandas 2.2).
data['Target'] = data['Target'].eq('Graduate').astype('int64')

# Transform the column headers into something more Python friendly.
# NOTE: `formatted_data` is an alias of `data` (same object, not a copy), so
# the renaming below is visible through both names.
formatted_data = data
# Spaces and '/' become '_'; apostrophes and parentheses are removed. Any
# remaining leading/trailing whitespace is stripped afterwards.
_header_cleanup = str.maketrans({' ': '_', '/': '_', "'": None, '(': None, ')': None})
formatted_data.columns = [c.translate(_header_cleanup).strip()
                          for c in formatted_data.columns]


In [42]:
# Feature Engineering
# Compress the 12 columns at positions 21..32 into 5 principal components.
# NOTE(review): the selection is positional and presumably targets the
# per-semester curricular-unit columns -- confirm against the CSV layout.
semester_cols = formatted_data.iloc[:, list(range(21, 33))]
pca = PCA(n_components=5)
pca.fit(semester_cols)
principal_components = pca.transform(semester_cols)
# Reuse the source index so the column-wise concat below aligns row-for-row
# even if the frame does not carry a default RangeIndex.
pc_df = pd.DataFrame(
    data=principal_components,
    columns=['CU1', 'CU2', 'CU3', 'CU4', 'CU5'],
    index=formatted_data.index,
)

# Drop the original columns (in place, so any alias of this frame sees it too)
formatted_data.drop(columns=semester_cols.columns, inplace=True)

# Concatenate the new columns to the dataset. The frame that was just trimmed
# is referenced explicitly instead of relying on `data` being the same object.
formatted_data = pd.concat([formatted_data, pc_df], axis=1)

In [43]:
# Persist the processed dataset as a semicolon-separated CSV, without the row index
formatted_data.to_csv(path_or_buf='data/formatted_data.csv', index=False, sep=';')

# Last expression of the cell: renders the resulting DataFrame in the notebook
formatted_data

Unnamed: 0,Marital_status,Application_mode,Application_order,Course,Daytime_evening_attendance,Previous_qualification,Previous_qualification_grade,Nacionality,Mothers_qualification,Fathers_qualification,...,International,Unemployment_rate,Inflation_rate,GDP,Target,CU1,CU2,CU3,CU4,CU5
0,1,17,5,171,1,1,122.0,1,19,12,...,0,10.8,1.4,1.74,0,-21.116404,-0.951593,2.556893,-0.680228,0.526105
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,13.9,-0.3,0.79,1,2.514829,-5.255275,1.815774,0.666776,0.041471
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,10.8,1.4,1.74,0,-18.826232,1.829553,5.332664,-0.205698,-0.782959
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,9.4,-0.8,-3.12,1,3.313201,-2.063552,-1.150530,0.047209,-1.458712
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,13.9,-0.3,0.79,1,2.076791,-2.879578,0.560910,0.220850,2.235950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,15.5,2.8,-4.06,1,2.211839,-3.460857,-0.198407,0.433131,-0.666139
4420,1,1,2,9773,1,1,120.0,105,1,1,...,1,11.1,0.6,2.02,0,-0.987310,-3.030782,0.704106,1.313926,-0.402473
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,13.9,-0.3,0.79,0,3.934767,-2.642186,-1.425250,1.567264,-1.314400
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,9.4,-0.8,-3.12,1,0.240643,-5.324014,1.059441,1.251771,-0.722761


## Pre-processing features

| Action                                      | Status  |
|---------------------------------------------|---------|
| Merge target dropout with enrolled          | &#9745; |
| Target to binary (0, 1)                     | &#9745; |
| Drop Nationality                            | &#9746; |
| Drop Marital Status                         | &#9746; |
| Drop Previous Qualification                 | &#9746; |
| Normalize Previous Qualification grade      | &#63;   |
| Split mean Previous Qualification grade     | &#63;   |
| Investigate contribution Inflation Rate     | &#63;   |
| Investigate contribution GDP                | &#63;   |
| Investigate contribution Unemployment Rate  | &#63;   |
| Investigate contribution Admission Grade    | &#63;   |
| Feature Eng. Application Mode               | &#9746; |
| Drop Educational Special Needs              | &#9746; |
| Feature Eng. Courses                        | &#9746; |
| Drop Daytime/Evening                        | &#9746; |
| Drop International                          | &#9746; |
| Combine Mother-Father qualification         | &#9746; |
| Combine Mother-Father occupation            | &#9746; |
| Combine curricular units                    | &#9746; |
| Application Order to dummies                | &#63;   |
| Drop Application Order                      | &#63;   |
| Drop / investigate contribution Debtor      | &#63;   |
| Drop / investigate Tuition Fees up to date  | &#63;   |
| Drop / investigate Gender                   | &#63;   |
| Feature Eng. curricular units               | &#63;   |
| Feature Eng. Age at Enrollment              | &#63;   |

