# Data Pre-processing

In [2]:
import pandas as pd 


In [3]:
# Read both semicolon-separated student CSV files and stack their rows into
# one DataFrame; ignore_index=True renumbers the rows from 0 instead of
# keeping each file's own row labels.
data = pd.concat(
    [
        pd.read_csv(csv_path, sep=';')
        for csv_path in ('./student/student-mat.csv',
                         './student/student-por.csv')
    ],
    ignore_index=True,
)

data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
1040,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
1041,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
1042,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


### Removing inconsistent data

In [4]:
# Drop the 'school' and 'guardian' columns from `data`; all other columns
# are left untouched. drop() returns a new DataFrame, which is rebound to
# the same name.
data = data.drop(['school', 'guardian'], axis='columns')

### Encoding non-numeric data

In [5]:
# Names of the non-numeric columns that are handed to the encoder.
# The order here is the order in which the columns are processed.
data_columns_to_encode = [
    'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

In [8]:
def encode(df: pd.DataFrame,
           columns: list,
           copy: bool = False) -> tuple[pd.DataFrame, dict]:
    """Replace the values of the given columns with integer codes.

    For every column in ``columns`` the distinct values are collected in
    order of first appearance; each value is then replaced by its position
    in that list (the first distinct value becomes 0, the next 1, ...).

    Args:
        df: Frame whose columns are encoded.
        columns: Names of the columns to encode. A name that is not a
            column of ``df`` raises ``KeyError`` from the column lookup.
        copy: When ``False`` (the default) ``df`` itself is modified and
            returned, so the caller's frame is overwritten with the codes.
            When ``True`` the encoding is applied to a copy and the frame
            passed in is left unchanged.

    Returns:
        A ``(frame, encode_vocab)`` tuple. ``frame`` is the encoded frame
        (``df`` itself unless ``copy=True``). ``encode_vocab`` maps each
        encoded column name to its list of distinct original values, where
        ``encode_vocab[col][code]`` is the value that ``code`` stands for.

    Note:
        With ``copy=False``, calling this a second time on the same frame
        (for example by re-running a notebook cell) encodes the integer
        codes again, so the returned vocabularies then hold integers rather
        than the original values. Missing values get no special treatment.
    """
    if copy:
        # Work on a private copy so the caller's frame keeps its raw values.
        df = df.copy()

    encode_vocab = {}

    for col in columns:
        # unique() lists values in order of appearance; that order defines
        # the integer code of each value.
        vocab = df[col].unique().tolist()

        # Build the value -> code table once per column instead of scanning
        # the vocabulary list for every row.
        code_of = {value: code for code, value in enumerate(vocab)}
        df[col] = df[col].apply(code_of.__getitem__)

        encode_vocab[col] = vocab

    return df, encode_vocab

In [9]:
# Encode the selected columns of `data`. `x` is the tuple returned by
# encode(): the encoded DataFrame followed by the per-column vocabulary dict.
x = encode(df=data, columns=data_columns_to_encode)
print(x)

(     sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  reason  \
0      F   18       U     GT3       A     4     4   at_home   teacher  course   
1      F   17       U     GT3       T     1     1   at_home     other  course   
2      F   15       U     LE3       T     1     1   at_home     other   other   
3      F   15       U     GT3       T     4     2    health  services    home   
4      F   16       U     GT3       T     3     3     other     other    home   
...   ..  ...     ...     ...     ...   ...   ...       ...       ...     ...   
1039   F   19       R     GT3       T     2     3  services     other  course   
1040   F   18       U     LE3       T     3     1   teacher  services  course   
1041   F   18       U     GT3       T     1     1     other     other  course   
1042   M   17       U     LE3       T     3     1  services  services  course   
1043   M   18       R     LE3       T     3     2  services     other  course   

      ...  famrel  freetim