# Data Pre-processing

In [10]:
import pandas as pd 
import json
from sklearn.preprocessing import StandardScaler

In [4]:
# Read both semicolon-separated student CSV files and stack them row-wise
# into one DataFrame with a fresh 0..n-1 index.
csv_paths = ['./student/student-mat.csv', './student/student-por.csv']
frames = [pd.read_csv(path, sep=';') for path in csv_paths]
data = pd.concat(frames, ignore_index=True)

data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
1040,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
1041,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
1042,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


### Removing inconsistent data

In [5]:
# Drop the 'school' and 'guardian' columns from `data`.
data = data.drop(['school', 'guardian'], axis='columns')

### Encoding non-numeric data

In [6]:
# Names of the non-numeric columns that are converted to integer codes below.
data_columns_to_encode = [
    'sex', 'address', 'famsize', 'Pstatus',
    'Mjob', 'Fjob', 'reason',
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

In [7]:
def encode_dataframe(dataframe: pd.DataFrame, columns: list) -> tuple[pd.DataFrame, dict]:
    """
    Encodes specified columns of a DataFrame into numeric values based on their unique values.

    Each value is replaced by its position in the column's vocabulary, where the
    vocabulary lists the column's unique values in order of first appearance.
    The input DataFrame is left untouched; the encoding is applied to a copy.

    Parameters:
        dataframe (pd.DataFrame): The DataFrame to encode.
        columns (list): A list of column names to be encoded. A name listed more
            than once is encoded only once.

    Returns:
        tuple: A tuple containing:
            - A copy of the DataFrame with the specified columns encoded.
            - A dictionary where each key is a column name and the value is the vocabulary (list of unique values).

    Raises:
        KeyError: If a name in `columns` is not a column of `dataframe`.
    """
    # Work on a copy so the caller's DataFrame is never modified as a side effect.
    encoded = dataframe.copy()
    encoding_vocab = {}

    # dict.fromkeys() drops repeated names while keeping their order; encoding a
    # column twice would replace its vocabulary with the integer codes themselves.
    for column in dict.fromkeys(columns):
        # Vocabulary is always built from the original (un-encoded) values.
        unique_values = dataframe[column].unique().tolist()

        # Position in the vocabulary becomes the integer code.
        value_to_index = {value: idx for idx, value in enumerate(unique_values)}

        encoded[column] = dataframe[column].map(value_to_index)
        encoding_vocab[column] = unique_values

    return encoded, encoding_vocab


### Decode the encoded DataFrame

In [11]:
def decode_dataframe(dataframe: pd.DataFrame, encoding_vocab: dict) -> pd.DataFrame:
    """
    Decodes specified columns of a DataFrame from numeric values to their original values.

    Each integer code is replaced by the vocabulary entry at that position.
    Codes that have no entry in the vocabulary become NaN (the behaviour of
    `Series.map` for unmatched keys). The input DataFrame is left untouched;
    decoding is applied to a copy.

    Parameters:
        dataframe (pd.DataFrame): The DataFrame to decode.
        encoding_vocab (dict): A dictionary where each key is a column name and the value is the vocabulary (list of unique values).

    Returns:
        pd.DataFrame: A copy of the DataFrame with the specified columns decoded.

    Raises:
        KeyError: If a key of `encoding_vocab` is not a column of `dataframe`.
    """
    # Work on a copy so the caller's DataFrame is never modified as a side effect.
    decoded = dataframe.copy()

    for column, vocab in encoding_vocab.items():
        # Position in the vocabulary -> original value.
        index_to_value = dict(enumerate(vocab))
        decoded[column] = dataframe[column].map(index_to_value)

    return decoded

In [8]:
# Integer-encode the listed columns; `lookup_table` maps each encoded column
# to its vocabulary (the list of original values).
data, lookup_table = encode_dataframe(data, data_columns_to_encode)

# Persist the vocabularies as indented JSON so the encoding can be looked up later.
with open('lookup_table.json', 'w') as file:
    file.write(json.dumps(lookup_table, indent=4))

In [13]:
# Display the DataFrame after encoding.
data

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,18,0,0,0,4,4,0,0,0,...,4,3,4,1,1,3,6,5,6,6
1,0,17,0,0,1,1,1,0,1,0,...,5,3,3,1,1,3,4,5,5,6
2,0,15,0,1,1,1,1,0,1,1,...,4,3,2,2,3,3,10,7,8,10
3,0,15,0,0,1,4,2,1,2,2,...,3,2,2,1,1,5,2,15,14,15
4,0,16,0,0,1,3,3,2,1,2,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,0,19,1,0,1,2,3,3,1,0,...,5,4,2,1,2,5,4,10,11,10
1040,0,18,0,1,1,3,1,4,2,0,...,4,3,4,1,1,1,4,15,15,16
1041,0,18,0,0,1,1,1,2,1,0,...,1,1,1,1,1,5,6,11,12,9
1042,1,17,0,1,1,3,1,3,2,0,...,2,4,5,3,4,2,6,10,10,10


In [15]:
def normalize_data_zscore(df: pd.DataFrame) -> tuple[pd.DataFrame, dict, dict]:
    """
    Normalize the given DataFrame using Z-score (Standardization).

    Only columns whose dtype is ``int64`` or ``float64`` are standardized; every
    other column is carried over unchanged. The input DataFrame is left
    untouched; the result is a copy.

    Parameters:
        df (pd.DataFrame): The input DataFrame to be standardized.

    Returns:
        tuple:
            - pd.DataFrame: A standardized copy of the DataFrame.
            - dict: A dictionary containing the mean of each standardized column
              (taken from the scaler's ``mean_``).
            - dict: A dictionary containing the standard deviation of each
              standardized column (taken from the scaler's ``scale_``).
            Both dictionaries are empty when no column qualifies.
    """
    # Select columns to normalize (only int64/float64 columns)
    columns_to_normalize = df.select_dtypes(include=['int64', 'float64']).columns

    # Work on a copy so the caller's DataFrame is never modified as a side effect.
    normalized = df.copy()

    # Nothing to scale: return early instead of handing the scaler an empty selection.
    if columns_to_normalize.empty:
        return normalized, {}, {}

    # Fit the scaler on the selected columns and write the transformed values back
    scaler = StandardScaler()
    normalized[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

    # scaler.mean_ / scaler.scale_ are ordered like columns_to_normalize
    mean_dict = dict(zip(columns_to_normalize, scaler.mean_))
    std_dict = dict(zip(columns_to_normalize, scaler.scale_))

    return normalized, mean_dict, std_dict

In [None]:
# Standardize the int64/float64 columns of `data`; `mean` and `std` are the
# per-column dictionaries returned alongside the standardized DataFrame.
data, mean, std = normalize_data_zscore(data)

data