In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Performing Transformations to Prepare the Data for ML Models

In [8]:
# Read the raw student dataset; the file is parsed as semicolon-separated.
student_data = pd.read_csv(
    'data/student-mat.csv',
    sep=';',
)

In [9]:
# Preview the first five rows of the loaded frame as a quick sanity check.
student_data.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [10]:
# Feature Engineering Examples
#
# Adds five derived columns to `student_data`. Each one is computed from
# pre-existing columns only (never from another derived column), so the
# assignments are independent of each other and re-running the cell simply
# overwrites the same five columns with the same values.

# 1. Total Alcohol Consumption: element-wise sum of 'Dalc' and 'Walc'.
student_data['TotalAlc'] = student_data['Dalc'] + student_data['Walc']

# 2. Bin 'age' into age groups.
# pd.cut intervals are right-closed by default: (0, 15], (15, 17], (17, inf).
# The last edge is open-ended so that the '18+' label really covers every age
# above 17; with a finite upper edge of 20, any age above 20 would fall
# outside all bins and silently become NaN.
student_data['AgeGroup'] = pd.cut(
    student_data['age'],
    bins=[0, 15, 17, float('inf')],
    labels=['14-15', '16-17', '18+'],
)

# 3. Transform 'absences' into categorical bins.
# Intervals: (-1, 5], (5, 10], (10, 30], (30, inf). The lower edge is -1
# rather than 0 because the left side of each interval is exclusive, and a
# count of 0 absences must still land in 'Low'.
student_data['AbsenceCategory'] = pd.cut(
    student_data['absences'],
    bins=[-1, 5, 10, 30, float('inf')],
    labels=['Low', 'Medium', 'High', 'Very High'],
)

# 4. Interaction Term: Study Time and Failures (element-wise product).
student_data['StudytimeXFailures'] = student_data['studytime'] * student_data['failures']

# 5. Logarithmic transformation of a skewed variable
# (using 'absences' as an example). log1p computes log(1 + x), so rows with
# 0 absences map to 0.0 instead of -inf.
student_data['LogAbsences'] = np.log1p(student_data['absences'])

# Displaying the first few rows of the newly added columns
student_data[['TotalAlc', 'AgeGroup', 'AbsenceCategory', 'StudytimeXFailures', 'LogAbsences']].head()


Unnamed: 0,TotalAlc,AgeGroup,AbsenceCategory,StudytimeXFailures,LogAbsences
0,2,18+,Medium,0,1.94591
1,2,16-17,Low,0,1.609438
2,5,14-15,Medium,6,2.397895
3,2,14-15,Low,0,1.098612
4,3,16-17,Low,0,1.609438


In [12]:
# Persist the feature-engineered frame for later modelling steps.
# index=False omits the row index entirely. The previous index_label=False
# only dropped the index column's *header name* while still writing the index
# values, producing a header row with one fewer field than every data row.
# NOTE(review): despite the "_encoded" file name, no categorical encoding is
# applied in the cells visible here — confirm the name is still accurate.
student_data.to_csv('data/student_data_encoded.csv', index=False)