In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [9]:
data = pd.read_csv("../raw_data/final_file_with_all_features.csv",low_memory=False)

## Variables 

In [10]:
X = data.drop("Age abandoned school", axis=1)
y = data["Age abandoned school"]

In [48]:
y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Age abandoned school, dtype: float64

In [11]:
X.head()

Unnamed: 0,hhid,Gender,mother_alive,father_alive,parents_age,marital_status,darija,class_arabic,french,amazygh,...,child_enrollment,class_when_dropout,region,province,schoolunitid,type_unit,groupid,num_menage,done_test,work_activity_classified
0,A208004,1.0,1.0,1.0,35.0,1.0,1.0,2.0,2.0,2.0,...,1.0,0.0,Marrakech - Tensift - Al Haouz,Essaouira,A208,Secteur Scolaire Centre,2,4,1.0,1.0
1,A426002,1.0,1.0,2.0,33.0,1.0,1.0,2.0,2.0,1.0,...,1.0,0.0,Marrakech - Tensift - Al Haouz,Essaouira,A426,Secteur Scolaire Centre,4,2,2.0,0.0
2,A268006,1.0,1.0,1.0,31.0,1.0,1.0,2.0,2.0,1.0,...,1.0,0.0,Marrakech - Tensift - Al Haouz,Chichaoua,A268,Satellite,1,6,1.0,0.0
3,A419004,1.0,2.0,2.0,62.0,1.0,1.0,2.0,2.0,1.0,...,1.0,0.0,Sous - Massa - Draa,Taroudant,A419,Secteur Scolaire Centre,2,4,2.0,0.0
4,A536006,1.0,2.0,1.0,36.0,1.0,1.0,1.0,2.0,1.0,...,1.0,0.0,Marrakech - Tensift - Al Haouz,Essaouira,A536,Satellite,1,6,1.0,1.0


In [12]:
X.columns

Index(['hhid', 'Gender', 'mother_alive', 'father_alive', 'parents_age',
       'marital_status', 'darija', 'class_arabic', 'french', 'amazygh',
       'read_one_lang', 'write_one_lang', 'no_read_write', 'parents_level_ed',
       'num_per_house', 'type_housing', 'automobiles', 'mobile_phones',
       'satellite', 'no_water', 'individual_water_net', 'electrical_net_co',
       'school_id', 'child_enrollment', 'class_when_dropout', 'region',
       'province', 'schoolunitid', 'type_unit', 'groupid', 'num_menage',
       'done_test', 'work_activity_classified'],
      dtype='object')

In [15]:
X = X.drop(columns={"type_unit","groupid","num_menage","child_enrollment","class_when_dropout","schoolunitid","school_id","done_test"})

KeyError: "['class_when_dropout', 'child_enrollment', 'type_unit', 'school_id', 'num_menage', 'groupid', 'schoolunitid'] not found in axis"

In [19]:
X.shape

(4052, 26)

In [20]:
X.head()

Unnamed: 0,hhid,Gender,mother_alive,father_alive,parents_age,marital_status,darija,class_arabic,french,amazygh,...,automobiles,mobile_phones,satellite,no_water,individual_water_net,electrical_net_co,region,province,done_test,work_activity_classified
0,A208004,1.0,1.0,1.0,35.0,1.0,1.0,2.0,2.0,2.0,...,2.0,1.0,1.0,2,2,1,Marrakech - Tensift - Al Haouz,Essaouira,1.0,1.0
1,A426002,1.0,1.0,2.0,33.0,1.0,1.0,2.0,2.0,1.0,...,2.0,1.0,2.0,2,2,1,Marrakech - Tensift - Al Haouz,Essaouira,2.0,0.0
2,A268006,1.0,1.0,1.0,31.0,1.0,1.0,2.0,2.0,1.0,...,2.0,1.0,2.0,2,2,2,Marrakech - Tensift - Al Haouz,Chichaoua,1.0,0.0
3,A419004,1.0,2.0,2.0,62.0,1.0,1.0,2.0,2.0,1.0,...,2.0,1.0,1.0,2,2,1,Sous - Massa - Draa,Taroudant,2.0,0.0
4,A536006,1.0,2.0,1.0,36.0,1.0,1.0,1.0,2.0,1.0,...,2.0,1.0,2.0,2,2,2,Marrakech - Tensift - Al Haouz,Essaouira,1.0,1.0


## Pipeline building

In [41]:
# test/train split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

X_train_num = X.select_dtypes(include=["float64","int64"])
X_train_cat = X[["region","province"]]

In [42]:
# Numerical Features

num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())])


In [43]:
# Categorical features 

cat_transformer = Pipeline([
    ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [49]:
# Build our preprocessor

preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, X_train_num.columns),
    ('cat_transformer', cat_transformer, X_train_cat.columns)])


In [50]:
X_train_preprocessed = preprocessor.fit_transform(X_train)

In [51]:
feature_names = preprocessor.named_transformers_['cat_transformer']\
    .named_steps['one_hot_encoder'].get_feature_names_out(X_train_cat.columns)

# Create a new DataFrame with preprocessed features and updated column names
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=list(X_train_num.columns) + list(feature_names))

In [52]:
X_train_preprocessed_df.head()

Unnamed: 0,Gender,mother_alive,father_alive,parents_age,marital_status,darija,class_arabic,french,amazygh,read_one_lang,...,province_Essaouira,province_Ifrane,province_Jerada,province_Khenifra,province_Meknes,province_Nador,province_Ouarzazate,province_Taourirt,province_Taroudant,province_Tiznit
0,-0.122609,1.171246,0.80709,-0.19882,-0.134571,2.235869,0.393341,0.158533,-0.562365,0.599624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.122609,1.171246,0.80709,0.4205,-0.134571,-0.448492,0.393341,0.158533,-0.562365,0.599624,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.122609,1.171246,0.80709,0.62694,-0.134571,-0.448492,0.393341,0.158533,-0.562365,0.599624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.122609,-0.859071,-1.260832,-1.437459,-0.134571,-0.448492,-2.549416,0.158533,-0.562365,-1.702131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.122609,-0.859071,-1.260832,-1.024579,-0.134571,2.235869,0.393341,0.158533,-0.562365,0.599624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
