In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 

In [72]:
# Load the raw survey export into a DataFrame. low_memory=False asks pandas to
# parse the file in one pass instead of in chunks, which avoids mixed-dtype
# guesses on wide files.
data = pd.read_csv(
    filepath_or_buffer="../raw_data/final_file.csv",
    low_memory=False,
)

In [73]:
# Feature matrix: every column of `data` except the prediction target.
X = data.drop(columns="Age abandoned school")

In [74]:
# Target vector: the age at which the child left school (same row index as `data`).
y = data.loc[:, "Age abandoned school"]

In [75]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0.1,Unnamed: 0,hhid,Gender,Mother still living,Father still living,"Age, in years",Marital Status,darija,class_arabic,french,...,province,schoolunitid,type_unit,groupid,num_menage,done_test,digit_recognition_res,number_recognition_res,subtraction_res,division_res
0,0,A208004,1.0,1.0,1.0,35.0,1.0,1.0,2.0,2.0,...,Essaouira,A208,Secteur Scolaire Centre,2,4,1.0,1.0,2.0,2.0,2.0
1,1,A426002,1.0,1.0,2.0,33.0,1.0,1.0,2.0,2.0,...,Essaouira,A426,Secteur Scolaire Centre,4,2,2.0,,,,
2,2,A268006,1.0,1.0,1.0,31.0,1.0,1.0,2.0,2.0,...,Chichaoua,A268,Satellite,1,6,1.0,1.0,1.0,2.0,2.0
3,3,A419004,1.0,2.0,2.0,62.0,1.0,1.0,2.0,2.0,...,Taroudant,A419,Secteur Scolaire Centre,2,4,2.0,,,,
4,4,A536006,1.0,2.0,1.0,36.0,1.0,1.0,1.0,2.0,...,Essaouira,A536,Satellite,1,6,1.0,1.0,1.0,1.0,2.0


In [76]:
# Map the verbose survey labels to short snake_case column names.
# Labels that are not present in X are silently ignored by rename().
readable_names = {
    'Member knows to read in at least one language': 'read_one_lang',
    'Number of persons living in house': 'num_per_house',
    'Type of housing': 'type_housing',
    'Age, in years': 'parents_age',
    'Mother still living': 'mother_alive',
    'Father still living': 'father_alive',
    'Marital Status': 'marital_status',
}
X = X.rename(readable_names, axis="columns")

In [77]:
# Display the column labels of X to confirm the rename took effect.
X.columns

Index(['Unnamed: 0', 'hhid', 'Gender', 'mother_alive', 'father_alive',
       'parents_age', 'marital_status', 'darija', 'class_arabic', 'french',
       'amazygh', 'read_one_lang', 'write_one_lang', 'no_read_write',
       'parents_level_ed', 'work_activity', 'num_per_house', 'type_housing',
       'automobiles', 'mobile_phones', 'satellite', 'no_water',
       'individual_water_net', 'electrical_net_co', 'school_id',
       'child_enrollment', 'class_when_dropout', 'region', 'province',
       'schoolunitid', 'type_unit', 'groupid', 'num_menage', 'done_test',
       'digit_recognition_res', 'number_recognition_res', 'subtraction_res',
       'division_res'],
      dtype='object')

In [6]:
# Share of missing values per column, showing the ten columns with the most gaps.
(
    X.isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .div(len(X))
)

subtraction_res           0.268190
number_recognition_res    0.264197
digit_recognition_res     0.263310
division_res              0.261757
read_one_lang             0.041925
no_read_write             0.039264
write_one_lang            0.039264
work_activity             0.035714
father_alive              0.030612
marital_status            0.029281
dtype: float64

## Imputing missing values

In [7]:
from sklearn.impute import SimpleImputer

# Columns whose gaps are filled by imputation. Listed once so that the fit and
# the write-back are guaranteed to use the same columns in the same order.
impute_cols = [
    'read_one_lang', 'no_read_write', 'write_one_lang',
    'father_alive', 'mother_alive',
    'darija', 'class_arabic', 'french', 'amazygh',
]

# The previewed rows show these answers stored as category codes (1.0 / 2.0).
# A mean would fill gaps with fractional values that match no valid answer,
# so the most frequent code is used instead.
# NOTE(review): confirm against the survey codebook that all nine columns are
# coded categories.
# NOTE(review): the imputer is fitted on the full table; if a train/test split
# follows, fit on the training rows only to avoid leakage.
imputer = SimpleImputer(strategy="most_frequent")
X[impute_cols] = imputer.fit_transform(X[impute_cols])


In [8]:
# Re-check the share of missing values per column (ten highest) after imputation.
(
    X.isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .div(len(X))
)

subtraction_res           0.268190
number_recognition_res    0.264197
digit_recognition_res     0.263310
division_res              0.261757
work_activity             0.035714
marital_status            0.029281
parents_level_ed          0.026176
parents_age               0.021961
automobiles               0.010648
satellite                 0.007764
dtype: float64

## Dropping missing values (Marital Status)

In [10]:
# Frequency of each marital_status code; dropna=False lists NaN as its own row.
X.loc[:, "marital_status"].value_counts(dropna=False)

1.0    4278
NaN     132
4.0      80
3.0      12
2.0       6
Name: marital_status, dtype: int64

In [48]:
# Turn any literal 'NaN' strings into real missing values. The result is
# assigned back to the column: calling replace(..., inplace=True) on
# X['marital_status'] only modifies a temporary Series and leaves X unchanged.
X['marital_status'] = X['marital_status'].replace('NaN', np.nan)

# The former `X['marital_status'].dropna(inplace=True)` was removed: dropping
# entries from a single column Series cannot remove rows from the DataFrame.
# Removing rows requires a frame-level X.dropna(subset=['marital_status']).

In [49]:
# Check marital_status again, still counting NaN as a category.
X.loc[:, "marital_status"].value_counts(dropna=False)

1.0    4278
NaN     132
4.0      80
3.0      12
2.0       6
Name: marital_status, dtype: int64

In [55]:
# Force marital_status to a numeric dtype; values that cannot be parsed become NaN.
X['marital_status'] = X['marital_status'].pipe(pd.to_numeric, errors='coerce')

In [57]:
# Remove every row whose marital_status is missing.
X = X.dropna(subset=['marital_status'])

# Keep the target aligned with the filtered features. y was split off from the
# same table before any rows were dropped, so without this step X and y would
# have different lengths. Selecting by X's index is safe to re-run.
y = y.loc[X.index]

In [58]:
# Confirm that no NaN category remains in marital_status after the row drop.
X.loc[:, "marital_status"].value_counts(dropna=False)

1.0    4278
4.0      80
3.0      12
2.0       6
Name: marital_status, dtype: int64

In [59]:
# Share of missing values per column (ten highest) on the reduced frame.
(
    X.isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .div(len(X))
)

subtraction_res           0.266453
number_recognition_res    0.262340
digit_recognition_res     0.261426
division_res              0.259826
work_activity             0.020338
automobiles               0.009598
parents_level_ed          0.009369
parents_age               0.009141
satellite                 0.007084
mobile_phones             0.006856
dtype: float64

In [64]:
# Distribution of the subtraction test result, with NaN counted as its own row.
X.loc[:, "subtraction_res"].value_counts(dropna=False)

2.0    1788
1.0    1422
NaN    1166
Name: subtraction_res, dtype: int64

In [66]:
# Distribution of housing-type codes, with NaN counted as its own row.
X.loc[:, "type_housing"].value_counts(dropna=False)

3.0    1630
1.0    1438
2.0     853
4.0     414
NaN      25
5.0      16
Name: type_housing, dtype: int64

In [67]:
# Distribution of household sizes, with NaN counted as its own row.
X.loc[:, "num_per_house"].value_counts(dropna=False)

6.0     986
7.0     883
5.0     763
8.0     597
9.0     366
4.0     340
10.0    170
11.0    109
3.0      60
1.0      17
12.0     16
NaN      14
14.0     13
13.0     10
15.0      7
2.0       6
18.0      5
17.0      4
16.0      3
21.0      2
24.0      2
0.0       1
22.0      1
19.0      1
Name: num_per_house, dtype: int64

In [68]:
# Distribution of the automobiles answer codes, with NaN counted as its own row.
X.loc[:, "automobiles"].value_counts(dropna=False)

2.0    4143
1.0     191
NaN      42
Name: automobiles, dtype: int64