In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('school_attend.csv')

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
df

Unnamed: 0,Sleeping_place_fixed_or_not,Division,Work_hour,Age,Eating_location,Aware_of_organizations,school_attend
0,Yes,Dhaka,upto 8hr,from 11 to 14,Street Shop,Yes,No
1,Yes,Dhaka,more than 12 hr,more than 14,Street Shop,No,No
2,Yes,Dhaka,upto 8hr,from 11 to 14,Begging or collecting from dustbin,Yes,Yes
3,Yes,Dhaka,from 8 to 12 hr,less than 11,Begging or collecting from dustbin,Yes,Yes
4,Yes,Dhaka,upto 8hr,from 11 to 14,Street Shop,Yes,Yes
5,Yes,Dhaka,upto 8hr,from 11 to 14,Street Shop,Yes,Yes
6,Yes,Dhaka,upto 8hr,from 11 to 14,Begging or collecting from dustbin,Yes,No
7,Yes,Dhaka,more than 12 hr,more than 14,Street Shop,No,No
8,No,Dhaka,upto 8hr,less than 11,Street Shop,Yes,Yes
9,Yes,Dhaka,more than 12 hr,more than 14,Street Shop,No,No


# Creating & Manageing Dummy Variables

In [4]:
df.columns.values

array(['Sleeping_place_fixed_or_not', 'Division', 'Work_hour', 'Age',
       'Eating_location', 'Aware_of_organizations', 'school_attend'],
      dtype=object)

In [5]:
dummy_df_Age = pd.get_dummies(df['Age'], prefix = 'Age')
dummy_df_Age = dummy_df_Age.astype(int)

In [6]:
dummy_df_Eat_loc = pd.get_dummies(df['Eating_location'], prefix = 'Eating_location')
dummy_df_Eat_loc = dummy_df_Eat_loc.astype(int)

In [7]:
dummy_df_Sleeping_place_fixed = pd.get_dummies(df['Sleeping_place_fixed_or_not'], prefix = 'Sleeping_place_fixed')
dummy_df_Sleeping_place_fixed = dummy_df_Sleeping_place_fixed.astype(int)

In [8]:
dummy_df_Aware_of_organizations = pd.get_dummies(df['Aware_of_organizations'], prefix = 'Aware_of_organizations')
dummy_df_Aware_of_organizations = dummy_df_Aware_of_organizations.astype(int)

In [9]:
dummy_df_Work_hour = pd.get_dummies(df['Work_hour'], prefix = 'Work_hour')
dummy_df_Work_hour = dummy_df_Work_hour.astype(int)

In [10]:
dummy_df_Division = pd.get_dummies(df['Division'], prefix = 'Division')
dummy_df_Division = dummy_df_Division.astype(int)

In [11]:
inputs= pd.concat([dummy_df_Age,dummy_df_Eat_loc,dummy_df_Sleeping_place_fixed,dummy_df_Aware_of_organizations,dummy_df_Work_hour, dummy_df_Division],axis =1)

In [12]:
inputs.columns.values

array(['Age_from 11  to 14', 'Age_less than 11', 'Age_more than 14',
       'Eating_location_Begging or collecting from dustbin',
       'Eating_location_Street Shop', 'Sleeping_place_fixed_No',
       'Sleeping_place_fixed_Yes', 'Aware_of_organizations_No',
       'Aware_of_organizations_Yes', 'Work_hour_from 8 to 12 hr',
       'Work_hour_more than 12 hr', 'Work_hour_upto 8hr',
       'Division_Barisal', 'Division_Chittagong', 'Division_Dhaka',
       'Division_Khulna', 'Division_Rajshahi', 'Division_Sylhet'],
      dtype=object)

In [13]:
dummies_to_drop= ['Age_from 11  to 14','Eating_location_Street Shop','Sleeping_place_fixed_Yes','Aware_of_organizations_Yes','Work_hour_upto 8hr','Division_Dhaka']

# Defining Input Dataset

In [14]:
inputs= inputs.drop(dummies_to_drop,axis=1)

In [15]:
inputs.shape

(450, 12)

# Defining Target Dataset

In [16]:
targets  = df.filter(['school_attend'])

In [17]:
targets = targets.replace({'Yes' : 1, 'No' : 0})
targets = targets.astype(int)

In [18]:
targets.value_counts()

school_attend
0                273
1                177
Name: count, dtype: int64

In [19]:
targets

Unnamed: 0,school_attend
0,0
1,0
2,1
3,1
4,1
5,1
6,0
7,0
8,1
9,0


# Split into Train and Test

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(inputs,targets,train_size = .8,shuffle = True, random_state = 20)

# Fitting Logistic Regression Model

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
reg = LogisticRegression()

In [23]:
reg.fit(x_train,y_train)

In [24]:
score_train= reg.score(x_train,y_train)
score_test = reg.score(x_test,y_test)
print('\n Train score :', score_train *100 ,'%')
print('\n Test score :', score_test *100 ,'%')


 Train score : 66.11111111111111 %

 Test score : 66.66666666666666 %


In [25]:
# Predictions
logistic_prediction = reg.predict(x_test)

# Calculate accuracy
logistic_accuracy = accuracy_score(y_test, logistic_prediction)

# Print classification report
logistic_classification_report = classification_report(y_test, logistic_prediction)
print(f"logistic Accuracy: {logistic_accuracy}")
print(f"logistic Classification Report:\n{logistic_classification_report}")

logistic Accuracy: 0.6666666666666666
logistic Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.81      0.75        57
           1       0.56      0.42      0.48        33

    accuracy                           0.67        90
   macro avg       0.63      0.62      0.62        90
weighted avg       0.65      0.67      0.65        90



In [26]:
type(logistic_classification_report)

str

# Fitting SVM

In [27]:
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)  # You can choose different kernels and hyperparameters
svm_classifier.fit(x_train, y_train)

In [28]:
# Predictions
svm_predictions = svm_classifier.predict(x_test)
# Calculate accuracy
svm_accuracy = accuracy_score(y_test, svm_predictions)
# Print classification report
svm_classification_report = classification_report(y_test, svm_predictions)

In [29]:
print(f"SVM Accuracy: {svm_accuracy}")
print(f"SVM Classification Report:\n{svm_classification_report}")

SVM Accuracy: 0.6888888888888889
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.91      0.79        57
           1       0.67      0.30      0.42        33

    accuracy                           0.69        90
   macro avg       0.68      0.61      0.60        90
weighted avg       0.68      0.69      0.65        90



# Fitting Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of trees and other hyperparameters
rf_classifier.fit(x_train, y_train)

In [31]:
#Predictions
rf_predictions = rf_classifier.predict(x_test)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Print classification report
rf_classification_report = classification_report(y_test, rf_predictions)

In [32]:
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Random Forest Classification Report:\n{rf_classification_report}")

Random Forest Accuracy: 0.5888888888888889
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.70      0.68        57
           1       0.43      0.39      0.41        33

    accuracy                           0.59        90
   macro avg       0.55      0.55      0.55        90
weighted avg       0.58      0.59      0.58        90



# Fitting Binary Tree

In [33]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_classifier = DecisionTreeClassifier(random_state=42, max_depth=None)  # You can adjust the max_depth and other hyperparameters
decision_tree_classifier.fit(x_train, y_train)

In [34]:
# Predictions
tree_predictions = decision_tree_classifier.predict(x_test)

# Calculate accuracy
tree_accuracy = accuracy_score(y_test, tree_predictions)

# Print classification report
tree_classification_report = classification_report(y_test, tree_predictions)

In [35]:
print(f"Decision Tree Accuracy: {tree_accuracy}")
print(f"Decision Tree Classification Report:\n{tree_classification_report}")

Decision Tree Accuracy: 0.6111111111111112
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.75      0.71        57
           1       0.46      0.36      0.41        33

    accuracy                           0.61        90
   macro avg       0.57      0.56      0.56        90
weighted avg       0.59      0.61      0.60        90



# Fitting Gradient Boost

In [36]:
from sklearn.ensemble import GradientBoostingClassifier
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators and other hyperparameters
gradient_boosting_classifier.fit(x_train, y_train)

In [37]:
# Predictions
gb_predictions = gradient_boosting_classifier.predict(x_test)

# Calculate accuracy
gb_accuracy = accuracy_score(y_test, gb_predictions)

# Print classification report
gb_classification_report = classification_report(y_test, gb_predictions)

print(f"Gradient Boosting Accuracy: {gb_accuracy}")
print(f"Gradient Boosting Classification Report:\n{gb_classification_report}")

Gradient Boosting Accuracy: 0.6111111111111112
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.74      0.71        57
           1       0.46      0.39      0.43        33

    accuracy                           0.61        90
   macro avg       0.57      0.57      0.57        90
weighted avg       0.60      0.61      0.60        90



# LogisticRegression Model Modification & Interpretation

In [38]:
inputs.columns.values

array(['Age_less than 11', 'Age_more than 14',
       'Eating_location_Begging or collecting from dustbin',
       'Sleeping_place_fixed_No', 'Aware_of_organizations_No',
       'Work_hour_from 8 to 12 hr', 'Work_hour_more than 12 hr',
       'Division_Barisal', 'Division_Chittagong', 'Division_Khulna',
       'Division_Rajshahi', 'Division_Sylhet'], dtype=object)

In [39]:
feature_names = ['Age_less than 11', 'Age_more than 14',
       'Eating_location_Begging or collecting from dustbin',
       'Sleeping_place_fixed_No', 'Aware_of_organizations_No',
       'Work_hour_from 8 to 12 hr', 'Work_hour_more than 12 hr',
       'Division_Barisal', 'Division_Chittagong', 'Division_Khulna',
       'Division_Rajshahi', 'Division_Sylhet']

In [40]:
table = pd.DataFrame(columns =['Feature_names'],data = feature_names)

In [41]:
table['Coefficients'] = np.transpose(reg.coef_)

In [42]:
new_record = {'Feature_names': 'Intercept', 'Coefficients': -1.02698795 }
table.loc[-1] = new_record
table.index = table.index + 1
table = table.sort_index()

In [43]:
table['Odds Ratio'] = np.exp(table.Coefficients)
table 

Unnamed: 0,Feature_names,Coefficients,Odds Ratio
0,Intercept,-1.026988,0.358084
1,Age_less than 11,-0.125271,0.882258
2,Age_more than 14,-0.579767,0.560029
3,Eating_location_Begging or collecting from dus...,0.322248,1.380228
4,Sleeping_place_fixed_No,-0.387409,0.678813
5,Aware_of_organizations_No,-1.01101,0.363851
6,Work_hour_from 8 to 12 hr,0.774729,2.170004
7,Work_hour_more than 12 hr,-0.027089,0.973274
8,Division_Barisal,-0.221527,0.801294
9,Division_Chittagong,-0.204099,0.815381


In [44]:
table.sort_values('Odds Ratio', ascending = False)

Unnamed: 0,Feature_names,Coefficients,Odds Ratio
11,Division_Rajshahi,0.941861,2.56475
6,Work_hour_from 8 to 12 hr,0.774729,2.170004
3,Eating_location_Begging or collecting from dus...,0.322248,1.380228
7,Work_hour_more than 12 hr,-0.027089,0.973274
10,Division_Khulna,-0.089406,0.914474
1,Age_less than 11,-0.125271,0.882258
9,Division_Chittagong,-0.204099,0.815381
8,Division_Barisal,-0.221527,0.801294
4,Sleeping_place_fixed_No,-0.387409,0.678813
2,Age_more than 14,-0.579767,0.560029


In [45]:
dummies_to_drop_nn = ['Work_hour_more than 12 hr','Division_Khulna','Age_less than 11']

In [46]:
inputs_nn = inputs.drop(dummies_to_drop_nn, axis =1)

In [47]:
#inputs_nn.to_csv('inputs_schhol.csv',index = False)
#targets.to_csv('targets_school.csv',index = False)

In [56]:
table = table.sort_values('Odds Ratio', ascending = False)

In [49]:
#table.to_csv('table.csv', index = False)

In [57]:
table

Unnamed: 0,Feature_names,Coefficients,Odds Ratio
11,Division_Rajshahi,0.941861,2.56475
6,Work_hour_from 8 to 12 hr,0.774729,2.170004
3,Eating_location_Begging or collecting from dus...,0.322248,1.380228
7,Work_hour_more than 12 hr,-0.027089,0.973274
10,Division_Khulna,-0.089406,0.914474
1,Age_less than 11,-0.125271,0.882258
9,Division_Chittagong,-0.204099,0.815381
8,Division_Barisal,-0.221527,0.801294
4,Sleeping_place_fixed_No,-0.387409,0.678813
2,Age_more than 14,-0.579767,0.560029


In [61]:
# Calculate Wald confidence intervals
import scipy.stats as stats
coefficients = reg.coef_.flatten()
std_errors = np.sqrt(np.diag(np.linalg.inv(np.dot(x_train.T, x_train))))

alpha = 0.05
z_critical = stats.norm.ppf(1 - alpha / 2)

lower_bound = coefficients - z_critical * std_errors
upper_bound = coefficients + z_critical * std_errors

# Display or store results
results = pd.DataFrame({'Coefficient': np.exp(coefficients), 'Lower Bound': np.exp(lower_bound), 'Upper Bound': np.exp(upper_bound)})
print(results)

    Coefficient  Lower Bound  Upper Bound
0      0.882258     0.683894     1.138158
1      0.560029     0.422972     0.741496
2      1.380228     1.002229     1.900792
3      0.678813     0.510062     0.903395
4      0.363851     0.294318     0.449812
5      2.170004     1.737208     2.710623
6      0.973274     0.664571     1.425375
7      0.801294     0.499517     1.285388
8      0.815381     0.592562     1.121986
9      0.914474     0.577268     1.448657
10     2.564750     1.821763     3.610757
11     0.307479     0.216279     0.437135


In [62]:
results = results.sort_values('Coefficient', ascending = False)

In [64]:
#results.to_csv('CI.csv', index = False)