# Pipeline

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, balanced_accuracy_score
import warnings

In [2]:
df = pd.read_csv(r'C:\Users\ASUS\Downloads\archive\validation_ordinal_enco.csv')
df.head()

Unnamed: 0,Age,Grade,Gender,Race,SES_Quartile,ParentalEducation,SchoolType,Locale,TestScore_Math,TestScore_Reading,...,GPA,AttendanceRate,StudyHours,InternetAccess,Extracurricular,PartTimeJob,ParentSupport,Romantic,FreeTime,GoOut
0,14,9,Female,Hispanic,3,SomeCollege,Public,City,90.186244,88.390291,...,3.509322,0.964022,1.4097,1,1,0,0,0,1,4
1,17,12,Female,White,4,SomeCollege,Public,Rural,73.356297,73.471428,...,3.118458,0.882988,1.112296,1,0,0,1,0,3,3
2,17,12,Female,Other,2,HS,Public,Suburban,73.970459,76.085029,...,3.640436,0.864401,1.214927,1,1,1,1,1,1,1
3,15,10,Male,White,4,SomeCollege,Public,Suburban,64.50029,63.858629,...,2.990611,0.887791,0.155258,1,0,0,1,0,5,4
4,18,12,Male,Black,3,HS,Public,City,84.546267,88.502675,...,3.546262,0.933201,0.908776,1,1,0,0,1,3,3


In [3]:
ndf = df.iloc[:,:7]

In [4]:
ndf.head()

Unnamed: 0,Age,Grade,Gender,Race,SES_Quartile,ParentalEducation,SchoolType
0,14,9,Female,Hispanic,3,SomeCollege,Public
1,17,12,Female,White,4,SomeCollege,Public
2,17,12,Female,Other,2,HS,Public
3,15,10,Male,White,4,SomeCollege,Public
4,18,12,Male,Black,3,HS,Public


In [5]:
ndf.shape

(999229, 7)

In [6]:
ndf.isnull().sum()

Age                  0
Grade                0
Gender               0
Race                 0
SES_Quartile         0
ParentalEducation    0
SchoolType           0
dtype: int64

In [7]:
ndf.head()

Unnamed: 0,Age,Grade,Gender,Race,SES_Quartile,ParentalEducation,SchoolType
0,14,9,Female,Hispanic,3,SomeCollege,Public
1,17,12,Female,White,4,SomeCollege,Public
2,17,12,Female,Other,2,HS,Public
3,15,10,Male,White,4,SomeCollege,Public
4,18,12,Male,Black,3,HS,Public


In [11]:
ordinal_cols = ['Grade', 'SES_Quartile', 'ParentalEducation']
ordinal_categories = [
    [9, 10, 11, 12],  # Integers
    [1, 2, 3, 4],     # Integers
    ['<HS','HS','SomeCollege','Bachelors+']  # Strings as is
]

nominal_cols = ['Gender', 'Race']
numeric_cols = ['Age']


In [13]:
ndf['Grade'] = ndf['Grade'].astype(int)
ndf['SES_Quartile'] = ndf['SES_Quartile'].astype(int)

In [14]:
ordinal_pipe = Pipeline([
    ('encoder', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])

nominal_pipe = Pipeline([
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

numeric_pipe = Pipeline([
    ('scaler', StandardScaler())  # Optional, but useful for some models
])

In [15]:
preprocessor = ColumnTransformer([
    ('ord', ordinal_pipe, ordinal_cols),
    ('nom', nominal_pipe, nominal_cols),
    ('num', numeric_pipe, numeric_cols)
])


In [16]:
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

In [19]:
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(ndf.drop(columns=['SchoolType']), ndf['SchoolType'], test_size=0.2, random_state=42)

In [21]:
warnings.filterwarnings("ignore")

In [22]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8463967254786186
              precision    recall  f1-score   support

     Private       0.00      0.00      0.00     30697
      Public       0.85      1.00      0.92    169149

    accuracy                           0.85    199846
   macro avg       0.42      0.50      0.46    199846
weighted avg       0.72      0.85      0.78    199846



In [25]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8463967254786186
              precision    recall  f1-score   support

     Private       0.00      0.00      0.00     30697
      Public       0.85      1.00      0.92    169149

    accuracy                           0.85    199846
   macro avg       0.42      0.50      0.46    199846
weighted avg       0.72      0.85      0.78    199846

