# **Lab Four: The Multi-Layer Perceptron**


### **Load, Split, and Balance**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop, Adadelta



In [3]:
# Load the dataset
url = "https://www.dropbox.com/s/bf7i7qjftk7cmzq/acs2017_census_tract_data.csv?dl=1"
data = pd.read_csv(url)

# Display basic info
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74001 entries, 0 to 74000
Data columns (total 37 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TractId           74001 non-null  int64  
 1   State             74001 non-null  object 
 2   County            74001 non-null  object 
 3   TotalPop          74001 non-null  int64  
 4   Men               74001 non-null  int64  
 5   Women             74001 non-null  int64  
 6   Hispanic          73305 non-null  float64
 7   White             73305 non-null  float64
 8   Black             73305 non-null  float64
 9   Native            73305 non-null  float64
 10  Asian             73305 non-null  float64
 11  Pacific           73305 non-null  float64
 12  VotingAgeCitizen  74001 non-null  int64  
 13  Income            72885 non-null  float64
 14  IncomeErr         72885 non-null  float64
 15  IncomePerCap      73256 non-null  float64
 16  IncomePerCapErr   73256 non-null  float6

In [4]:
# Remove rows with missing values
data.dropna(inplace=True)

# Encode categorical variables
data['State'] = data['State'].astype('category').cat.codes

# Option to drop 'County'
data.drop(['County'], axis=1, inplace=True)

# Check for null values
print(data.isnull().sum())

TractId             0
State               0
TotalPop            0
Men                 0
Women               0
Hispanic            0
White               0
Black               0
Native              0
Asian               0
Pacific             0
VotingAgeCitizen    0
Income              0
IncomeErr           0
IncomePerCap        0
IncomePerCapErr     0
Poverty             0
ChildPoverty        0
Professional        0
Service             0
Office              0
Construction        0
Production          0
Drive               0
Carpool             0
Transit             0
Walk                0
OtherTransp         0
WorkAtHome          0
MeanCommute         0
Employed            0
PrivateWork         0
PublicWork          0
SelfEmployed        0
FamilyWork          0
Unemployment        0
dtype: int64


Removed county because it might not be a predictive feature and could introduce noise.

In [5]:
# Create quantile-based bins
data['ChildPovertyClass'] = pd.qcut(data['ChildPoverty'], q=4, labels=[0, 1, 2, 3])

# Drop the original continuous target
data.drop(['ChildPoverty'], axis=1, inplace=True)

Quantization: Divide the ChildPoverty variable into four levels.

In [7]:
from imblearn.over_sampling import RandomOverSampler

X = data.drop('ChildPovertyClass', axis=1)
y = data['ChildPovertyClass']

ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)

# Verify balancing
print(y_res.value_counts())


ChildPovertyClass
0    18229
1    18229
2    18229
3    18229
Name: count, dtype: int64




Balance the classes

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

Split into training and testing sets: 80% training, 20% testing.

### **Pre-processing and Initial Modeling**

In [9]:
model = Sequential()
model.add(Dense(128, activation='sigmoid', kernel_initializer='glorot_uniform', input_dim=X_train.shape[1]))
model.add(Dense(64, activation='sigmoid', kernel_initializer='glorot_uniform'))
model.add(Dense(4, activation='softmax'))  # 4 classes

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Plot loss vs epochs
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Invalid dtype: category

In [10]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)


In [11]:
history_norm = model.fit(X_train_norm, y_train, epochs=50, batch_size=32, validation_data=(X_test_norm, y_test))

ValueError: Invalid dtype: category

In [13]:
from tensorflow.keras.utils import to_categorical

y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)

# Train on normalized and one-hot encoded data
history_ohe = model.fit(X_train_norm, y_train_ohe, epochs=50, batch_size=32, validation_data=(X_test_norm, y_test_ohe))


Epoch 1/50


ValueError: Argument `output` must have rank (ndim) `target.ndim - 1`. Received: target.shape=(None, 4), output.shape=(None, 4)

In [14]:
def plot_comparison(histories, labels):
    plt.figure(figsize=(12, 5))
    
    for i, history in enumerate(histories):
        plt.plot(history.history['accuracy'], label=f'Train {labels[i]}')
        plt.plot(history.history['val_accuracy'], label=f'Validation {labels[i]}')
    
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

plot_comparison([history, history_norm, history_ohe], ['No Norm', 'Norm', 'Norm + OHE'])


NameError: name 'history' is not defined

### **Modeling**

In [15]:
model_3 = Sequential()
model_3.add(Dense(128, activation='sigmoid', kernel_initializer='glorot_uniform', input_dim=X_train_norm.shape[1]))
model_3.add(Dense(64, activation='sigmoid'))
model_3.add(Dense(32, activation='sigmoid'))
model_3.add(Dense(4, activation='softmax'))

model_3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history_3 = model_3.fit(X_train_norm, y_train, epochs=50, batch_size=32, validation_data=(X_test_norm, y_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Invalid dtype: category

In [16]:
# 4-layer MLP
model_4 = Sequential()
model_4.add(Dense(256, activation='relu'))
model_4.add(Dense(128, activation='relu'))
model_4.add(Dense(64, activation='relu'))
model_4.add(Dense(32, activation='relu'))
model_4.add(Dense(4, activation='softmax'))

# 5-layer MLP
model_5 = Sequential()
model_5.add(Dense(512, activation='relu'))
model_5.add(Dense(256, activation='relu'))
model_5.add(Dense(128, activation='relu'))
model_5.add(Dense(64, activation='relu'))
model_5.add(Dense(32, activation='relu'))
model_5.add(Dense(4, activation='softmax'))