In [1]:
# Seed the random number generators this notebook relies on so results are
# reproducible. Seeding NumPy alone is not enough for the Keras model: its
# weight initialisation draws from TensorFlow's own generator, which has to be
# seeded separately.
import tensorflow as tf
from numpy.random import seed

SEED = 1
seed(SEED)
tf.random.set_seed(SEED)

In [2]:
# Dependencies
import numpy as np
import pandas as pd

In [3]:
import tensorflow
# Display the version string of the Keras API bundled with the installed TensorFlow.
tensorflow.keras.__version__

'2.2.4-tf'

In [4]:
# Load the modelling dataset from the working directory and preview the
# first ten rows.
df = pd.read_csv(filepath_or_buffer="main_data_for_ML.csv")
df.head(n=10)

Unnamed: 0,id,EOG_quintile,percent_EDS,child_abuse_rate,children_conc_pov,parent,juv_delinquent,No_HSdegree,parent_unemployed,county_poc,county_tier,poverty_county,median_inc_county
0,1,1,4.0,1.8,3.1,1.2,6.7,5.9,4.0,39.1,3,9.7,80169
1,2,1,4.0,1.8,3.1,1.2,6.7,5.9,4.0,39.1,3,9.7,80169
2,3,1,4.0,5.8,7.1,0.8,24.8,8.8,6.0,23.3,3,13.0,60044
3,4,1,4.0,2.2,12.9,0.8,17.5,8.6,4.9,52.7,3,16.4,64509
4,5,1,66.2,0.5,26.6,2.4,31.8,17.9,14.7,42.4,1,29.1,38514
5,6,1,62.0,9.9,39.3,1.4,19.1,19.8,19.3,74.2,1,34.7,35407
6,7,1,61.4,3.0,8.8,1.3,20.7,6.6,10.5,56.0,1,22.2,46779
7,8,1,53.6,2.0,20.5,0.9,9.9,11.5,8.6,56.6,3,18.9,59329
8,9,1,53.1,5.0,0.0,1.8,29.2,11.8,12.0,33.8,1,26.2,46411
9,10,1,51.0,3.0,8.8,1.3,20.7,6.6,10.5,56.0,1,22.2,46779


## Data Pre-Processing

In [5]:
# One-hot encode the county tier into `county_tier_<value>` indicator columns.
tier_column = "county_tier"
dummy = pd.get_dummies(df[tier_column], prefix=tier_column)
dummy.head()

Unnamed: 0,county_tier_1,county_tier_2,county_tier_3
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,1,0,0


In [43]:
# Append the one-hot county-tier columns and drop the original `county_tier`
# column they replace. Built as a single expression (no `inplace=True`) so the
# cell yields the same frame every time it is re-run.
df2 = pd.concat([df, dummy], axis=1).drop(columns=["county_tier"])

# Summary statistics for every column of the combined frame. Only the last
# expression of a cell is displayed, so this is the cell's single output.
df2.describe()

Unnamed: 0,id,EOG_quintile,percent_EDS,child_abuse_rate,children_conc_pov,parent,juv_delinquent,No_HSdegree,parent_unemployed,county_poc,poverty_county,median_inc_county,county_tier_1,county_tier_2,county_tier_3
count,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0,2492.0
mean,1246.5,2.606742,50.088202,3.530778,11.475201,1.302929,16.822191,11.319141,8.494141,35.621509,20.174358,54295.719502,0.275281,0.340289,0.38443
std,719.522759,0.872573,20.083281,2.798452,10.689638,0.570944,7.229138,4.133649,3.827049,16.090638,6.953433,12150.898796,0.446745,0.473901,0.486558
min,1.0,1.0,4.0,0.0,0.0,0.0,4.8,5.9,2.8,5.1,8.3,35407.0,0.0,0.0,0.0
25%,623.75,2.0,37.1,1.8,3.1,1.0,11.5,8.2,5.6,21.8,16.4,46342.0,0.0,0.0,0.0
50%,1246.5,3.0,51.9,2.7,8.8,1.2,17.3,10.5,8.0,36.3,20.0,51491.0,0.0,0.0,0.0
75%,1869.25,3.0,64.7,4.8,18.6,1.5,21.2,14.0,10.5,49.3,23.2,60296.0,1.0,1.0,1.0
max,2492.0,5.0,96.0,13.8,51.2,5.9,57.6,23.4,21.7,74.2,41.8,80428.0,1.0,1.0,1.0


In [7]:
# Target series and the full frame it is split alongside. `X` still contains
# `id` and `EOG_quintile` at this point; both are dropped after the split.
y = df2.loc[:, "EOG_quintile"].copy()
X = df2.copy()
print(X.shape)

(2492, 15)


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [33]:
# Stratified split on the target with 70% of the rows used for training.
# The "*_with_id" frames keep `id` and `EOG_quintile` so predictions can be
# matched back to records later; the model inputs drop both columns.
split_frames = train_test_split(
    X, y, train_size=0.70, stratify=y, random_state=1)
X_train_with_id, X_test_with_id, y_train, y_test = split_frames

non_feature_columns = ["id", "EOG_quintile"]
X_train = X_train_with_id.drop(columns=non_feature_columns)
X_test = X_test_with_id.drop(columns=non_feature_columns)
X_train.shape

(1246, 13)

In [34]:
# Fit the min-max scaler on the training features only, then apply that same
# fitted scaling to both the training and the test features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [35]:
# Step 1: Label-encode the target.
# The encoder is fitted on the training labels and then applied to both
# splits, so both use the same label -> integer mapping.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

In [36]:
# Step 2: Convert encoded labels to one-hot-encoding.
# Pass the class count explicitly: without `num_classes`, `to_categorical`
# sizes each matrix from the largest label present in that array, so the train
# and test matrices could end up with different widths if a split happened to
# lack the highest class.
n_label_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_label_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_label_classes)

## Create a Deep Learning Model

In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [38]:
# Create model and add layers: two hidden ReLU layers of 100 units each,
# followed by a softmax output layer with one unit per class.
# The input and output widths are read from the prepared arrays instead of
# being hard-coded, so the model stays consistent with the data if the feature
# set or the number of classes changes.
n_features = X_train_scaled.shape[1]
n_classes = y_train_categorical.shape[1]

model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=n_features))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=n_classes, activation='softmax'))

In [39]:
# Compile the model (training itself happens in a later cell): Adam optimizer,
# categorical cross-entropy loss for the one-hot targets, accuracy as metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [40]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 100)               1400      
_________________________________________________________________
dense_10 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_11 (Dense)             (None, 5)                 505       
Total params: 12,005
Trainable params: 12,005
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Train on the scaled training features and one-hot targets for 60 epochs,
# shuffling the training data each epoch; verbose=2 logs one line per epoch.
# The returned History object is kept so the per-epoch loss/accuracy remain
# available afterwards via `history.history`, instead of being discarded and
# leaving only the object's repr as the cell output.
history = model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2,
)

Train on 1246 samples
Epoch 1/60
1246/1246 - 0s - loss: 1.3794 - accuracy: 0.4053
Epoch 2/60
1246/1246 - 0s - loss: 1.1956 - accuracy: 0.4912
Epoch 3/60
1246/1246 - 0s - loss: 1.1181 - accuracy: 0.5313
Epoch 4/60
1246/1246 - 0s - loss: 1.0637 - accuracy: 0.5337
Epoch 5/60
1246/1246 - 0s - loss: 1.0315 - accuracy: 0.5433
Epoch 6/60
1246/1246 - 0s - loss: 1.0031 - accuracy: 0.5626
Epoch 7/60
1246/1246 - 0s - loss: 0.9919 - accuracy: 0.5714
Epoch 8/60
1246/1246 - 0s - loss: 0.9887 - accuracy: 0.5730
Epoch 9/60
1246/1246 - 0s - loss: 0.9757 - accuracy: 0.5859
Epoch 10/60
1246/1246 - 0s - loss: 0.9755 - accuracy: 0.5803
Epoch 11/60
1246/1246 - 0s - loss: 0.9757 - accuracy: 0.5746
Epoch 12/60
1246/1246 - 0s - loss: 0.9734 - accuracy: 0.5867
Epoch 13/60
1246/1246 - 0s - loss: 0.9709 - accuracy: 0.5851
Epoch 14/60
1246/1246 - 0s - loss: 0.9631 - accuracy: 0.5827
Epoch 15/60
1246/1246 - 0s - loss: 0.9623 - accuracy: 0.5923
Epoch 16/60
1246/1246 - 0s - loss: 0.9628 - accuracy: 0.5867
Epoch 17/60

<tensorflow.python.keras.callbacks.History at 0x1f75b5e6320>

## Quantify our Trained Model

In [42]:
# Score the trained model on the held-out test split. `evaluate` returns the
# loss followed by the compiled metric (accuracy).
test_scores = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_scores
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

1246/1246 - 0s - loss: 1.0088 - accuracy: 0.5915
Normal Neural Network - Loss: 1.00878034787997, Accuracy: 0.591492772102356


## Make Predictions

In [None]:
# Predict a class for every test row. `Sequential.predict_classes` is
# deprecated and has been removed from newer TensorFlow releases; for a
# softmax output the equivalent is the argmax over the predicted class
# probabilities.
class_probabilities = model.predict(X_test_scaled)
encoded_predictions = np.argmax(class_probabilities, axis=-1)

# Map the encoded class indices back to the original EOG quintile labels.
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
prediction_labels.size

In [None]:
# Disabled spot check: would print the predicted labels and the first 20
# actual test labels.
#print(f"Predicted classes: {prediction_labels}")
#print(f"Actual Labels: {list(y_test[:20])}")

In [None]:
# Attach the predicted quintile to the test rows, which still carry `id` and
# the true `EOG_quintile`, so each prediction sits next to its record.
# `assign` builds a new frame instead of writing a column into the frame
# returned by `train_test_split`, which can raise pandas'
# SettingWithCopyWarning. Values are matched to rows by position.
X_test_with_id = X_test_with_id.assign(predictedEOG=prediction_labels)
X_test_with_id.head()

In [None]:
# Export the test rows (including the `predictedEOG` column added above) to a
# CSV file in the working directory. With pandas' default `index=True`, the
# row index is written as an additional leading column.
X_test_with_id.to_csv('main_data_test_result.csv')