In [35]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [36]:
# Dependencies
import numpy as np
import pandas as pd

In [37]:
import tensorflow
tensorflow.keras.__version__

'2.2.4-tf'

In [38]:
df = pd.read_csv("main_data_for_ML.csv")
df.head(10)

Unnamed: 0,id,EOG_quintile,percent_EDS,child_abuse_rate,children_conc_pov,parent,juv_delinquent,No_HSdegree,parent_unemployed,county_poc,county_tier,poverty_county,median_inc_county
0,1,1,4.0,1.8,3.1,1.2,6.7,5.9,4.0,39.1,3,9.7,80169
1,2,1,4.0,1.8,3.1,1.2,6.7,5.9,4.0,39.1,3,9.7,80169
2,3,1,4.0,5.8,7.1,0.8,24.8,8.8,6.0,23.3,3,13.0,60044
3,4,1,4.0,2.2,12.9,0.8,17.5,8.6,4.9,52.7,3,16.4,64509
4,5,1,66.2,0.5,26.6,2.4,31.8,17.9,14.7,42.4,1,29.1,38514
5,6,1,62.0,9.9,39.3,1.4,19.1,19.8,19.3,74.2,1,34.7,35407
6,7,1,61.4,3.0,8.8,1.3,20.7,6.6,10.5,56.0,1,22.2,46779
7,8,1,53.6,2.0,20.5,0.9,9.9,11.5,8.6,56.6,3,18.9,59329
8,9,1,53.1,5.0,0.0,1.8,29.2,11.8,12.0,33.8,1,26.2,46411
9,10,1,51.0,3.0,8.8,1.3,20.7,6.6,10.5,56.0,1,22.2,46779


## Data Pre-Processing

In [39]:
X = df
y = df["EOG_quintile"]
print(X.shape)

(2492, 13)


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [41]:
X_train_with_id, X_test_with_id, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y,train_size=0.5)
X_train = X_train_with_id.drop(["id","EOG_quintile"],axis=1)
X_test = X_test_with_id.drop(["id","EOG_quintile"],axis=1)
X_train.shape

(1246, 11)

In [42]:
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [43]:
# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [44]:
# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

# Create a Deep Learning Model

In [45]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [46]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=11))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=5, activation='softmax'))

In [47]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [48]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 100)               1200      
_________________________________________________________________
dense_4 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 505       
Total params: 11,805
Trainable params: 11,805
Non-trainable params: 0
_________________________________________________________________


In [49]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=60,
    shuffle=True,
    verbose=2
)

Epoch 1/60
1246/1246 - 0s - loss: 1.4195 - acc: 0.3820
Epoch 2/60
1246/1246 - 0s - loss: 1.2272 - acc: 0.4864
Epoch 3/60
1246/1246 - 0s - loss: 1.1453 - acc: 0.5249
Epoch 4/60
1246/1246 - 0s - loss: 1.0854 - acc: 0.5425
Epoch 5/60
1246/1246 - 0s - loss: 1.0524 - acc: 0.5433
Epoch 6/60
1246/1246 - 0s - loss: 1.0292 - acc: 0.5514
Epoch 7/60
1246/1246 - 0s - loss: 1.0145 - acc: 0.5642
Epoch 8/60
1246/1246 - 0s - loss: 1.0065 - acc: 0.5666
Epoch 9/60
1246/1246 - 0s - loss: 1.0003 - acc: 0.5658
Epoch 10/60
1246/1246 - 0s - loss: 0.9892 - acc: 0.5706
Epoch 11/60
1246/1246 - 0s - loss: 0.9886 - acc: 0.5730
Epoch 12/60
1246/1246 - 0s - loss: 0.9845 - acc: 0.5787
Epoch 13/60
1246/1246 - 0s - loss: 0.9755 - acc: 0.5835
Epoch 14/60
1246/1246 - 0s - loss: 0.9810 - acc: 0.5706
Epoch 15/60
1246/1246 - 0s - loss: 0.9792 - acc: 0.5819
Epoch 16/60
1246/1246 - 0s - loss: 0.9707 - acc: 0.5859
Epoch 17/60
1246/1246 - 0s - loss: 0.9631 - acc: 0.5907
Epoch 18/60
1246/1246 - 0s - loss: 0.9611 - acc: 0.5883
E

<tensorflow.python.keras.callbacks.History at 0x17b14dbcc88>

## Quantify our Trained Model

In [50]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

1246/1246 - 0s - loss: 0.9880 - acc: 0.5947
Normal Neural Network - Loss: 0.9880268306831679, Accuracy: 0.5947030782699585


## Make Predictions

In [51]:
encoded_predictions = model.predict_classes(X_test_scaled)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
prediction_labels.size

1246

In [52]:
#print(f"Predicted classes: {prediction_labels}")
#print(f"Actual Labels: {list(y_test[:20])}")

In [53]:
#zip id back into the array and export

#X_test_withID.addColumn('pred', prediction_labels)
X_test_with_id['predictedEOG'] = prediction_labels.tolist()
X_test_with_id.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,id,EOG_quintile,percent_EDS,child_abuse_rate,children_conc_pov,parent,juv_delinquent,No_HSdegree,parent_unemployed,county_poc,county_tier,poverty_county,median_inc_county,predictedEOG
1820,1821,3,65.9,0.7,5.3,1.3,7.8,5.9,3.3,29.4,3,9.2,71920,3
1409,1410,3,41.5,0.7,5.3,1.3,7.8,5.9,3.3,29.4,3,9.2,71920,3
436,437,2,32.3,3.3,12.3,1.0,17.6,15.1,8.7,19.3,2,20.0,47758,2
1875,1876,3,80.3,0.2,18.6,1.1,22.1,9.1,8.0,49.3,2,22.1,52166,4
1863,1864,3,52.0,4.4,16.0,1.6,21.2,12.6,8.6,35.8,2,18.3,50480,3


In [55]:
X_test_with_id.to_csv('main_data_test_result.csv')