In [1]:
# for reading data
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# for modeling
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

import plotly as plt



example website
https://medium.com/luca-chuangs-bapm-notes/build-a-neural-network-in-python-multi-class-classification-e940f74bd899

In [2]:
#read in the data
#take input from provisional database (csv)
# Load the csv file from GitHub
url = 'https://raw.githubusercontent.com/mandymccabe/Final_Project/janet_branch/Data/Final_Project_Full.csv'
url2= 'https://raw.githubusercontent.com/mandymccabe/Final_Project/main/Resources/all_responses_coded.csv'
df = pd.read_csv(url, index_col=0)
df2 = pd.read_csv(url2, index_col=0)

In [3]:
clean_df2= df2.drop(['A1','A2','A3','A4','A5','A21','StartDate','EndDate'], axis=1)

In [37]:
PoliticalViews = df.filter(["political_views"], axis=1)
PoliticalViews.head()

Unnamed: 0_level_0,political_views
respondentid,Unnamed: 1_level_1
6176264298,Moderate
6176263960,Moderate
6176258621,Liberal
6176257082,Liberal
6176256111,Liberal


In [36]:
PoliticalViews3 = PoliticalViews.replace(regex={r'Very Conservative': 'Conservative', 'Very Liberal': 'Liberal'})
PoliticalViews3.head()

Unnamed: 0_level_0,political_views
respondentid,Unnamed: 1_level_1
6176264298,Moderate
6176263960,Moderate
6176258621,Liberal
6176257082,Liberal
6176256111,Liberal


In [35]:
merged_dfs = pd.merge(clean_df2, PoliticalViews3, how='outer', left_on=["RespondentID"], right_on=['respondentid'])
merged_dfs.head()

Unnamed: 0,RespondentID,A6,A7,A8,A9,A10,A11,A12,A13,A14,...,A54,A55,A56,A57,A58,A59,A60,A61,A62,political_views
0,6176264298,0,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,Moderate
1,6176263960,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,Moderate
2,6176258621,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,Liberal
3,6176257082,0,1,0,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,Liberal
4,6176256111,0,1,0,0,1,1,1,0,0,...,0,0,1,1,0,0,0,0,0,Liberal


In [8]:
# split into X and Y
Y = merged_dfs['political_views']
X = merged_dfs.drop(['political_views', 'RespondentID'], axis=1)

print(X.shape)
print(Y.shape)

# convert to numpy arrays
X = np.array(X)

(1021, 56)
(1021,)


In [34]:
# show Y
Y.head()

0    Moderate
1    Moderate
2     Liberal
3     Liberal
4     Liberal
Name: political_views, dtype: object

In [10]:
# work with labels
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [11]:
print(encoded_Y)

[2 2 1 ... 2 2 2]


In [12]:
print(dummy_y)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [13]:
#split into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, dummy_y, random_state=3, train_size=0.80)

In [14]:
number_inputs= len(X[0])
hidden_nodes_1 = 8
hidden_nodes_2 =8
hidden_nodes_3=8


In [40]:
# build a model
model = Sequential()
model.add(Dense(units=hidden_nodes_1, activation='relu', input_dim= number_inputs))
#model.add(Dense(units=hidden_nodes_2, activation='relu'))
#model.add(Dense(units=hidden_nodes_3, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 8)                 456       
                                                                 
 dense_12 (Dense)            (None, 3)                 27        
                                                                 
Total params: 483
Trainable params: 483
Non-trainable params: 0
_________________________________________________________________


In [41]:
# compile the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [42]:
import keras
from keras.callbacks import EarlyStopping

# early stopping callback
# This callback will stop the training when there is no improvement in  
# the validation loss for 10 consecutive epochs.  
es = keras.callbacks.EarlyStopping(monitor='val_loss', 
                                   mode='min',
                                   patience=10, 
                                   restore_best_weights=True) # important - otherwise you just return the last weigths...



In [43]:
# now we just update our model fit call
history = model.fit(X_train,
                    y_train,
                    epochs=100, 
                    verbose=1,
                    callbacks=[es],
                    batch_size=10,
                    shuffle=True,
                    validation_split=0.2)
                    

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


In [44]:

history_dict = history.history

# learning curve
# accuracy
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']


In [45]:
# loss
loss = history_dict['loss']
val_loss = history_dict['val_loss']


In [46]:
# range of X (no. of epochs)
epochs = range(1, len(acc) + 1)
epochs

range(1, 29)

In [47]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

preds = model.predict(X_test) # see how the model did!




In [48]:
print(preds) 

[[0.18725228 0.28526843 0.52747923]
 [0.01357383 0.75582945 0.23059669]
 [0.0373091  0.54346406 0.4192269 ]
 [0.2808528  0.32803568 0.3911115 ]
 [0.32277486 0.13760681 0.5396184 ]
 [0.3672458  0.17838435 0.45436984]
 [0.3281738  0.19514132 0.47668493]
 [0.5087459  0.13407119 0.35718295]
 [0.5634117  0.03529838 0.40128988]
 [0.01678196 0.6557025  0.32751557]
 [0.09033232 0.14346536 0.7662023 ]
 [0.04338523 0.36390415 0.5927106 ]
 [0.68433434 0.06903674 0.24662893]
 [0.01544584 0.7928865  0.19166772]
 [0.4006362  0.0940701  0.5052936 ]
 [0.4997635  0.07686107 0.42337546]
 [0.56860214 0.05708667 0.37431118]
 [0.579428   0.06344701 0.35712504]
 [0.1361297  0.3200514  0.54381883]
 [0.14722541 0.06786756 0.78490704]
 [0.05581466 0.4920402  0.4521452 ]
 [0.25220406 0.11214209 0.6356539 ]
 [0.20000313 0.23296282 0.56703407]
 [0.03632656 0.21576597 0.74790746]
 [0.07566327 0.5572878  0.36704886]
 [0.07694013 0.585649   0.33741093]
 [0.01715343 0.6552897  0.32755685]
 [0.54627216 0.07623711 0.37

In [49]:
matrix = confusion_matrix(y_test.argmax(axis=1), preds.argmax(axis=1))
matrix

array([[41,  3, 12],
       [ 1, 42, 20],
       [19, 28, 39]], dtype=int64)

In [50]:
print(classification_report(y_test.argmax(axis=1), preds.argmax(axis=1)))

              precision    recall  f1-score   support

           0       0.67      0.73      0.70        56
           1       0.58      0.67      0.62        63
           2       0.55      0.45      0.50        86

    accuracy                           0.60       205
   macro avg       0.60      0.62      0.61       205
weighted avg       0.59      0.60      0.59       205

