In [1]:
#
# Part 1 - load the data and required modules
#
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve
import pandas as pd
import numpy as np
from six import StringIO
from matplotlib import pyplot as plt
import os

# adjust Pandas printing behavior
pd.set_option("display.max_rows", None)

# read data from file
os.chdir("C:\\temp")
df = pd.read_csv("./countywide_covid_19_data.csv")

# inspect the dataframe, verify it looks ok
print("")
print("First 10 features:")
print(df.iloc[0:9,1:13])  # state + county + first 10 features
print("")
print("Labels:")
print(df.iloc[0:9,[1,2,-5,-4,-3,-2,-1]]) # state + county + labels

# inspect categorical variables
print("")
print("Distribution of PSI labels:")
print("Label Count")
print("----- -----")
PSI = df["Y04"].astype('category')
print(PSI.value_counts().sort_index())

print("")
print("Distribution of DRP labels:")
print("Label Count")
print("----- -----")
DRP = df["Y05"].astype('category')
print(DRP.value_counts().sort_index())

# read meta data from file
mdf = pd.read_csv("./countywide_covid_19_metadata.csv")

# inspect the metadataframe, verify it looks ok
print("")
print("Metadata:")
print(mdf.iloc[:,0:3])

#
# Part 2 - select features and label
#

# extract features and labels
features =  [ "X{:02d}".format(i) for i in range(1,11)]
label    = "Y05"
X = df[features]
Y = df[label]

# get metadata descriptions
X_desc = mdf[mdf.Code.isin(X.columns.values)]
print("")
print("Selected features:")
print(X_desc.iloc[:,0:3])
Y_desc = mdf[mdf.Code == label]
print("")
print("Selected label:")
print(Y_desc.iloc[:,0:3])

# encode labels per needs of RF algorithm
Y = pd.get_dummies(Y)

#
# Part 3 - create and train the model
#

# split into train and test datasets
X_train, X_test, \
Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)

# create and train the classifier
model = RandomForestClassifier(\
    criterion='gini',    \
    oob_score=True,         \
    random_state=1)
model.fit(X_train, Y_train)

# feature importance
fimp = pd.DataFrame(model.feature_importances_, columns=["Importance"])
fimp["Feature"] = X.columns
fimp["Description"] = X_desc["Variable"].to_list()
fimp = fimp[["Feature", "Importance", "Description"]]
fimp = fimp.sort_values("Importance", ascending=False)
print("")
print("Feature  Importance  Description")
for index,item in fimp.iterrows():
    print("{:7s}  {:10.8f}  {:s}".format(item["Feature"],item["Importance"], item["Description"]))
print("")

#
# Part 4 - evaluate the model using test data
#

n_test    = len(Y_test)
n_classes = Y.shape[1]

# probs = probability of 0 and 1 for each class for each entry in the test data
probs = model.predict_proba(X_test)

# pi = probability of 1 for each class for each entry in the test data
pi = [ [ probs[i][j][1] for i in range(0,n_classes)] for j in range(0,n_test)]

# Y_pred = most likely class for each entry in the test data
Y_pred = [1+np.argmax(pi[i]) for i in range(0,n_test)]

# decode one-hot values
Y_actual = [1+np.argmax(Y_test.iloc[i,:]) for i in range(0,n_test)]

ca = accuracy_score(Y_actual, Y_pred)
print("Classification Accuracy : {:.2f}".format(ca))

# confusion matrix
cm = confusion_matrix(Y_actual, Y_pred)
C = set(list(Y_pred)+list(Y_actual)) # category names
cm = pd.DataFrame(cm,index=C,columns=C)

print("")
print("Confusion Matrix")
print(cm)
print("")



First 10 features:
  State           County  X01     X02  X03  X04  X05   X06   X07   X08   X09  \
0    AL   Autauga County    0   55869 -0.3    2    2  11.3  32.6  28.4  27.7   
1    AL   Baldwin County    5  223234  0.4    3    2   9.7  27.6  31.3  31.3   
2    AL   Barbour County    3   24686  0.5    6    6  27.0  35.7  25.1  12.2   
3    AL      Bibb County    0   22394  0.4    1    1  16.8  47.3  24.4  11.5   
4    AL    Blount County    0   57826  0.1    1    1  19.8  34.0  33.5  12.6   
5    AL   Bullock County    3   10101 -0.1    6    6  24.8  39.7  22.3  13.3   
6    AL    Butler County    0   19448  0.9    6    6  15.4  43.9  24.6  16.1   
7    AL   Calhoun County    4  113605  0.1    3    2  15.9  32.4  33.7  18.0   
8    AL  Chambers County    0   33254  0.2    6    5  18.6  38.4  29.7  13.2   

    X10  
0  13.8  
1   9.8  
2  30.9  
3  21.8  
4  13.2  
5  42.5  
6  24.5  
7  19.5  
8  18.7  

Labels:
  State           County    Y01  Y02   Y03  Y04  Y05
0    AL   Autauga

In [2]:
#
# Part 1 - load the data and required modules
#
import sys

from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve
import pandas as pd
import numpy as np
from six import StringIO
from matplotlib import pyplot as plt
import os

# adjust Pandas printing behavior
pd.set_option("display.max_rows", None)

# read data from file
os.chdir("C:\\temp")
df = pd.read_csv("./countywide_covid_19_data.csv")

# inspect the dataframe, verify it looks ok
print("")
print("First 10 features:")
print(df.iloc[0:9,1:13])  # state + county + first 10 features
print("")
print("Labels:")
print(df.iloc[0:9,[1,2,-5,-4,-3,-2,-1]]) # state + county + labels

# inspect categorical variables
print("")
print("Distribution of PSI labels:")
print("Label Count")
print("----- -----")
PSI = df["Y04"].astype('category')
print(PSI.value_counts().sort_index())

print("")
print("Distribution of DRP labels:")
print("Label Count")
print("----- -----")
DRP = df["Y05"].astype('category')
print(DRP.value_counts().sort_index())

# read meta data from file
mdf = pd.read_csv("./countywide_covid_19_metadata.csv")

# inspect the metadataframe, verify it looks ok
print("")
print("Metadata:")
print(mdf.iloc[:,0:3])

#
# Part 2 - select features and label
#

# extract features and labels
features =  [ "X{:02d}".format(i) for i in range(1,11)]
label    = "Y05"
X = df[features]
Y = df[label]

# get metadata descriptions
X_desc = mdf[mdf.Code.isin(X.columns.values)]
print("")
print("Selected features:")
print(X_desc.iloc[:,0:3])
Y_desc = mdf[mdf.Code == label]
print("")
print("Selected label:")
print(Y_desc.iloc[:,0:3])

# encode labels per needs of SVM algorithm
Y = pd.get_dummies(Y)

#
# Part 3 - create and train the model
#

# split into train and test datasets
X_train, X_test, \
Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)

# scale the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train.astype(np.float))
X_test  = scaler.fit_transform(X_test.astype(np.float))

# SVM wants 0-based integer coding for Y_test and Y_train
Y_train_as_int = [np.argmax(Y_train.iloc[i,:]) for i in range(0,len(Y_train))]
Y_test_as_int  = [np.argmax(Y_test.iloc[i,:])  for i in range(0,len(Y_test))]

# create and train the classifier
model = svm.SVC(               
    kernel='rbf',              
    gamma='scale',             
    probability = True,        
    random_state = 1,          
    tol = 1e-6,                
    class_weight = 'balanced', 
    C = 1.0)
model.fit(X_train, Y_train_as_int)

#
# Part 4 - evaluate the model using test data
#

n_test    = len(Y_test)
n_classes = Y.shape[1]

# probs = probability of 0 and 1 for each class for each entry in the test data
probs = model.predict_proba(X_test)

# pi = probability list
pi = [ [ probs[j][i] for i in range(0,n_classes)] for j in range(0,n_test)] 

# Y_pred = most likely class for each entry in the test data
Y_pred = [1+np.argmax(pi[i]) for i in range(0,n_test)]

# decode one-hot values
Y_actual = [1+np.argmax(Y_test.iloc[i,:]) for i in range(0,n_test)]

ca = accuracy_score(Y_actual, Y_pred)
print("Classification Accuracy : {:.2f}".format(ca))

# confusion matrix
cm = confusion_matrix(Y_actual, Y_pred)
C = set(list(Y_pred)+list(Y_actual)) # category names
cm = pd.DataFrame(cm,index=C,columns=C)

print("")
print("Confusion Matrix")
print(cm)
print("")



First 10 features:
  State           County  X01     X02  X03  X04  X05   X06   X07   X08   X09  \
0    AL   Autauga County    0   55869 -0.3    2    2  11.3  32.6  28.4  27.7   
1    AL   Baldwin County    5  223234  0.4    3    2   9.7  27.6  31.3  31.3   
2    AL   Barbour County    3   24686  0.5    6    6  27.0  35.7  25.1  12.2   
3    AL      Bibb County    0   22394  0.4    1    1  16.8  47.3  24.4  11.5   
4    AL    Blount County    0   57826  0.1    1    1  19.8  34.0  33.5  12.6   
5    AL   Bullock County    3   10101 -0.1    6    6  24.8  39.7  22.3  13.3   
6    AL    Butler County    0   19448  0.9    6    6  15.4  43.9  24.6  16.1   
7    AL   Calhoun County    4  113605  0.1    3    2  15.9  32.4  33.7  18.0   
8    AL  Chambers County    0   33254  0.2    6    5  18.6  38.4  29.7  13.2   

    X10  
0  13.8  
1   9.8  
2  30.9  
3  21.8  
4  13.2  
5  42.5  
6  24.5  
7  19.5  
8  18.7  

Labels:
  State           County    Y01  Y02   Y03  Y04  Y05
0    AL   Autauga

In [3]:
#
# Part 1 - load the data and required modules
#
import sys

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve
import pandas as pd
import numpy as np
from six import StringIO
from matplotlib import pyplot as plt
import os

# adjust Pandas printing behavior
pd.set_option("display.max_rows", None)

# read data from file
os.chdir("C:\\temp")
df = pd.read_csv("./countywide_covid_19_data.csv")

# inspect the dataframe, verify it looks ok
print("")
print("First 10 features:")
print(df.iloc[0:9,1:13])  # state + county + first 10 features
print("")
print("Labels:")
print(df.iloc[0:9,[1,2,-5,-4,-3,-2,-1]]) # state + county + labels

# inspect categorical variables
print("")
print("Distribution of PSI labels:")
print("Label Count")
print("----- -----")
PSI = df["Y04"].astype('category')
print(PSI.value_counts().sort_index())

print("")
print("Distribution of DRP labels:")
print("Label Count")
print("----- -----")
DRP = df["Y05"].astype('category')
print(DRP.value_counts().sort_index())

# read meta data from file
mdf = pd.read_csv("./countywide_covid_19_metadata.csv")

# inspect the metadataframe, verify it looks ok
print("")
print("Metadata:")
print(mdf.iloc[:,0:3])

#
# Part 2 - select features and label
#

# extract features and labels
features =  [ "X{:02d}".format(i) for i in range(1,11)]
label    = "Y05"
X = df[features]
Y = df[label]

# get metadata descriptions
X_desc = mdf[mdf.Code.isin(X.columns.values)]
print("")
print("Selected features:")
print(X_desc.iloc[:,0:3])
Y_desc = mdf[mdf.Code == label]
print("")
print("Selected label:")
print(Y_desc.iloc[:,0:3])

# encode labels per needs of SVM algorithm
Y = pd.get_dummies(Y)

#
# Part 3 - create and train the model
#

# split into train and test datasets
X_train, X_test, \
Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)

# scale the feature data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train.astype(np.float))
X_test  = scaler.fit_transform(X_test.astype(np.float))

# create and train the classifier
model = KNeighborsClassifier(
    n_neighbors = 25,
    weights = 'distance',
    algorithm = 'auto',
    metric = 'euclidean')
model.fit(X_train, Y_train)

#
# Part 4 - evaluate the model using test data
#

n_test    = len(Y_test)
n_classes = Y.shape[1]

# probs = probability of 0 and 1 for each class for each entry in the test data
probs = model.predict_proba(X_test)

# pi = probability list
pi = [ [ probs[i][j][1] for i in range(0,n_classes)] for j in range(0,n_test)] 

# Y_pred = most likely class for each entry in the test data
Y_pred = [1+np.argmax(pi[i]) for i in range(0,n_test)]

# decode one-hot values
Y_actual = [1+np.argmax(Y_test.iloc[i,:]) for i in range(0,n_test)]

ca = accuracy_score(Y_actual, Y_pred)
print("Classification Accuracy : {:.2f}".format(ca))

# confusion matrix
cm = confusion_matrix(Y_actual, Y_pred)
C = set(list(Y_pred)+list(Y_actual)) # category names
cm = pd.DataFrame(cm,index=C,columns=C)

print("")
print("Confusion Matrix")
print(cm)
print("")


First 10 features:
  State           County  X01     X02  X03  X04  X05   X06   X07   X08   X09  \
0    AL   Autauga County    0   55869 -0.3    2    2  11.3  32.6  28.4  27.7   
1    AL   Baldwin County    5  223234  0.4    3    2   9.7  27.6  31.3  31.3   
2    AL   Barbour County    3   24686  0.5    6    6  27.0  35.7  25.1  12.2   
3    AL      Bibb County    0   22394  0.4    1    1  16.8  47.3  24.4  11.5   
4    AL    Blount County    0   57826  0.1    1    1  19.8  34.0  33.5  12.6   
5    AL   Bullock County    3   10101 -0.1    6    6  24.8  39.7  22.3  13.3   
6    AL    Butler County    0   19448  0.9    6    6  15.4  43.9  24.6  16.1   
7    AL   Calhoun County    4  113605  0.1    3    2  15.9  32.4  33.7  18.0   
8    AL  Chambers County    0   33254  0.2    6    5  18.6  38.4  29.7  13.2   

    X10  
0  13.8  
1   9.8  
2  30.9  
3  21.8  
4  13.2  
5  42.5  
6  24.5  
7  19.5  
8  18.7  

Labels:
  State           County    Y01  Y02   Y03  Y04  Y05
0    AL   Autauga

In [4]:
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
tf.__version__

'1.14.0'

In [6]:
print(tf.matmul([[2.]], [[3.]]).numpy())

AttributeError: 'Tensor' object has no attribute 'numpy'

In [7]:
print(tf.matmul([[2.]], [[3.]]))

Tensor("MatMul_1:0", shape=(1, 1), dtype=float32)


In [8]:
import tensorflow.keras as keras

In [9]:
keras.__version__

'2.2.4-tf'

In [10]:
import tensorflow.keras.utils as keras_utils

In [11]:
keras_utils.to_categorical(list(range(4)),5)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [12]:
# suppress warnings to avoid output clutter
import warnings
warnings.filterwarnings(action = "ignore", category=DeprecationWarning)
warnings.filterwarnings(action = "ignore", category=FutureWarning)

In [13]:
# tensorflow and keras modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import utils
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Activation
from tensorflow.python.keras.layers import Dropout
from tensorflow.keras.optimizers import RMSprop

In [14]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [15]:
# adjust Pandas printing behavior
pd.set_option("display.max_rows", None)

# read data from file
os.chdir("C:\\temp")
df = pd.read_csv("./countywide_covid_19_data.csv")

# inspect the dataframe, verify it looks ok
print("")
print("First 10 features:")
print(df.iloc[0:9,1:13])  # state + county + first 10 features
print("")
print("Labels:")
print(df.iloc[0:9,[1,2,-5,-4,-3,-2,-1]]) # state + county + labels

# inspect categorical variables
print("")
print("Distribution of PSI labels:")
print("Label Count")
print("----- -----")
PSI = df["Y04"].astype('category')
print(PSI.value_counts().sort_index())

print("")
print("Distribution of DRP labels:")
print("Label Count")
print("----- -----")
DRP = df["Y05"].astype('category')
print(DRP.value_counts().sort_index())

# read meta data from file
mdf = pd.read_csv("./countywide_covid_19_metadata.csv")

# inspect the metadataframe, verify it looks ok
print("")
print("Metadata:")
print(mdf.iloc[:,0:3])



First 10 features:
  State           County  X01     X02  X03  X04  X05   X06   X07   X08   X09  \
0    AL   Autauga County    0   55869 -0.3    2    2  11.3  32.6  28.4  27.7   
1    AL   Baldwin County    5  223234  0.4    3    2   9.7  27.6  31.3  31.3   
2    AL   Barbour County    3   24686  0.5    6    6  27.0  35.7  25.1  12.2   
3    AL      Bibb County    0   22394  0.4    1    1  16.8  47.3  24.4  11.5   
4    AL    Blount County    0   57826  0.1    1    1  19.8  34.0  33.5  12.6   
5    AL   Bullock County    3   10101 -0.1    6    6  24.8  39.7  22.3  13.3   
6    AL    Butler County    0   19448  0.9    6    6  15.4  43.9  24.6  16.1   
7    AL   Calhoun County    4  113605  0.1    3    2  15.9  32.4  33.7  18.0   
8    AL  Chambers County    0   33254  0.2    6    5  18.6  38.4  29.7  13.2   

    X10  
0  13.8  
1   9.8  
2  30.9  
3  21.8  
4  13.2  
5  42.5  
6  24.5  
7  19.5  
8  18.7  

Labels:
  State           County    Y01  Y02   Y03  Y04  Y05
0    AL   Autauga

In [16]:
#
# Part 2 - select features and label
#

# extract features and labels
features =  [ "X{:02d}".format(i) for i in range(1,11)]
label    = "Y05"
X = df[features]
Y_raw = df[label]

# get metadata descriptions
X_desc = mdf[mdf.Code.isin(X.columns.values)]
print("")
print("Selected features:")
print(X_desc.iloc[:,0:3])
Y_desc = mdf[mdf.Code == label]
print("")
print("Selected label:")
print(Y_desc.iloc[:,0:3])

# encode labels per needs of SVM algorithm
Y = pd.get_dummies(Y_raw)



Selected features:
   Code               Variable  \
3   X01           ECONOMY_TYPE   
4   X02             POPULATION   
5   X03       IMMIGRATION_RATE   
6   X04  RURAL_URBAN_CONTINUUM   
7   X05        URBAN_INFLUENCE   
8   X06      PCT_NO_HS_DIPLOMA   
9   X07    PCT_HS_DIPLOMA_ONLY   
10  X08       PCT_SOME_COLLEGE   
11  X09       PCT_BS_OR_HIGHER   
12  X10         PCT_IN_POVERTY   

                                          Description  
3   County economic types: 0=Nonspecialized 1=Farm...  
4               Estimated resident population in 2019  
5   Net international migration rate (migrants per...  
6   A code indicating level of urbanization. 1 = m...  
7   A code indicating nearness of urban areas. 1 =...  
8   Percent of population with no high school diploma  
9   Percent of population with only a high school ...  
10            Percent of population with some college  
11  Percent of population with bachelor's degree o...  
12   Percent of population living below pover

In [17]:

#
# Part 3 - create and train the model
#

# split into train and test datasets
X_train, X_test, \
Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)

# scale the feature data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train.astype(np.float))
X_test  = scaler.fit_transform(X_test.astype(np.float))

# assign hyperparameters
classes = list(set(Y_raw))
num_classes = len(classes)
num_epochs = 200
num_batch = 25
learn_rate = 0.01

# create and train the classifier
# create the classifier
model = Sequential() # add layers one at a time
model.add(Dense(20, input_shape = X_train.shape[1:]))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.summary()

# select loss function and optimizer
loss_func = 'categorical_crossentropy'
opt = RMSprop(lr = learn_rate, decay = 1e-6)
model.compile(loss = loss_func,
              optimizer = opt,
              metrics=['accuracy'])

# train the classifier
results = model.fit(X_train, Y_train,
                    batch_size = num_batch,
                    epochs = num_epochs,
                    validation_data=(X_test, Y_test),
                    shuffle = True, verbose = 2)

# Plot training & validation accuracy values
plt.plot(results.history['accuracy'])
plt.plot(results.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(results.history['loss'])
plt.plot(results.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()




Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                220       
_________________________________________________________________
activation (Activation)      (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                210       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                110       
_________________________________________________________________
activation_1 (Activation)    (None, 10)                0         
Total params: 540
Trainable params: 540
Non-trainable params: 0
_________________________________________________________________
Train 

KeyError: 'accuracy'