In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import sklearn
# from matplotlib import pyplot as plt


# Create Neural Network Model Function

In [2]:
# Create neural network model function
# Takes a teds_cleaned_df argument that contains the model input features and target
# Appends its performance results to the module-level nn_model_results dataframe
def nn_model_1(teds_cleaned_df):
    """Train and evaluate a small dense neural network on one cleaned dataset.

    Parameters
    ----------
    teds_cleaned_df : pandas.DataFrame
        Must contain the binary target column "SUCCESSFUL" and the column
        "REASON"; both are removed from the feature matrix and every other
        column is used as a model input. The value in the first row of the
        first column is recorded as the year (presumably a discharge-year
        column -- confirm against the data cleaning step).

    Returns
    -------
    tuple
        ``(nn_model_results, nn_cm_df)`` where ``nn_model_results`` is the
        module-level results frame with one row ``[year, loss, accuracy]``
        appended, and ``nn_cm_df`` is the 2x2 test-set confusion matrix.

    Notes
    -----
    This function reads and mutates the module-level ``nn_model_results``
    DataFrame; it must be defined before the function is called.
    """

    # Split our preprocessed data into our features and target arrays.
    # `columns=` replaces the positional axis argument, which pandas has
    # deprecated and which raises TypeError on pandas >= 2.0.
    y = teds_cleaned_df["SUCCESSFUL"].values
    X = teds_cleaned_df.drop(columns=["REASON", "SUCCESSFUL"]).values

    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

    # Fit the scaler on the training split only, then scale both splits
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Define the model - deep neural net
    number_input_features = X_train.shape[1]
    hidden_nodes_layer1 = 25
    hidden_nodes_layer2 = 20

    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(
        tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
    )

    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

    # Output layer: single sigmoid unit for the binary target
    nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # Train the model (the training history is not used, so it is not kept)
    nn.fit(X_train_scaled, y_train, epochs=20)

    # Evaluate the model using the test data
    model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

    # Build the confusion matrix: round the sigmoid outputs to 0/1 and
    # flatten the (n, 1) prediction array to a 1-D label vector.
    y_pred = np.rint(nn.predict(X_test_scaled)).ravel()
    confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)

    # Create a DataFrame from the confusion matrix.
    nn_cm_df = pd.DataFrame(
        confusion_matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

    # Get the current year from the dataframe (first row, first column)
    year = teds_cleaned_df.iat[0, 0]

    # Append results to the module-level results frame
    nn_model_results.loc[len(nn_model_results)] = [year, model_loss, model_accuracy]

    # Return results
    return nn_model_results, nn_cm_df

# Create Random Forest Classifier Model Function

In [3]:
# Create Random Forest Classifier model function
def rf_model_1(teds_cleaned_df):
    """Train and evaluate a random forest classifier on one cleaned dataset.

    Parameters
    ----------
    teds_cleaned_df : pandas.DataFrame
        Must contain the target column "SUCCESSFUL" and the column "REASON";
        both are removed from the feature matrix and every other column is
        used as a model input. The value in the first row of the first
        column is recorded as the year (presumably a discharge-year column
        -- confirm against the data cleaning step).

    Returns
    -------
    tuple
        ``(rf_model_results, rf_cm_df, features_df)`` where
        ``rf_model_results`` is the module-level results frame with one row
        ``[year, training score, testing score]`` appended, ``rf_cm_df`` is
        the 2x2 test-set confusion matrix, and ``features_df`` lists each
        feature with its importance, highest first.

    Notes
    -----
    This function reads and mutates the module-level ``rf_model_results``
    DataFrame; it must be defined before the function is called.
    """

    # Split our preprocessed data into our features and target arrays.
    # X stays a DataFrame so the column names are available for the
    # feature-importance table below.
    y = teds_cleaned_df["SUCCESSFUL"].values
    X = teds_cleaned_df.drop(columns=["REASON", "SUCCESSFUL"])

    # Split the preprocessed data into a training and testing dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)

    # Fit the scaler on the training split only, then scale both splits
    X_scaler = StandardScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Create and fit a random forest classifier.
    rf_model = RandomForestClassifier(n_estimators=128, random_state=78)
    rf_model.fit(X_train_scaled, y_train)

    # Evaluate the model. The test accuracy is computed once from y_pred and
    # reused as the testing score; rf_model.score(X_test_scaled, y_test)
    # would return the same value but predict the whole test set again.
    y_pred = rf_model.predict(X_test_scaled)
    RF_Testing_Score = accuracy_score(y_test, y_pred)
    print(f" Random forest predictive accuracy: {RF_Testing_Score:.3f}")

    # Build the confusion matrix (predictions are already class labels)
    confusion_matrix_rf = sklearn.metrics.confusion_matrix(y_test, y_pred)

    # Create a DataFrame from the confusion matrix.
    rf_cm_df = pd.DataFrame(
        confusion_matrix_rf, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

    # Create data frame of feature importances, most important first.
    # The ascending pre-sort is kept so the row labels of the resulting frame
    # stay the same as in previously produced output.
    features = sorted(zip(X.columns, rf_model.feature_importances_), key=lambda pair: pair[1])
    features_df = pd.DataFrame(features, columns=["Feature", "Importance value"]).sort_values(
        ['Importance value'], ascending=False)

    # Score the model on the training split (testing score computed above)
    RF_Training_Score = rf_model.score(X_train_scaled, y_train)

    # Get the current year from the dataframe (first row, first column)
    year = teds_cleaned_df.iat[0, 0]

    # Append results to the module-level results frame
    rf_model_results.loc[len(rf_model_results)] = [year, RF_Training_Score, RF_Testing_Score]

    # Return results
    return rf_model_results, rf_cm_df, features_df

# Run Machine Learning models on multi-year data and output results

In [4]:
# Run both models on the cleaned TEDS CSV file for each year and export:
#   - one confusion matrix per model per year,
#   - one random forest feature-importance table per year,
#   - per-model and merged summary tables across all years.

# Years to process; every input/output path below is derived from this list
YEARS = [2015, 2016, 2017, 2018, 2019]

# Input and output locations (relative to the notebook's working directory)
DATA_DIR = Path('Resources')
RESULTS_DIR = DATA_DIR / 'ML_model_results'

# Make sure the output directory exists; to_csv does not create it
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Cleaned input datasets, one per year
file_paths = [DATA_DIR / f'teds_{year}_cleaned.csv' for year in YEARS]

# Output files for the nn model confusion matrices
nn_output_file_paths = [RESULTS_DIR / f'teds_{year}_nn_model_cm.csv' for year in YEARS]

# Output files for the rf model confusion matrices
rf_output_file_paths = [RESULTS_DIR / f'teds_{year}_rf_model_cm.csv' for year in YEARS]

# Output files for the rf model feature importances
rf_fi_output_file_paths = [RESULTS_DIR / f'teds_{year}_rf_model_fi.csv' for year in YEARS]

# Create empty dataframes to store results from the NN and RF model per year.
# nn_model_1 and rf_model_1 append to these module-level frames, so they are
# re-created here to keep this cell idempotent on re-run.
nn_model_results = pd.DataFrame(columns=['Year', 'NN loss', 'NN accuracy'])
rf_model_results = pd.DataFrame(columns=['Year', 'RF Training Score', 'RF Testing Score'])


# Loop through the data files for each year: load data, run models, export results.
for (file_path, nn_output_file_path, rf_output_file_path, rf_fi_output_file_path) in \
    zip(file_paths, nn_output_file_paths, rf_output_file_paths, rf_fi_output_file_paths):

    # Read dataset into dataframe
    teds_cleaned_df = pd.read_csv(file_path)

    # Run nn model with teds_cleaned_df and get results
    nn_model_results, nn_cm_df = nn_model_1(teds_cleaned_df)
    # Export neural network model confusion matrix to .csv
    nn_cm_df.to_csv(nn_output_file_path, encoding='utf-8', index=True)

    # Run RF model with teds_cleaned_df and get results
    rf_model_results, rf_cm_df, features_df = rf_model_1(teds_cleaned_df)
    # Export rf model confusion matrix to .csv
    rf_cm_df.to_csv(rf_output_file_path, encoding='utf-8', index=True)
    # Export rf model feature importance list to .csv
    features_df.to_csv(rf_fi_output_file_path, encoding='utf-8', index=False)


# Export NN model results across years (loss and accuracy) to .csv file
output_path = RESULTS_DIR / 'teds_nn_model_results.csv'
nn_model_results = nn_model_results.round(3)
nn_model_results.to_csv(output_path, encoding='utf-8', index=False)

# Export RF model results across years (training and testing scores) to .csv file
output_path = RESULTS_DIR / 'teds_rf_model_results.csv'
rf_model_results = rf_model_results.round(3)
rf_model_results.to_csv(output_path, encoding='utf-8', index=False)

# Merge nn model results and rf model results into single dataframe and output to .csv file
model_results = pd.merge(nn_model_results, rf_model_results, on='Year')
model_results.Year = model_results.Year.astype(int)
model_results = model_results.round(3)
output_path = RESULTS_DIR / 'Teds_model_results.csv'
model_results.to_csv(output_path, encoding='utf-8', index=False)

# # Loop through and import cleaned dataset .csv files, call machine learning functions to get model results, output results to .csv files
# for (file_path, output_file_path) in zip(file_paths, output_file_paths):
#     # Clean data with clean_data function with specified variables 
#     nn_model
#     # Print DISYR column for year after each read 
#     print(teds_cleaned_df.iat[0,0])
#     # Export cleaned dataframe to .csv file
#     teds_cleaned_df.to_csv(output_file_path, encoding='utf-8', index=False)

  if __name__ == '__main__':


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
1023/1023 - 1s - loss: 0.4949 - accuracy: 0.7655 - 901ms/epoch - 881us/step
Loss: 0.49494606256484985, Accuracy: 0.7654841542243958
 Random forest predictive accuracy: 0.759


  if __name__ == '__main__':


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
349/349 - 0s - loss: 0.5344 - accuracy: 0.7313 - 365ms/epoch - 1ms/step
Loss: 0.5344115495681763, Accuracy: 0.7312589883804321
 Random forest predictive accuracy: 0.735


  if __name__ == '__main__':


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
2680/2680 - 2s - loss: 0.5114 - accuracy: 0.7532 - 2s/epoch - 698us/step
Loss: 0.5114250779151917, Accuracy: 0.7532047629356384
 Random forest predictive accuracy: 0.752


  if __name__ == '__main__':


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
3122/3122 - 2s - loss: 0.4601 - accuracy: 0.7863 - 2s/epoch - 762us/step
Loss: 0.46011796593666077, Accuracy: 0.7862734794616699
 Random forest predictive accuracy: 0.782


  if __name__ == '__main__':


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
3876/3876 - 3s - loss: 0.4669 - accuracy: 0.7848 - 3s/epoch - 714us/step
Loss: 0.4668596684932709, Accuracy: 0.7848065495491028
 Random forest predictive accuracy: 0.780


In [5]:

# Preview the merged table: NN and RF results joined on 'Year'
model_results.head()

Unnamed: 0,Year,NN loss,NN accuracy,RF Training Score,RF Testing Score
0,2015,0.495,0.765,0.991,0.759
1,2016,0.534,0.731,0.991,0.735
2,2017,0.511,0.753,0.983,0.752
3,2018,0.46,0.786,0.993,0.782
4,2019,0.467,0.785,0.988,0.78


In [6]:
# Preview the per-year NN loss/accuracy table (rounded to 3 decimals).
# Only the merged model_results frame casts 'Year' to int.
nn_model_results.head()

Unnamed: 0,Year,NN loss,NN accuracy
0,2015.0,0.495,0.765
1,2016.0,0.534,0.731
2,2017.0,0.511,0.753
3,2018.0,0.46,0.786
4,2019.0,0.467,0.785


In [7]:
# features_df is reassigned on every loop iteration, so this shows the
# feature importances from the last file processed (the 2019 dataset).
features_df.head(20)

Unnamed: 0,Feature,Importance value
27,AGE,0.115287
26,FRSTUSE1,0.091699
25,EDUC,0.069402
24,PSOURCE,0.061756
23,DSMCRIT,0.057082
22,REGION,0.053323
21,EMPLOY,0.052413
20,MARSTAT,0.049351
19,RACE,0.04681
18,SUB2,0.040613


In [8]:

# Preview the per-year RF training/testing score table (rounded to 3 decimals)
rf_model_results.head()

Unnamed: 0,Year,RF Training Score,RF Testing Score
0,2015.0,0.991,0.759
1,2016.0,0.991,0.735
2,2017.0,0.983,0.752
3,2018.0,0.993,0.782
4,2019.0,0.988,0.78


In [9]:
# rf_cm_df is reassigned on every loop iteration, so this shows the RF
# confusion matrix from the last file processed (the 2019 dataset).
rf_cm_df.head()

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,88630,6297
Actual 1,21044,8043


# Updates required for the database functionality. To be added later.

In [10]:
# # Load cleaned dataset from SQL database
# # Import dependencies
# from sqlalchemy import create_engine
# from config import db_password

# # Create connection to PostgreSQL database
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/TEDS"
# engine = create_engine(db_string)

# # Pull teds_cleaned data from SQL into a dataframe
# teds_cleaned_df = pd.read_sql_table(
#     'TEDS',
#     con=engine
# )

# teds_cleaned_df.head()

In [11]:
# # Export model results to SQL database.
# from sqlalchemy import create_engine
# from config import db_password

# # Create connection to PostgreSQL database
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/TEDS"
# engine = create_engine(db_string)
    

# # Export Confusion matrix df to SQL database
# nn_cm_df.to_sql(name='NN_CM', con=engine, if_exists='replace')

In [12]:
# # Export model results to SQL database
# from sqlalchemy import create_engine
# from config import db_password

# # Create connection to PostgreSQL database
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/TEDS"
# engine = create_engine(db_string)
    

# # Export Confusion matrix df to SQL database
# rf_cm_df.to_sql(name='RF_CM', con=engine, if_exists='replace')

# # Export feature importance
# features_df.to_sql(name='RFModel_feature_Importance', index=False, con=engine, if_exists='replace')