In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, concatenate
from tensorflow.keras.optimizers import SGD, Adagrad, RMSprop, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Import our input dataset
df = pd.read_csv('charity_data.csv')
df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# clean with drop na and duplicates
df = df.drop_duplicates()
df = df.dropna()
df.describe()

Unnamed: 0,EIN,STATUS,ASK_AMT,IS_SUCCESSFUL
count,34299.0,34299.0,34299.0,34299.0
mean,519185200.0,0.999854,2769199.0,0.532406
std,245147200.0,0.012073,87130450.0,0.498956
min,10520600.0,0.0,5000.0,0.0
25%,274848200.0,1.0,5000.0,0.0
50%,465631700.0,1.0,5000.0,1.0
75%,752611700.0,1.0,7742.0,1.0
max,996086900.0,1.0,8597806000.0,1.0


In [3]:
# drop non-useful columns
# df = df.drop(["EIN","NAME","STATUS","SPECIAL_CONSIDERATIONS"],axis=1)
df = df.drop(["EIN"],axis=1)
df.head()

Unnamed: 0,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
# generate list of categorical variables
cat = df.dtypes[df.dtypes=="object"].index.tolist()
cat

['NAME',
 'APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS']

In [5]:
# check number of unique values in each category for grouping/encoding
df[cat].nunique()

NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
dtype: int64

In [6]:
# check counts of each unique value in the category
app_counts = df["APPLICATION_TYPE"].value_counts()
class_counts = df["CLASSIFICATION"].value_counts()
income_counts = df["INCOME_AMT"].value_counts()
name_counts = df["NAME"].value_counts()

## Application Types with less than 500 entries will be grouped
## Classification with less than 200 entries will be grouped
## Income will not be grouped at this time
## Names will be grouped

In [7]:
ord_dict = {
    '0':1,
    '1-9999':2,
    '10000-24999':3,
    '25000-99999':4,
    '100000-499999':5,
    '1M-5M':6,
    '5M-10M':7,
    '10M-50M':8,
    '50M+':9
}

# Testing a different way to encode income amt

df["INCOME_AMT_ORD"] = df["INCOME_AMT"].map(ord_dict)
df = df.drop("INCOME_AMT", axis=1)

In [8]:
# Determine which values to replace
replace_apps = list(app_counts[app_counts < 25].index)
replace_class = list(class_counts[class_counts < 25].index)
replace_names = list(name_counts[name_counts < 5].index)

# Replace in DataFrame

df["APPLICATION_TYPE"] = df['APPLICATION_TYPE'].map(lambda x: x)

for i in replace_apps:
    df["APPLICATION_TYPE"] = df["APPLICATION_TYPE"].replace(i,"Other")
for i in replace_class:
    df["CLASSIFICATION"] = df["CLASSIFICATION"].replace(i,"Other")
for i in replace_names:
    df["NAME"] = df["NAME"].replace(i,"Other")
    

In [9]:
df["IS_SUCCESSFUL"].value_counts()

1    18261
0    16038
Name: IS_SUCCESSFUL, dtype: int64

In [10]:
df.dtypes

NAME                      object
APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
INCOME_AMT_ORD             int64
dtype: object

In [11]:
# encode all categorical variables with OneHotEncoders
enc = OneHotEncoder(sparse=False)

# rerun the category definer
cat = df.dtypes[df.dtypes=="object"].index.tolist()

# fit and then produce the encoder
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,NAME_AACE INTERNATIONAL,NAME_ACE MENTOR PROGRAM OF AMERICA INC,NAME_ACTS MINISTRY,NAME_ACTS MISSIONS,NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS,NAME_AIR FORCE ASSOCIATION,NAME_ALABAMA FEDERATION OF WOMENS CLUBS,NAME_ALABAMA TREASURE FOREST ASSOCIATION,NAME_ALBANY STATE UNIVERSITY NATIONAL ALUMNI ASSOCIATION,NAME_ALPHA PHI OMEGA,...,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Association,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [12]:
# Merge the encoded columns then drop the original
df_merge = df.merge(encode_df,left_index=True,right_index=True)
df_merge = df_merge.drop(cat,1)
df_merge.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,INCOME_AMT_ORD,NAME_AACE INTERNATIONAL,NAME_ACE MENTOR PROGRAM OF AMERICA INC,NAME_ACTS MINISTRY,NAME_ACTS MISSIONS,NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS,NAME_AIR FORCE ASSOCIATION,...,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Association,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,108590,1,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1,5000,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1,6692,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1,142590,1,5,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [13]:
df_merge.columns

Index(['STATUS', 'ASK_AMT', 'IS_SUCCESSFUL', 'INCOME_AMT_ORD',
       'NAME_AACE INTERNATIONAL', 'NAME_ACE MENTOR PROGRAM OF AMERICA INC',
       'NAME_ACTS MINISTRY', 'NAME_ACTS MISSIONS',
       'NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS',
       'NAME_AIR FORCE ASSOCIATION',
       ...
       'USE_CASE_Heathcare', 'USE_CASE_Other', 'USE_CASE_Preservation',
       'USE_CASE_ProductDev', 'ORGANIZATION_Association',
       'ORGANIZATION_Co-operative', 'ORGANIZATION_Corporation',
       'ORGANIZATION_Trust', 'SPECIAL_CONSIDERATIONS_N',
       'SPECIAL_CONSIDERATIONS_Y'],
      dtype='object', length=457)

In [14]:
# split features and targets
y = df_merge["IS_SUCCESSFUL"]

X = df_merge.copy()
X = X.drop(["IS_SUCCESSFUL"],axis=1)

In [15]:
# split training and test
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24)

In [16]:
# scale all the encoded data for both train and test sets
scaler = StandardScaler()
# scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [17]:
# Start the configuration of the Deep Learning Model
X_train_scaled[0].shape
y_train.shape

(25724,)

In [18]:
# define the model
base = len(X_train_scaled[0])
num_input = base
n_branchA = base * 1.25
n_branchB = 2
n_layer1 = base
n_layer2 = base / 1.5
n_layer3 = base

# Start creating the model inputs, layers and outputs

inputs = Input(shape=X_train_scaled[0].shape)
branchA = Dense(n_branchA, activation="relu")(inputs)
branchB = Dense(n_branchB, activation="sigmoid")(inputs)

# hidden1 = Dense(n_layer1, activation="relu")(branchA)

# hidden2 = Dense(n_layer2, activation="relu")(branchB)

# combined = concatenate([hidden1,hidden2])

combined = concatenate([branchA,branchB])


hidden3 = Dense(n_layer2, activation="relu")(combined)

output = Dense(1, activation="sigmoid")(hidden3)

nn = Model(inputs=inputs,outputs=output)

nn.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 456)]        0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 570)          260490      input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 2)            914         input_1[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 572)          0           dense[0][0]                      
                                                                 dense_1[0][0]                

In [21]:
# test changing different hyperparameters

opt = SGD(lr=0.01, momentum=0.9, decay=0.01)
# opt = Adam()

# The patience parameter is the amount of epochs to check for improvement
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=15,
                                             min_delta=0.001)

rlrop = ReduceLROnPlateau(monitor='loss',factor=0.2,patience=5,
                         min_lr=0.001)

In [22]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

# Train the model
# fit_model = nn.fit(X_train_scaled, y_train, epochs=50)
fit_model = nn.fit(X_train_scaled, y_train, epochs=200, callbacks=[rlrop,early_stop])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Train on 25724 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
8575/8575 - 1s - loss: 0.4368 - accuracy: 0.7939
Loss: 0.436799371127832, Accuracy: 0.7939358353614807
