# Loading and Processing Data

In [196]:
! pip install keras_tuner



In [197]:
# Dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import tensorflow as tf
import keras_tuner as kt
from sklearn.pipeline import Pipeline
from keras.layers import Dense, Dropout, Flatten, Activation
import h5py

In [198]:
file_path = '/content/charity_data.csv'
data_df = pd.read_csv(file_path)

In [199]:
data_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [200]:
data_df.dtypes

EIN                        int64
NAME                      object
APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                object
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

In [201]:
data_df.nunique()

EIN                       34299
NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64

In [202]:
data_df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
34294    False
34295    False
34296    False
34297    False
34298    False
Length: 34299, dtype: bool

In [203]:
data_df["INCOME_AMT"] = data_df["INCOME_AMT"].str.split('-',n=1, expand=True)

In [204]:
data_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000,N,142590,1


In [205]:
def add(v_range):    
    if "M" in v_range:
        return 10**6
    return v_range

In [206]:
data_df["INCOME_AMT"] = data_df["INCOME_AMT"].apply(add)

In [207]:
data_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000,N,142590,1


In [208]:
data_df["INCOME_AMT"] = data_df["INCOME_AMT"].astype(str).astype(float)
data_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0.0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1.0,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0.0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000.0,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000.0,N,142590,1


In [209]:
data_df.dtypes

EIN                         int64
NAME                       object
APPLICATION_TYPE           object
AFFILIATION                object
CLASSIFICATION             object
USE_CASE                   object
ORGANIZATION               object
STATUS                      int64
INCOME_AMT                float64
SPECIAL_CONSIDERATIONS     object
ASK_AMT                     int64
IS_SUCCESSFUL               int64
dtype: object

#### Data Cleaning
The following is observed from the cells above and correct appropriately:
1. We change to change the Income_Amt column from string to integer and carry out a split to obtain the actual amount by defining a function.
2. We reduce the number of unique values by binning 'rare' categorical variables.

In [210]:
# Reduce categorical variables (APPLICATION_TYPE)
t_others = data_df['APPLICATION_TYPE'].value_counts().loc[lambda x: x<27]
application_to_replace = t_others.index.tolist()
for t in application_to_replace:
    data_df['APPLICATION_TYPE'] = data_df['APPLICATION_TYPE'].replace(t,"other")
# Confirm binning was successful
data_df['APPLICATION_TYPE'].value_counts(sort=True).reset_index()

Unnamed: 0,index,APPLICATION_TYPE
0,T3,27037
1,T4,1542
2,T6,1216
3,T5,1173
4,T19,1065
5,T8,737
6,T7,725
7,T10,528
8,T9,156
9,T13,66


In [211]:
data_df['CLASSIFICATION'].value_counts(sort=True).reset_index()

Unnamed: 0,index,CLASSIFICATION
0,C1000,17326
1,C2000,6074
2,C1200,4837
3,C3000,1918
4,C2100,1883
...,...,...
66,C1245,1
67,C4500,1
68,C4200,1
69,C1580,1


In [212]:
#Similarly
other_clas = data_df['CLASSIFICATION'].value_counts().loc[lambda x: x<800]
classifications_to_replace = other_clas.index.tolist()
# Replace in dataframe
for clas in classifications_to_replace:
    data_df['CLASSIFICATION'] = data_df['CLASSIFICATION'].replace(clas,"Combined_class")
# Check to make sure binning was successful
data_df['CLASSIFICATION'].value_counts(sort=True).reset_index()

Unnamed: 0,index,CLASSIFICATION
0,C1000,17326
1,C2000,6074
2,C1200,4837
3,Combined_class,2261
4,C3000,1918
5,C2100,1883


In [213]:
data_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0.0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1.0,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0.0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000.0,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000.0,N,142590,1


In [214]:
data_df.isnull().sum()

EIN                       0
NAME                      0
APPLICATION_TYPE          0
AFFILIATION               0
CLASSIFICATION            0
USE_CASE                  0
ORGANIZATION              0
STATUS                    0
INCOME_AMT                0
SPECIAL_CONSIDERATIONS    0
ASK_AMT                   0
IS_SUCCESSFUL             0
dtype: int64

In [215]:
# Drop irrelevant columns
data_df = data_df.drop(['EIN', 'NAME','APPLICATION_TYPE','CLASSIFICATION'], axis=1)
data_df.head()

Unnamed: 0,AFFILIATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,Independent,ProductDev,Association,1,0.0,N,5000,1
1,Independent,Preservation,Co-operative,1,1.0,N,108590,1
2,CompanySponsored,ProductDev,Association,1,0.0,N,5000,0
3,CompanySponsored,Preservation,Trust,1,10000.0,N,6692,1
4,Independent,Heathcare,Trust,1,100000.0,N,142590,1


In [216]:
# Get list of categorical variables
data_cat = data_df.dtypes[data_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
data_df[data_cat].nunique()

AFFILIATION               6
USE_CASE                  5
ORGANIZATION              4
SPECIAL_CONSIDERATIONS    2
dtype: int64

In [217]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

enc_cats = enc.fit_transform(data_df[data_cat])

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc_cats)

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(data_cat)
encode_df.head()

Unnamed: 0,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Association,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [218]:
# Merge one-hot encoded features and drop the originals
data_df = data_df.merge(encode_df, left_index=True, right_index=True)
data_df = data_df.drop(data_cat, 1)
data_df.head()

Unnamed: 0,STATUS,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL,AFFILIATION_CompanySponsored,AFFILIATION_Family/Parent,AFFILIATION_Independent,AFFILIATION_National,AFFILIATION_Other,AFFILIATION_Regional,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Association,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,0.0,5000,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,1.0,108590,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1,0.0,5000,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1,10000.0,6692,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1,100000.0,142590,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [219]:
# Split our preprocessed data into our features and target arrays
y = data_df["IS_SUCCESSFUL"].values
X = data_df.drop(["IS_SUCCESSFUL"],1).values

In [220]:
y.shape

(34299,)

In [221]:
X

array([[1.0000000e+00, 0.0000000e+00, 5.0000000e+03, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 1.0000000e+00, 1.0859000e+05, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 0.0000000e+00, 5.0000000e+03, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       ...,
       [1.0000000e+00, 0.0000000e+00, 5.0000000e+03, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 0.0000000e+00, 5.0000000e+03, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 1.0000000e+06, 3.6500179e+07, ..., 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00]])

In [222]:
# Use sklearn to split dataset
# Use sklearn to split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [223]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Create model function
1. Define the NN model
2. Add hidden layers using relu
3. Output layer using sigmoid function
4. Check model

In [224]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    units_param = hp.Int('first_units', min_value=1, max_value=10, step=2)
    nn_model.add(tf.keras.layers.Dense(units=units_param, activation=activation, input_shape=X.shape[1:]))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 4)):
      units_param = hp.Int('units_'+ str(i), min_value=1, max_value=10, step=2)
      nn_model.add(tf.keras.layers.Dense(units=units_param, activation=activation))

    # Add output layer
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [225]:
# Define tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=10,
    hyperband_iterations=2)

INFO:tensorflow:Reloading Oracle from existing project ./untitled_project/oracle.json
INFO:tensorflow:Reloading Tuner from ./untitled_project/tuner0.json


In [226]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled, y_train, epochs=5, validation_data=(X_test_scaled,y_test))

INFO:tensorflow:Oracle triggered exit


**Get best parameters and best model**

In [227]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'sigmoid',
 'first_units': 3,
 'num_layers': 1,
 'tuner/bracket': 2,
 'tuner/epochs': 7,
 'tuner/initial_epoch': 3,
 'tuner/round': 1,
 'tuner/trial_id': '03940930b112700c1691ecd7d215c55a',
 'units_0': 7,
 'units_1': 9,
 'units_2': 3,
 'units_3': 9}

In [228]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.6122 - accuracy: 0.6970 - 450ms/epoch - 2ms/step
Loss: 0.612191915512085, Accuracy: 0.697026252746582


In [238]:
finalmodel = best_model.evaluate(X_test_scaled,y_test,verbose=2)

268/268 - 0s - loss: 0.6122 - accuracy: 0.6970 - 254ms/epoch - 948us/step


In [239]:
best_model.save("my_model.h5")


From the optimization process where we remove more features, in this case APPLICATION_TYPE and CLASSIFICATION we get a Loss of 0.612 and an accuracy of 0.697.