## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 


In [24]:
# Load the charity dataset straight from its hosted CSV and preview the first rows.
charity_data_url = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
name_df = pd.read_csv(charity_data_url)
name_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [25]:
# Drop the identifier column 'EIN' together with 'SPECIAL_CONSIDERATIONS'
# and 'STATUS', which are excluded from the feature set.
# The columns are passed by keyword: supplying `axis` positionally
# (`drop([...], 1)`) emits a FutureWarning on older pandas and raises a
# TypeError on pandas 2.x.
name_df = name_df.drop(columns=['EIN', 'SPECIAL_CONSIDERATIONS', 'STATUS'])

  name_df = name_df.drop(['EIN','SPECIAL_CONSIDERATIONS','STATUS'],1)


In [26]:
# Count the distinct values in each remaining column; columns with many
# distinct values (e.g. NAME, CLASSIFICATION) are binned in later cells.
name_df.nunique()

NAME                19568
APPLICATION_TYPE       17
AFFILIATION             6
CLASSIFICATION         71
USE_CASE                5
ORGANIZATION            4
INCOME_AMT              9
ASK_AMT              8747
IS_SUCCESSFUL           2
dtype: int64

In [27]:
# Count how many rows each organisation NAME accounts for; these counts
# are used below to decide which names get binned together.
name_counts = name_df['NAME'].value_counts()
name_counts

PARENT BOOSTER USA INC                                                  1260
TOPS CLUB INC                                                            765
UNITED STATES BOWLING CONGRESS INC                                       700
WASHINGTON STATE UNIVERSITY                                              492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC                          408
                                                                        ... 
ST LOUIS SLAM WOMENS FOOTBALL                                              1
AIESEC ALUMNI IBEROAMERICA CORP                                            1
WEALLBLEEDRED ORG INC                                                      1
AMERICAN SOCIETY FOR STANDARDS IN MEDIUMSHIP & PSYCHICAL INVESTIGATI       1
WATERHOUSE CHARITABLE TR                                                   1
Name: NAME, Length: 19568, dtype: int64

In [28]:
# Choose a cutoff value and create a list of names to be replaced.
# Any NAME that occurs fewer than `name_cutoff` times is pooled into "Other".
name_cutoff = 10
name_types_to_replace = list(name_counts[name_counts < name_cutoff].index)

# Replace in dataframe with a single vectorized pass: build one boolean
# mask of the rare names instead of calling `replace` once per name,
# which rescanned the whole column for every entry in the list.
is_rare_name = name_df['NAME'].isin(name_types_to_replace)
name_df['NAME'] = name_df['NAME'].mask(is_rare_name, "Other")

# Check to make sure binning was successful
name_df['NAME'].value_counts()

Other                                      21022
PARENT BOOSTER USA INC                      1260
TOPS CLUB INC                                765
UNITED STATES BOWLING CONGRESS INC           700
WASHINGTON STATE UNIVERSITY                  492
                                           ...  
CASCADE 4-H FOUNDATION                        10
FREE & ACCEPTED MASONS OF WASHINGTON          10
NEW MEXICO GARDEN CLUBS INC                   10
NATIONAL ASSOCIATION OF HISPANIC NURSES       10
UNION OF CALIFORNIA STATE WORKERS             10
Name: NAME, Length: 223, dtype: int64

In [29]:
# Count how many rows fall under each CLASSIFICATION code; these counts
# are used below to pick a binning cutoff.
classification_counts = name_df['CLASSIFICATION'].value_counts()
classification_counts

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64

In [30]:
# Show only the CLASSIFICATION codes that occur more than 1000 times,
# as a guide for choosing the binning cutoff.
classification_counts.loc[classification_counts.gt(1000)]

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [31]:
# Choose a cutoff value and create a list of classifications to be replaced.
# Any CLASSIFICATION that occurs fewer than `classification_cutoff` times
# is pooled into "Other".
classification_cutoff = 4000
classifications_to_replace = list(
    classification_counts[classification_counts < classification_cutoff].index
)

# Replace in dataframe with a single vectorized pass: one boolean mask of
# the rare codes instead of one full-column `replace` call per code.
is_rare_classification = name_df['CLASSIFICATION'].isin(classifications_to_replace)
name_df['CLASSIFICATION'] = name_df['CLASSIFICATION'].mask(is_rare_classification, "Other")

# Check to make sure binning was successful
name_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
Other     6062
C1200     4837
Name: CLASSIFICATION, dtype: int64

In [32]:
# Convert categorical data to numeric with `pd.get_dummies`; the generated
# indicator columns are created with a float dtype (dtype=float).
df_application_dummies = pd.get_dummies(name_df,dtype=float)

In [33]:
# Split our preprocessed data into our features and target arrays.
# The target is the IS_SUCCESSFUL column; the features are every other column.
y = df_application_dummies['IS_SUCCESSFUL'].values
# `columns=` is used instead of a positional axis argument (`drop([...], 1)`),
# which emits a FutureWarning on older pandas and raises a TypeError on pandas 2.x.
X = df_application_dummies.drop(columns=['IS_SUCCESSFUL']).values

# Split the preprocessed data into a training and testing dataset.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

  X = df_application_dummies.drop(['IS_SUCCESSFUL'],1).values


In [34]:
# Build the scaler and fit it on the training features only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training split and then the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [35]:
# Define the model - a deep neural net with three hidden layers and a
# single-unit sigmoid output.
# The input width is the number of columns in the scaled training matrix.
number_input_features = X_train_scaled.shape[1]

# Hidden layer widths.
hidden_nodes_layer1 = 66
hidden_nodes_layer2 = 9
hidden_nodes_layer3 = 3
# Width for an optional fourth hidden layer; that layer is currently not
# added to the model, so this value is unused below.
hidden_nodes_layer4 = 9

nn = tf.keras.models.Sequential([
    # First hidden layer (tanh); also declares the input dimension.
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="tanh",
    ),
    # Second hidden layer (relu).
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    # Third hidden layer (relu).
    tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"),
    # Output layer: one sigmoid unit.
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 66)                17820     
                                                                 
 dense_7 (Dense)             (None, 9)                 603       
                                                                 
 dense_8 (Dense)             (None, 3)                 30        
                                                                 
 dense_9 (Dense)             (None, 1)                 4         
                                                                 
Total params: 18,457
Trainable params: 18,457
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Compile the model: binary cross-entropy loss, Adam optimizer, and
# accuracy reported as the tracked metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [37]:
# Train the model on the scaled training split for 103 epochs and keep
# the returned training history in `fit_model`.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=103,
)


Epoch 1/103
Epoch 2/103
Epoch 3/103
Epoch 4/103
Epoch 5/103
Epoch 6/103
Epoch 7/103
Epoch 8/103
Epoch 9/103
Epoch 10/103
Epoch 11/103
Epoch 12/103
Epoch 13/103
Epoch 14/103
Epoch 15/103
Epoch 16/103
Epoch 17/103
Epoch 18/103
Epoch 19/103
Epoch 20/103
Epoch 21/103
Epoch 22/103
Epoch 23/103
Epoch 24/103
Epoch 25/103
Epoch 26/103
Epoch 27/103
Epoch 28/103
Epoch 29/103
Epoch 30/103
Epoch 31/103
Epoch 32/103
Epoch 33/103
Epoch 34/103
Epoch 35/103
Epoch 36/103
Epoch 37/103
Epoch 38/103
Epoch 39/103
Epoch 40/103
Epoch 41/103
Epoch 42/103
Epoch 43/103
Epoch 44/103
Epoch 45/103
Epoch 46/103
Epoch 47/103
Epoch 48/103
Epoch 49/103
Epoch 50/103
Epoch 51/103
Epoch 52/103
Epoch 53/103
Epoch 54/103
Epoch 55/103
Epoch 56/103
Epoch 57/103
Epoch 58/103
Epoch 59/103
Epoch 60/103
Epoch 61/103
Epoch 62/103
Epoch 63/103
Epoch 64/103
Epoch 65/103
Epoch 66/103
Epoch 67/103
Epoch 68/103
Epoch 69/103
Epoch 70/103
Epoch 71/103
Epoch 72/103
Epoch 73/103
Epoch 74/103
Epoch 75/103
Epoch 76/103
Epoch 77/103
Epoch 78

In [38]:
# Score the trained model on the scaled test split and report the
# resulting loss and accuracy.
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.4613 - accuracy: 0.7871 - 474ms/epoch - 2ms/step
Loss: 0.46127966046333313, Accuracy: 0.7870553731918335


In [40]:
# Export the trained model to an HDF5 file; the path is relative, so the
# file is written wherever the notebook's working directory is.
nn.save('AlphabetSoupCharity_Optimization.h5')