## Preprocessing

In [None]:
# Import our dependencies (single import block; pandas is imported once)
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Location of the hosted charity_data.csv file.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"

# Import and read the charity_data.csv, then preview the first rows.
application_df = pd.read_csv(CHARITY_DATA_URL)
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [None]:
# Determine the number of unique values in each column.
# This is used to spot high-cardinality columns that need binning and
# near-constant columns that are candidates for removal.
application_df.nunique()

Unnamed: 0,0
EIN,34299
NAME,19568
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2


In [None]:
# Inspect how the STATUS values are distributed before deciding whether to keep the column.
application_df["STATUS"].value_counts()

Unnamed: 0_level_0,count
STATUS,Unnamed: 1_level_1
1,34294
0,5


In [None]:
# Inspect how the SPECIAL_CONSIDERATIONS values are distributed before deciding whether to keep the column.
application_df["SPECIAL_CONSIDERATIONS"].value_counts()

Unnamed: 0_level_0,count
SPECIAL_CONSIDERATIONS,Unnamed: 1_level_1
N,34272
Y,27


In [None]:
# Drop the non-beneficial ID column 'EIN' (leaving in 'NAME' this time).
# STATUS and SPECIAL_CONSIDERATIONS each have only two unique values, and one value
# accounts for almost every row, so they are dropped as well to try to improve the model.
# errors="ignore" keeps this cell safe to re-run after the columns have already been removed.
application_df = application_df.drop(
    columns=["EIN", "STATUS", "SPECIAL_CONSIDERATIONS"], errors="ignore"
)

In [None]:
# Preview the dataframe to confirm which columns remain after the drop.
application_df.head()

Unnamed: 0,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,0,5000,1
1,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1-9999,108590,1
2,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,100000-499999,142590,1


In [None]:
# Determine the (new) number of unique values in each remaining column.
# Columns with many unique values are the ones considered for binning below.
application_df.nunique()

Unnamed: 0,0
NAME,19568
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
INCOME_AMT,9
ASK_AMT,8747
IS_SUCCESSFUL,2


In [None]:
# Look at the NAME value counts for binning:
# `application_names` maps each organization name to the number of rows it appears in.
application_names = application_df["NAME"].value_counts()
application_names

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,408
...,...
OUR PERFECT STORM,1
FOUNTAIN THEATER,1
COMBAT VETERANS MOTORCYCLE ASSOCIATION 33-7,1
NEW MEXICO ASSOCIATION OF INDEPENDENT SCHOOLS INC,1


In [None]:
# Which names occur more than 5 times? These are the names that would be kept
# as their own category if 5 is used as the binning cutoff.
application_names[application_names>5]

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,408
...,...
QUAIL FEDERATION INC,6
DEPARTMENT OF OREGON LADIES AUXILLARY TO THE VFW OF THE US,6
EAST VIEW SPORTS COALITION,6
FULBRIGHT ASSOCIATION INC,6


In [None]:
# Choose a cutoff value and create a list of names to be replaced.
# Cutoff of 5: any name that occurs 5 times or fewer is binned into "Other".
names_to_replace = list(application_names[application_names <= 5].index)

# Replace in dataframe with a single vectorized pass.
# Calling Series.replace once per rare name rescans the whole column for each of
# the thousands of rare names; isin + mask does the same exact-match replacement once.
is_rare_name = application_df['NAME'].isin(names_to_replace)
application_df['NAME'] = application_df['NAME'].mask(is_rare_name, "Other")

# Check to make sure binning was successful
application_df['NAME'].value_counts()

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
Other,20043
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
...,...
SOCIETY FOR CREATIVE ANACHRONISM,6
CBMC INC,6
FAMILY CAREER AND COMMUNITY LEADERS OF AMERICA INC,6
NATIONAL CHARITY LEAGUE INC,6


In [None]:
# Look at APPLICATION_TYPE value counts for binning
application_counts = application_df["APPLICATION_TYPE"].value_counts()

# Choose a cutoff value and create a list of application types to be replaced.
# Cutoff of 500: any application type with fewer than 500 rows is binned into "Other".
application_types_to_replace = list(application_counts[application_counts < 500].index)

# Replace in dataframe with one vectorized exact-match pass instead of one
# Series.replace call per application type.
is_rare_type = application_df['APPLICATION_TYPE'].isin(application_types_to_replace)
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(is_rare_type, "Other")

# Check to make sure binning was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [None]:
# Look at CLASSIFICATION value counts for binning
classification_counts = application_df["CLASSIFICATION"].value_counts()

# Choose a cutoff value and create a list of classifications to be replaced.
# Cutoff of 1000: any classification with fewer than 1000 rows is binned into "Other".
classifications_to_replace = list(classification_counts[classification_counts < 1000].index)

# Replace in dataframe with one vectorized exact-match pass instead of one
# Series.replace call per classification.
is_rare_classification = application_df['CLASSIFICATION'].isin(classifications_to_replace)
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].mask(is_rare_classification, "Other")

# Check to make sure binning was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [None]:
# Convert categorical data to numeric with `pd.get_dummies`.
# Each categorical column is expanded into one indicator column per category
# (e.g. NAME_<value>, INCOME_AMT_<value>); ASK_AMT and IS_SUCCESSFUL are kept as-is.
application_with_dummies_df = pd.get_dummies(application_df)
application_with_dummies_df.head()

Unnamed: 0,ASK_AMT,IS_SUCCESSFUL,NAME_AACE INTERNATIONAL,NAME_ACE MENTOR PROGRAM OF AMERICA INC,NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS,NAME_AIR FORCE ASSOCIATION,NAME_ALABAMA FEDERATION OF WOMENS CLUBS,NAME_ALABAMA TREASURE FOREST ASSOCIATION,NAME_ALBANY STATE UNIVERSITY NATIONAL ALUMNI ASSOCIATION,NAME_ALPHA PHI OMEGA,...,ORGANIZATION_Trust,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,5000,1,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1,108590,1,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,5000,0,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,6692,1,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,False,False
4,142590,1,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


In [None]:
# Split our preprocessed data into our features and target arrays:
# IS_SUCCESSFUL is the target, every other column is a feature.
X = application_with_dummies_df.drop(["IS_SUCCESSFUL"], axis="columns").values
y = application_with_dummies_df["IS_SUCCESSFUL"].values

# Split the preprocessed data into a training and testing dataset.
# A fixed random_state makes the split (and therefore the reported metrics)
# repeatable across "Restart & Run All" executions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Build the StandardScaler and fit it on the training features only,
# so the scaling statistics come from the training split.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# To improve optimization, 10 nodes were added to layer 2 (30 -> 40) and a third
# hidden layer with 20 nodes and a sigmoid activation was added.
num_input = X_train.shape[1]  # number of feature columns
layer1Nodes = 80
layer2Nodes = 40
layer3Nodes = 20
nn = tf.keras.models.Sequential()

# Input: declare the feature width once, explicitly. Only the model input needs it;
# each later Dense layer infers its input size from the layer before it, so no
# input_dim is passed to the Dense layers.
nn.add(tf.keras.Input(shape=(num_input,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=layer1Nodes, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=layer2Nodes, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=layer3Nodes, activation='sigmoid'))

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model: binary cross-entropy loss with the Adam optimizer,
# tracking accuracy as the reported metric.
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model on the scaled training features for 100 epochs;
# `fit_model` holds the object returned by `fit`.
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7335 - loss: 0.5340
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7971 - loss: 0.4319
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7976 - loss: 0.4297
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7983 - loss: 0.4243
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7968 - loss: 0.4241
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7996 - loss: 0.4193
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8005 - loss: 0.4222
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8036 - loss: 0.4149
Epoch 9/100
[1m804/804[0m [32

In [None]:
# Evaluate the model using the scaled test data.
# The result is unpacked as (loss, accuracy), matching the loss and metric set at compile time.
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 2ms/step - accuracy: 0.7844 - loss: 0.4524
Loss: 0.4523961842060089, Accuracy: 0.7843731641769409


In [None]:
# Export our model to an HDF5 file (written to the current working directory).
nn.save("AlphabetSoupCharityModel2(Optimized).h5")



# Step 4: Report

1.	Overview of the analysis: Explain the purpose of this analysis.
* The purpose of this analysis was to take a large dataset with several variables and use them to develop a model to predict whether a charity candidate would find success in their initiatives, to help the client (a nonprofit) decide which candidates to fund. This analysis uses data manipulation and neural networks in different configurations to create a model with >75% accuracy. The optimization process also provides insight into which variables and neural network layer activation types might contribute to the accuracy of the final model.
----

2.	Results: Using bulleted lists and images to support your answers, address the following questions:

o	Data Preprocessing

*   What variable(s) are the target(s) for your model?
  - The IS_SUCCESSFUL variable is the target for both models.
* What variable(s) are the features for your model?
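  - Across the two models, the feature variables are NAME, APPLICATION_TYPE, AFFILIATION, CLASSIFICATION, USE_CASE, ORGANIZATION, STATUS, INCOME_AMT, SPECIAL_CONSIDERATIONS, and ASK_AMT. Model 1 dropped NAME; Model 2 kept NAME but dropped STATUS and SPECIAL_CONSIDERATIONS. EIN is an identifier and was not used as a feature in either model.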
* 	What variable(s) should be removed from the input data because they are neither targets nor features? What did we remove from step 1 and step 2?
  - For Model 1, I removed the EIN and NAME variables. In the optimization process of Model 2, I left NAME as a variable, but removed the STATUS and SPECIAL_CONSIDERATIONS variables (in addition to the EIN variable), since they each only have two unique values, one of which comprises the vast majority of the dataset.


o	Compiling, Training, and Evaluating the Model
*	How many neurons, layers, and activation functions did you select for your neural network model, and why?
  - Model 1 (the original model) had two hidden layers and one output layer. There were 80 nodes on the first hidden layer and 30 nodes on the second hidden layer, and both layers used a relu activation. The output layer had one node and a ‘sigmoid’ activation.

* Were you able to achieve the target model performance? What did you do differently in the second file?
  - The first model achieved 72.8% accuracy, which was less than the target accuracy of 75%. In the second model, I dropped different variables and added neurons and layers to increase the accuracy. Model 2 achieved 78.4% accuracy, above the target accuracy of 75%.

* What steps did you take in your attempts to increase model performance?
  - I started by leaving the NAME variable in the dataset, since the name's number of occurrences could be an indicator of whether a campaign could be successful. In addition to the EIN variable, I removed the STATUS and SPECIAL_CONSIDERATIONS variables. For the neural network model, I added another hidden sigmoid layer with 20 nodes, and increased the number of nodes on the second hidden layer from 30 to 40. Thus, the NN structure for Model 2 was HiddenLayer1-80Nodes-Relu, HiddenLayer2-40Nodes-Relu, HiddenLayer3-20Nodes-Sigmoid, and OutputLayer-1Node-Sigmoid.
----


3.	Summary: Summarize the overall results of the deep learning model. Include a recommendation for how a different model could solve this classification problem, and then explain your recommendation.
  * Overall, the accuracy of the deep learning model improved from 72.8% to 78.4% when the NAME variable was kept (with rare names binned into "Other"), the low-information STATUS and SPECIAL_CONSIDERATIONS variables were removed, and more layers/nodes were added. You could also use a random forest classifier model to solve this classification problem, since that type of model is suited to large datasets of noisy data.

