## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense, LeakyReLU

# Read charity_data.csv from the provided cloud URL into a DataFrame.
# pandas is already imported as `pd` in the import block at the top of this
# cell, so it is not imported a second time here.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)

# Preview the first rows to confirm the load worked.
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Remove the identifier columns 'EIN' and 'NAME', which are treated as
# non-beneficial for the model.
application_df = application_df.drop(['EIN', 'NAME'], axis=1)

In [3]:
# Count the distinct values present in every column.
unique_value_counts = application_df.nunique()
unique_value_counts

Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [4]:
# Tally how often each APPLICATION_TYPE occurs so that rare types can be
# identified and later replaced with "Other".
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()
application_type_counts

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [5]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
#
# The list is derived from the data instead of being typed by hand: every
# application type with fewer than APPLICATION_TYPE_CUTOFF rows is grouped.
# With a cutoff of 500 the eight most frequent types (T3, T4, T6, T5, T19,
# T8, T7, T10) are kept and all remaining types are pooled into "Other".
APPLICATION_TYPE_CUTOFF = 500
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()
application_types_to_replace = [
    app_type
    for app_type, count in application_type_counts.items()
    # "Other" is skipped so that re-running this cell never lists the bucket itself.
    if count < APPLICATION_TYPE_CUTOFF and app_type != "Other"
]

# Replace in dataframe (one vectorised pass instead of one pass per value)
is_rare_application_type = application_df['APPLICATION_TYPE'].isin(application_types_to_replace)
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(is_rare_application_type, "Other")

# Check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [6]:
# Tally how often each CLASSIFICATION occurs so that rare classes can be
# identified and later replaced with "Other".
classification_counts = application_df['CLASSIFICATION'].value_counts()
classification_counts

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [7]:
# Restrict the CLASSIFICATION tally to classes that occur more than once.
classification_counts = application_df['CLASSIFICATION'].value_counts()
classification_counts[classification_counts > 1]

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
C7000,777
C1700,287
C4000,194
C5000,116
C1270,114


In [8]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
#
# The list is derived from the data instead of being typed by hand (a
# hand-maintained list is easy to get wrong, e.g. by repeating an entry).
# With a cutoff of 1000 the five most frequent classes (C1000, C2000, C1200,
# C3000, C2100) are kept and every other class is pooled into "Other".
CLASSIFICATION_CUTOFF = 1000
classification_counts = application_df['CLASSIFICATION'].value_counts()
classifications_to_replace = [
    classification
    for classification, count in classification_counts.items()
    # "Other" is skipped so that re-running this cell never lists the bucket itself.
    if count < CLASSIFICATION_CUTOFF and classification != "Other"
]

# Replace in dataframe (one vectorised pass instead of one pass per value)
is_rare_classification = application_df['CLASSIFICATION'].isin(classifications_to_replace)
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].mask(is_rare_classification, "Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [9]:
from sklearn.preprocessing import LabelEncoder

# Columns whose category labels are turned into integer codes.
categorical_cols = [
    'APPLICATION_TYPE',
    'AFFILIATION',
    'CLASSIFICATION',
    'USE_CASE',
    'ORGANIZATION',
    'SPECIAL_CONSIDERATIONS',
]

# A fresh LabelEncoder is fitted for every column, and the column is
# overwritten with the resulting integer codes.
# NOTE(review): integer codes put an ordering on these categories that the
# later scaler and dense layers will treat as numeric magnitude; confirm this
# is intended rather than one-hot encoding.
application_df = application_df.assign(
    **{
        col: LabelEncoder().fit_transform(application_df[col])
        for col in categorical_cols
    }
)

In [10]:
# Number of quantile-based bins requested for ASK_AMT
# (10 would give deciles; 50 gives steps of two percentile points).
num_bins = 50

# num_bins + 1 evenly spaced percentile ranks running from 0 to 100 inclusive.
percentiles = np.linspace(0, 100, num_bins + 1)

# Convert the ranks to fractions (0.0-1.0) and look up the ASK_AMT value at each.
ask_amt = application_df['ASK_AMT']
quantiles = ask_amt.quantile(percentiles / 100).to_list()

# The quantile values are used directly as the bin edges.
bins = quantiles

# Create a new column 'ASK_AMT_BIN' using pd.cut:
#   labels=False        -> store the integer bin index instead of an interval
#   include_lowest=True -> the minimum value falls inside the first bin
#   duplicates='drop'   -> repeated edges (identical quantiles) are merged
application_df['ASK_AMT_BIN'] = pd.cut(
    ask_amt,
    bins=bins,
    labels=False,
    include_lowest=True,
    duplicates='drop',
)

application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,ASK_AMT_BIN
0,1,2,0,4,0,1,0,0,5000,1,0
1,3,2,2,3,1,1,1-9999,0,108590,1,6
2,5,0,4,4,0,1,0,0,5000,0,0
3,3,0,2,3,3,1,10000-24999,0,6692,1,0
4,3,2,0,1,3,1,100000-499999,0,142590,1,7


In [11]:
# One-hot encode the remaining categorical data with `pd.get_dummies`,
# writing the indicator columns as floats (0.0 / 1.0).
application_df = pd.get_dummies(data=application_df, dtype=float)

# Preview the encoded frame.
application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,ASK_AMT_BIN,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,1,2,0,4,0,1,0,5000,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,2,2,3,1,1,0,108590,1,6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0,4,4,0,1,0,5000,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,2,3,3,1,0,6692,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,2,0,1,3,1,0,142590,1,7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Show the column labels of the frame after one-hot encoding.
application_df.columns

Index(['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE',
       'ORGANIZATION', 'STATUS', 'SPECIAL_CONSIDERATIONS', 'ASK_AMT',
       'IS_SUCCESSFUL', 'ASK_AMT_BIN', 'INCOME_AMT_0', 'INCOME_AMT_1-9999',
       'INCOME_AMT_10000-24999', 'INCOME_AMT_100000-499999',
       'INCOME_AMT_10M-50M', 'INCOME_AMT_1M-5M', 'INCOME_AMT_25000-99999',
       'INCOME_AMT_50M+', 'INCOME_AMT_5M-10M'],
      dtype='object')

In [13]:
# Representative dollar value for each one-hot income-range column.
# Ranges with both ends use an approximate midpoint; the open-ended
# "50M+" range uses its lower bound.
income_midpoints = {
    "INCOME_AMT_0": 0,
    "INCOME_AMT_1-9999": 5000,
    "INCOME_AMT_10000-24999": 17500,
    "INCOME_AMT_25000-99999": 62500,
    "INCOME_AMT_100000-499999": 300000,
    "INCOME_AMT_1M-5M": 3000000,
    "INCOME_AMT_5M-10M": 7500000,
    "INCOME_AMT_10M-50M": 30000000,
    "INCOME_AMT_50M+": 50000000
}
income_columns = list(income_midpoints)

# Weight every indicator column by its representative value; the Series index
# is aligned with the column labels.
midpoint_by_column = pd.Series(income_midpoints)
numeric_income = application_df[income_columns] * midpoint_by_column

# Collapse the weighted indicators into one numeric value per row.
application_df["INCOME_AMT_NUMERIC"] = numeric_income.sum(axis=1)

# The indicator columns are no longer needed once the numeric column exists.
application_df = application_df.drop(columns=income_columns)

application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,ASK_AMT_BIN,INCOME_AMT_NUMERIC
0,1,2,0,4,0,1,0,5000,1,0,0.0
1,3,2,2,3,1,1,0,108590,1,6,5000.0
2,5,0,4,4,0,1,0,5000,0,0,0.0
3,3,0,2,3,3,1,0,6692,1,0,17500.0
4,3,2,0,1,3,1,0,142590,1,7,300000.0


In [14]:
# Show the column labels now that the income indicators have been collapsed
# into INCOME_AMT_NUMERIC.
application_df.columns

Index(['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE',
       'ORGANIZATION', 'STATUS', 'SPECIAL_CONSIDERATIONS', 'ASK_AMT',
       'IS_SUCCESSFUL', 'ASK_AMT_BIN', 'INCOME_AMT_NUMERIC'],
      dtype='object')

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Rescale ASK_AMT with a min-max scaler and store the result in a new
# ASK_AMT_SCALED column; the original ASK_AMT column is left untouched.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made later in the notebook, and a StandardScaler is applied to every feature
# afterwards -- confirm this extra scaling step is still wanted.
ask_amt_frame = application_df[['ASK_AMT']]
scaler = MinMaxScaler()
scaler.fit(ask_amt_frame)
application_df['ASK_AMT_SCALED'] = scaler.transform(ask_amt_frame)

In [16]:
# Show the column labels now that ASK_AMT_SCALED has been added.
application_df.columns

Index(['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'USE_CASE',
       'ORGANIZATION', 'STATUS', 'SPECIAL_CONSIDERATIONS', 'ASK_AMT',
       'IS_SUCCESSFUL', 'ASK_AMT_BIN', 'INCOME_AMT_NUMERIC', 'ASK_AMT_SCALED'],
      dtype='object')

In [17]:
# Target array: the IS_SUCCESSFUL column.
y = application_df['IS_SUCCESSFUL'].to_numpy()

# Feature array: every remaining column except the target, the binned ask
# amount and the raw ask amount.
feature_frame = application_df.drop(columns=['IS_SUCCESSFUL', 'ASK_AMT_BIN', 'ASK_AMT'])
X = feature_frame.to_numpy()

# Hold out a test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [18]:
# Create a StandardScaler instance.
scaler = StandardScaler()

# Fit on the training features only and scale them in the same call.
X_train_scaled = scaler.fit_transform(X_train)

# Keep the fitted scaler under the name `X_scaler` as well.
X_scaler = scaler

# Scale the test features with the statistics learned from the training set.
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [19]:
# Fix the Python, NumPy and TensorFlow seeds so weight initialisation and
# training give the same result on every Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 70
hidden_nodes_layer2 = 50
hidden_nodes_layer3 = 30

nn = tf.keras.models.Sequential()

# Explicit input layer. Passing `input_dim` to the first Dense layer makes
# Keras emit a warning for Sequential models; an Input object avoids it.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer: linear Dense followed by LeakyReLU as a separate layer
nn.add(Dense(units=hidden_nodes_layer1))
nn.add(LeakyReLU())

# Second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer: one sigmoid unit for the binary IS_SUCCESSFUL target
nn.add(Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
# Configure training: Adam optimizer, binary cross-entropy loss,
# and accuracy reported as the metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


In [21]:
# Fit the network on the scaled training features for 100 epochs and keep the
# returned training history in `fit_model`.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6948 - loss: 0.6077
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7203 - loss: 0.5779
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7292 - loss: 0.5637
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7290 - loss: 0.5620
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7263 - loss: 0.5642
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7308 - loss: 0.5562
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7357 - loss: 0.5550
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7303 - loss: 0.5554
Epoch 9/100
[1m804/804[0m [32

In [22]:
# Score the trained model on the held-out, scaled test data.
# With metrics=["accuracy"] at compile time, evaluate() returns [loss, accuracy].
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 2ms/step - accuracy: 0.7325 - loss: 0.5558
Loss: 0.5557504296302795, Accuracy: 0.732478141784668


In [23]:
# Write the trained model to disk as an HDF5 (.h5) file in the working directory.
nn.save(filepath="AlphabetSoupCharity_Optimization.h5")

