## Preprocessing

In [17]:
# STEP 1
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
# STEP 2
#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [18]:
# STEP 3
# # Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df.drop(columns=['EIN', 'NAME'], inplace=True)

In [19]:
# STEP 4
# # Determine the number of unique values in each column.
unique_values = application_df.nunique()

In [20]:
# STEP 5
# # Display the number of unique values for each column
print("\nNumber of Unique Values per Column:")
print(unique_values)



Number of Unique Values per Column:
APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [21]:
# STEP 6
# # Look at APPLICATION_TYPE value counts to identify and replace with "Other"
# Use the `value_counts()` method to count the occurrences of each unique value in the 'APPLICATION_TYPE' column
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# STEP 7
# # Display the value counts for 'APPLICATION_TYPE'
print("\nValue Counts for 'APPLICATION_TYPE':")
print(application_type_counts)


Value Counts for 'APPLICATION_TYPE':
APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64


In [22]:
# STEP 8
# # Choose a cutoff value of 500 and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
# Explanation:
# - Define a threshold (cutoff value) to determine which 'APPLICATION_TYPE' categories are considered rare.
# - Categories with counts below this threshold will be replaced with 'Other'.
# - This helps in reducing the number of categories, simplifying the dataset, and improving model performance.

# Define the cutoff value
threshold = 500

# Create a list of application types that have fewer than 'threshold' occurrences
# - application_type_counts < threshold: Creates a boolean Series where True indicates counts below the threshold
# - [application_type_counts < threshold]: Filters the Series to include only those rows where the condition is True
# - .index.tolist(): Extracts the index (application type names) of the filtered Series and converts it to a list
application_types_to_replace = application_type_counts[application_type_counts < threshold].index.tolist()

#

# Display the list of application types to be replaced with 'Other'
print("\nApplication Types to be Replaced with 'Other':")
print(application_types_to_replace)


Application Types to be Replaced with 'Other':
['T9', 'T13', 'T12', 'T2', 'T25', 'T14', 'T29', 'T15', 'T17']


In [23]:
# STEP 9
# # Explanation:
# - Replace all occurrences of rare application types (identified in 'application_types_to_replace') with the label 'Other'.
# - This consolidates infrequent categories into a single category, reducing noise and simplifying the dataset.

# Replace the rare application types with 'Other' in the DataFrame
# - 'APPLICATION_TYPE' is the column we're modifying
# - 'application_types_to_replace' is the list of values to be replaced
# - 'Other' is the new value that replaces the rare categories

In [24]:
# STEP 10
# # Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Check to make sure replacement was successful
# Verify the changes by displaying the updated 'APPLICATION_TYPE' value counts
print("\nUpdated 'APPLICATION_TYPE' Value Counts After Replacement:")
print(application_df['APPLICATION_TYPE'].value_counts())


Updated 'APPLICATION_TYPE' Value Counts After Replacement:
APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64


In [25]:
# Step 11: Examine the 'CLASSIFICATION' Value Counts

# Use the `value_counts()` method to count the occurrences of each unique value in the 'CLASSIFICATION' column
classification_counts = application_df['CLASSIFICATION'].value_counts()
# Display the value counts for 'CLASSIFICATION'
print("\nValue Counts for 'CLASSIFICATION':")
print(classification_counts)


Value Counts for 'CLASSIFICATION':
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: count, Length: 71, dtype: int64


In [26]:
#step 12
#  You may find it helpful to look at CLASSIFICATION value counts >1
#  Identifying classifications that have more than 1 occurrence
# Filters the classifications to include only those with more than one occurrence, which can help in identifying meaningful classes for the model and handling rare classes appropriately.
classification_counts_gt1 = classification_counts[classification_counts > 1]

# Displaying the classifications with counts greater than 1
print("\n'CLASSIFICATION' Categories with More Than 1 Occurrence:")
print(classification_counts_gt1)


'CLASSIFICATION' Categories with More Than 1 Occurrence:
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: count, dtype: int64


In [33]:
# Step 13:
#  Choose a cutoff value of 100 and create a list of classifications to be replaced
# We'll use this list to replace less frequent classifications with a common category, such as "Other".
# This helps in reducing the complexity of the model by minimizing the number of unique classes.

# Define the cutoff value
cutoff = 1000  # Any classification with fewer than 100 occurrences will be replaced


classifications_to_replace = classification_counts_gt1[classification_counts_gt1 < cutoff].index.tolist()

# Display the list of classifications that will be replaced with 'Other'
print("\nClassifications to be Replaced with 'Other':")
print(classifications_to_replace)



Classifications to be Replaced with 'Other':
['C7000', 'C1700', 'C4000', 'C5000', 'C1270', 'C2700', 'C2800', 'C7100', 'C1300', 'C1280', 'C1230', 'C1400', 'C7200', 'C2300', 'C1240', 'C8000', 'C7120', 'C1500', 'C1800', 'C6000', 'C1250', 'C8200', 'C1238', 'C1278', 'C1235', 'C1237', 'C7210', 'C2400', 'C1720', 'C4100', 'C1257', 'C1600', 'C1260', 'C2710', 'C0', 'C3200', 'C1234', 'C1246', 'C1267', 'C1256']


In [34]:
#Step 14
# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64

In [35]:
# Convert categorical data to numeric with `pd.get_dummies`
# Step 15: Convert Categorical Data to Numeric Using One-Hot Encoding

# Use pd.get_dummies() to convert categorical columns into numeric columns
# This function automatically handles categorical columns by converting them to a binary (0 or 1) format
# The 'drop_first=True' argument is used to avoid the dummy variable trap, but here include all columns

application_df_encoded = pd.get_dummies(application_df)

# Display the first few rows of the newly encoded dataframe
# This will show how the categorical columns have been transformed into numerical ones
print("\nEncoded DataFrame (First 5 Rows):")
print(application_df_encoded.head())



Encoded DataFrame (First 5 Rows):
   STATUS  ASK_AMT  IS_SUCCESSFUL  APPLICATION_TYPE_Other  \
0       1     5000              1                   False   
1       1   108590              1                   False   
2       1     5000              0                   False   
3       1     6692              1                   False   
4       1   142590              1                   False   

   APPLICATION_TYPE_T10  APPLICATION_TYPE_T19  APPLICATION_TYPE_T3  \
0                  True                 False                False   
1                 False                 False                 True   
2                 False                 False                False   
3                 False                 False                 True   
4                 False                 False                 True   

   APPLICATION_TYPE_T4  APPLICATION_TYPE_T5  APPLICATION_TYPE_T6  ...  \
0                False                False                False  ...   
1                False         

In [36]:
# Step 16: Split the Data into Features (X) and Target (y)

# The target variable is assumed. 'IS_SUCCESSFUL' is the target column.
#  
y = application_df_encoded['IS_SUCCESSFUL'].values  # Target variable

# The features will include all other columns except for the target
X = application_df_encoded.drop(columns=['IS_SUCCESSFUL']).values  # Feature variables

# Split the Data into Training and Testing Sets

# Use train_test_split to split the data into training and testing sets
# test_size=0.2 means 20% of the data will be used for testing, and 80% for training
# random_state ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets to verify the split
print("Training Features Shape:", X_train.shape)
print("Testing Features Shape:", X_test.shape)
print("Training Target Shape:", y_train.shape)
print("Testing Target Shape:", y_test.shape)


Training Features Shape: (27439, 43)
Testing Features Shape: (6860, 43)
Training Target Shape: (27439,)
Testing Target Shape: (6860,)


In [37]:
#Step 17:   Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [38]:
# Step 1 
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
#  initialize the mode

nn = tf.keras.models.Sequential()

# First hidden layer
# The number of input features is determined by the shape of X_train_scaled (the number of features)
# Adjust the number of neurons as necessary; here, we'll use 30 neurons and ReLU activation
nn.add(tf.keras.layers.Dense(units=30, activation='relu', input_dim=X_train_scaled.shape[1]))

# Second hidden layer
# Adding another hidden layer with 15 neurons
nn.add(tf.keras.layers.Dense(units=15, activation='relu'))

# Output layer
# The output layer has 1 neuron since we are predicting a binary outcome (success/failure)
# Using 'sigmoid' activation function because this is a binary classification problem
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [40]:
#Step 2 
# Compile the model
nn.compile(loss='binary_crossentropy', 
            optimizer='adam', 
            metrics=['accuracy'])

In [41]:
# Step 3: Train the Model

# Fit the model to the training data
# Specify the number of epochs and batch size as needed
# epochs: number of complete passes through the training dataset
# batch_size: number of samples processed before the model is updated

history = nn.fit(X_train_scaled, 
                 y_train, 
                 epochs=100,          # Adjust the number of epochs as necessary
                 batch_size=32,       # Adjust the batch size as necessary
                 validation_data=(X_test_scaled, y_test),  # Use validation data to monitor performance
                 verbose=1)          # Verbose set to 1 to display training progress


Epoch 1/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.6744 - loss: 0.6166 - val_accuracy: 0.7241 - val_loss: 0.5658
Epoch 2/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 752us/step - accuracy: 0.7306 - loss: 0.5530 - val_accuracy: 0.7262 - val_loss: 0.5619
Epoch 3/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 788us/step - accuracy: 0.7295 - loss: 0.5549 - val_accuracy: 0.7243 - val_loss: 0.5631
Epoch 4/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 778us/step - accuracy: 0.7289 - loss: 0.5564 - val_accuracy: 0.7280 - val_loss: 0.5608
Epoch 5/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 784us/step - accuracy: 0.7313 - loss: 0.5494 - val_accuracy: 0.7274 - val_loss: 0.5617
Epoch 6/100
[1m858/858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 758us/step - accuracy: 0.7318 - loss: 0.5481 - val_accuracy: 0.7259 - val_loss: 0.5595
Epoch 7/100


In [42]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 0s - 529us/step - accuracy: 0.7276 - loss: 0.5629
Loss: 0.5628734827041626, Accuracy: 0.7275510430335999


In [38]:
# Export our model to HDF5 file
#  YOUR CODE GOES HERE