## Preprocessing

In [46]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import pandas and read the charity_data.csv from the provided cloud URL.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [47]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
# Drop the 'EIN' and 'NAME' columns
application_df.drop(columns=['EIN', 'NAME'], inplace=True)

In [48]:
# Determine the number of unique values in each column.
# Determine the number of unique values in each column
unique_values = application_df.nunique()
print(unique_values)

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64


In [49]:
# Determine the value counts for the 'APPLICATION_TYPE' column
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()

# Print the value counts for each application type before any changes
print("Value Counts Before Replacement:")
print(application_type_counts)


Value Counts Before Replacement:
APPLICATION_TYPE
T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: count, dtype: int64


In [50]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`
#  YOUR CODE GOES HERE


# Step 3: Define your manual "keep" list, the ones you want to keep
keep_types = ['T3', 'T4', 'T6', 'T5', 'T19', 'T8', 'T7', 'T10']  # Keep these types

# Step 4: Set the cutoff for replacement (e.g., below 1000 occurrences)
cutoff_value = 1000

# Step 5: Identify the application types to replace (those with fewer than cutoff_value occurrences and not in the keep list)
application_types_to_replace = application_type_counts[(application_type_counts < cutoff_value) & 
                                                      (~application_type_counts.index.isin(keep_types))].index

# Step 6: Replace these rare types with 'Other'
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(application_types_to_replace, "Other")

# Step 7: Print the value counts after replacement
print("\nValue Counts After Replacement:")
print(application_df['APPLICATION_TYPE'].value_counts())



Value Counts After Replacement:
APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: count, dtype: int64


In [51]:
# Look at CLASSIFICATION value counts to identify and replace with "Other"
# Determine the value counts for the 'APPLICATION_TYPE' column
classification_type_counts = application_df['CLASSIFICATION'].value_counts()

# Print the value counts for each application type before any changes
print("Value Counts Before Replacement:")
print(classification_type_counts)


Value Counts Before Replacement:
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: count, Length: 71, dtype: int64


In [52]:
# You may find it helpful to look at CLASSIFICATION value counts >1
# Step 1: Get the value counts of 'CLASSIFICATION'
classification_counts = application_df['CLASSIFICATION'].value_counts()

# Step 2: Filter for counts greater than 1
classification_counts_filtered = classification_counts[classification_counts > 1]

# Step 3: Print filtered value counts
print("CLASSIFICATION value counts > 1:")
print(classification_counts_filtered)


CLASSIFICATION value counts > 1:
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
C7000      777
C1700      287
C4000      194
C5000      116
C1270      114
C2700      104
C2800       95
C7100       75
C1300       58
C1280       50
C1230       36
C1400       34
C7200       32
C2300       32
C1240       30
C8000       20
C7120       18
C1500       16
C1800       15
C6000       15
C1250       14
C8200       11
C1238       10
C1278       10
C1235        9
C1237        9
C7210        7
C2400        6
C1720        6
C4100        6
C1257        5
C1600        5
C1260        3
C2710        3
C0           3
C3200        2
C1234        2
C1246        2
C1267        2
C1256        2
Name: count, dtype: int64


In [53]:
# Choose a cutoff value and create a list of classifications to be replaced


# Step 3: Set a cutoff value (e.g., replace classifications with fewer than 1000 occurrences)
cutoff_value = 1000

# Step 4: Identify the classifications to replace (those with fewer than cutoff_value occurrences)
classifications_to_replace = classification_counts[classification_counts < cutoff_value].index

# Step 5: Replace these rare classifications with 'Other'
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(classifications_to_replace, "Other")

# Step 6: Print the value counts after replacement
print("\nValue Counts After Replacement:")
print(application_df['CLASSIFICATION'].value_counts())


Value Counts After Replacement:
CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: count, dtype: int64


In [54]:
# Convert categorical data to numeric with `pd.get_dummies`
# Convert categorical columns into dummy/indicator variables
application_df_encoded = pd.get_dummies(application_df)

# Check the first few rows to see the result
application_df_encoded.head()


Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,1,108590,1,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,1,5000,0,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
3,1,6692,1,False,False,False,True,False,False,False,...,False,True,False,False,False,False,False,False,True,False
4,1,142590,1,False,False,False,True,False,False,False,...,False,False,True,False,False,False,False,False,True,False


In [55]:
#Convert categorical data to numeric using pd.get_dummies
application_df_encoded = pd.get_dummies(application_df)
from sklearn.model_selection import train_test_split

# Step 1: Define the features (X) and target (y) arrays
X = application_df_encoded.drop('IS_SUCCESSFUL', axis=1)  # All columns except 'IS_SUCCESSFUL' are features
y = application_df_encoded['IS_SUCCESSFUL']  # 'IS_SUCCESSFUL' is the target

# Step 2: Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the datasets to verify
print("Training data shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing data shape (X_test, y_test):", X_test.shape, y_test.shape)


Training data shape (X_train, y_train): (27439, 43) (27439,)
Testing data shape (X_test, y_test): (6860, 43) (6860,)


In [56]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [57]:
import tensorflow as tf

# Define the model - deep neural net
nn = tf.keras.models.Sequential()

# First hidden layer with 128 units (instead of 80)
nn.add(tf.keras.layers.Dense(units=128, activation='relu', input_dim=X_train_scaled.shape[1]))

# Second hidden layer with 64 units (instead of 30)
nn.add(tf.keras.layers.Dense(units=64, activation='relu'))

# Output layer with 1 unit (for binary classification)
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 128)               5632      
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 13953 (54.50 KB)
Trainable params: 13953 (54.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [58]:
# Compile the model
nn.compile(loss='binary_crossentropy',  # For binary classification
           optimizer='adam',             # Using Adam optimizer
           metrics=['accuracy'])        # We want to track accuracy during training


In [None]:
import numpy as np

# Ensure the data is in the correct format (float32)
X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32)

# Check for missing or infinite values and handle them
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
y_train = np.nan_to_num(y_train, nan=0.0, posinf=0.0, neginf=0.0)

# If you're using a classification task, ensure y_train is one-hot encoded if needed.
# For binary classification, y_train should be a 1D array with values 0 or 1.
# For multi-class classification, it should be a 2D one-hot encoded matrix.
# Here, I'm assuming you're working on a regression problem. If it's classification, additional steps may be needed.

# Check data types and shapes
print("X_train dtype:", X_train.dtype)
print("y_train dtype:", y_train.dtype)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

# Ensure validation data is in the correct format as well
X_test = np.array(X_test, dtype=np.float32)
y_test = np.array(y_test, dtype=np.float32)

X_test = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)
y_test = np.nan_to_num(y_test, nan=0.0, posinf=0.0, neginf=0.0)

# Re-check data types and shapes for test data
print("X_test dtype:", X_test.dtype)
print("y_test dtype:", y_test.dtype)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Train the model
history = nn.fit(X_train, y_train, 
                 epochs=100,  # The number of times the model will see the entire dataset
                 batch_size=32,  # The number of samples per gradient update
                 validation_data=(X_test, y_test),  # To validate the model during training
                 verbose=2)


X_train dtype: float32
y_train dtype: float32
X_train shape: (27439, 43)
y_train shape: (27439,)
X_test dtype: float32
y_test dtype: float32
X_test shape: (6860, 43)
y_test shape: (6860,)
Epoch 1/100
858/858 - 1s - loss: 82204.9062 - accuracy: 0.5032 - val_loss: 130374.5781 - val_accuracy: 0.4659 - 1s/epoch - 2ms/step
Epoch 2/100
858/858 - 1s - loss: 42539.3828 - accuracy: 0.4902 - val_loss: 147965.8750 - val_accuracy: 0.5341 - 826ms/epoch - 962us/step
Epoch 3/100
858/858 - 1s - loss: 101200.4531 - accuracy: 0.4947 - val_loss: 258004.4688 - val_accuracy: 0.4659 - 845ms/epoch - 985us/step
Epoch 4/100
858/858 - 1s - loss: 34583.5938 - accuracy: 0.4986 - val_loss: 9184.1270 - val_accuracy: 0.4659 - 883ms/epoch - 1ms/step
Epoch 5/100
858/858 - 1s - loss: 58702.6602 - accuracy: 0.5119 - val_loss: 160567.1719 - val_accuracy: 0.4659 - 827ms/epoch - 964us/step
Epoch 6/100
858/858 - 1s - loss: 44915.1406 - accuracy: 0.5011 - val_loss: 2584.0457 - val_accuracy: 0.5907 - 818ms/epoch - 954us/step


In [15]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")



215/215 - 0s - loss: 0.7718 - accuracy: 0.4649 - 468ms/epoch - 2ms/step
Loss: 0.7718218564987183, Accuracy: 0.46486881375312805


In [16]:
# Export the trained model to an HDF5 file
model_filename = 'trained_model.h5'  # Specify the desired filename for the model
nn.save(model_filename)

print(f"Model saved to {model_filename}")


Model saved to trained_model.h5


  saving_api.save_model(
