In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Import our input dataset
df = pd.read_csv('charity_data.csv')
df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# # Computing IQR
# Q1 = df['ASK_AMT'].quantile(0.25)
# Q3 = df['ASK_AMT'].quantile(0.75)
# IQR = Q3 - Q1
# # Filtering Values between Q1-1.5IQR and Q3+1.5IQR
# df = df.query('(@Q1 - 1.5 * @IQR) <= ASK_AMT <= (@Q3 + 1.5 * @IQR)')
# df.describe()

Unnamed: 0,EIN,STATUS,ASK_AMT,IS_SUCCESSFUL
count,26093.0,26093.0,26093.0,26093.0
mean,510406700.0,0.999808,5082.225348,0.526425
std,240275400.0,0.013842,593.63794,0.499311
min,10520600.0,0.0,5000.0,0.0
25%,274821300.0,1.0,5000.0,0.0
50%,464865400.0,1.0,5000.0,1.0
75%,743237600.0,1.0,5000.0,1.0
max,996015800.0,1.0,11854.0,1.0


In [3]:
# test dropping the income amount due to high level of 0 rows
# df=df.drop("INCOME_AMT",axis=1)

In [4]:
# drop non-useful columns
# df = df.drop(["EIN","NAME","STATUS","SPECIAL_CONSIDERATIONS"],axis=1)
df = df.drop(["EIN","NAME"],axis=1)
df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
5,T3,Independent,C1200,Preservation,Trust,1,0,N,5000,1
9,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0


In [5]:
# generate list of categorical variables
cat = df.dtypes[df.dtypes=="object"].index.tolist()
cat

['APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS']

In [6]:
# check number of unique values in each category for grouping/encoding
df[cat].nunique()

APPLICATION_TYPE          12
AFFILIATION                6
CLASSIFICATION            65
USE_CASE                   5
ORGANIZATION               4
INCOME_AMT                 9
SPECIAL_CONSIDERATIONS     2
dtype: int64

In [7]:
# check counts of each unique value in the category
app_counts = df["APPLICATION_TYPE"].value_counts()
class_counts = df["CLASSIFICATION"].value_counts()
income_counts = df["INCOME_AMT"].value_counts()
income_counts

## Application Types with less than 500 entries will be grouped
## Classification with less than 200 entries will be grouped
## Income will not be grouped at this time

0                24332
25000-99999        657
1-9999             504
100000-499999      317
10000-24999        261
1M-5M               15
50M+                 3
10M-50M              2
5M-10M               2
Name: INCOME_AMT, dtype: int64

In [8]:
# Trying Ordinal encoding for Income Amount
from sklearn.preprocessing import OrdinalEncoder
ord_list = ['0','1-9999','10000-24999','25000-99999','100000-499999', \
           '1M-5M','5M-10M','10M-50M','50M+']
ord_enc = OrdinalEncoder()

ord_enc.fit(df["INCOME_AMT"].values.reshape(-1,1))
ord_enc.categories_

[array(['0', '1-9999', '10000-24999', '100000-499999', '10M-50M', '1M-5M',
        '25000-99999', '50M+', '5M-10M'], dtype=object)]

In [9]:
ord_dict = {
    '0':1,
    '1-9999':2,
    '10000-24999':3,
    '25000-99999':4,
    '100000-499999':5,
    '1M-5M':6,
    '5M-10M':7,
    '10M-50M':8,
    '50M+':9
}
df["INCOME_AMT_ORD"] = df["INCOME_AMT"].map(ord_dict)
df = df.drop("INCOME_AMT", axis=1)

In [10]:
# Determine which values to replace
replace_apps = list(app_counts[app_counts < 100].index)
replace_class = list(class_counts[class_counts < 100].index)

# Replace in DataFrame
for i in replace_apps:
    df["APPLICATION_TYPE"] = df["APPLICATION_TYPE"].replace(i,"Other")
for i in replace_class:
    df["CLASSIFICATION"] = df["CLASSIFICATION"].replace(i,"Other")

In [11]:
df["IS_SUCCESSFUL"].value_counts()

1    13736
0    12357
Name: IS_SUCCESSFUL, dtype: int64

In [12]:
# encode all categorical variables with OneHotEncoders
enc = OneHotEncoder(sparse=False)

# rerun the category definer
cat = df.dtypes[df.dtypes=="object"].index.tolist()

# fit and then produce the encoder
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,AFFILIATION_CompanySponsored,...,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Association,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# Merge the encoded columns then drop the original
df_merge = df.merge(encode_df,left_index=True,right_index=True)
df_merge = df_merge.drop(cat,1)
df_merge.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,INCOME_AMT_ORD,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,...,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev,ORGANIZATION_Association,ORGANIZATION_Co-operative,ORGANIZATION_Corporation,ORGANIZATION_Trust,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,1,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1,5000,0,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1,6692,1,3,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
5,1,5000,1,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
9,1,5000,0,1,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [14]:
# split features and targets
y = df_merge["IS_SUCCESSFUL"]

X = df_merge.copy()
X = X.drop(["IS_SUCCESSFUL"],axis=1)

In [15]:
# split training and test
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24)

In [16]:
# scale all the encoded data for both train and test sets
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [17]:
# Start the configuration of the Deep Learning Model

In [18]:
# define the model
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = len(X_train_scaled[0]) * 3
hidden_nodes_layer2 = len(X_train_scaled[0]) * 2

nn = Sequential()

# First hidden layer
nn.add(Dense(units=hidden_nodes_layer1,input_dim=number_input_features,
                         activation="relu")
)

# Second hidden Layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

# output layer
nn.add(Dense(units=1, activation="sigmoid"))

#check structure of model
nn.summary()#

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 117)               4680      
_________________________________________________________________
dense_1 (Dense)              (None, 78)                9204      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 79        
Total params: 13,963
Trainable params: 13,963
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Train on 15031 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
5011/5011 - 1s - loss: 0.7052 - accuracy: 0.5328
Loss: 0.7052012963628987, Accuracy: 0.5328277945518494


In [20]:
# #### MODEL SCENARIO 2

# # define the model
# number_input_features = len(X_train_scaled[0])
# hidden_nodes_layer1 = len(X_train_scaled[0]) * 3
# hidden_nodes_layer2 = len(X_train_scaled[0]) * 2

# nn = Sequential()

# # First hidden layer
# nn.add(Dense(units=hidden_nodes_layer1,input_dim=number_input_features,
#                          activation="relu")
# )

# # Second hidden Layer
# nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

# # output layer
# nn.add(Dense(units=1, activation="tanh"))

# # Compile the Sequential model together and customize metrics
# nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# # Train the model
# fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# # Evaluate the model using the test data
# model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
# print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")