In [27]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD, Adagrad, RMSprop, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Import our input dataset
df = pd.read_csv('charity_data.csv')
df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# understand grouping 
np.median(df["ASK_AMT"])

5000.0

In [3]:
# clean with drop na and duplicates
df = df.drop_duplicates()
df = df.dropna()
df.describe()

Unnamed: 0,EIN,STATUS,ASK_AMT,IS_SUCCESSFUL
count,34299.0,34299.0,34299.0,34299.0
mean,519185200.0,0.999854,2769199.0,0.532406
std,245147200.0,0.012073,87130450.0,0.498956
min,10520600.0,0.0,5000.0,0.0
25%,274848200.0,1.0,5000.0,0.0
50%,465631700.0,1.0,5000.0,1.0
75%,752611700.0,1.0,7742.0,1.0
max,996086900.0,1.0,8597806000.0,1.0


In [4]:
# test dropping the income amount due to high level of 0 rows
# df=df.drop("INCOME_AMT",axis=1)

In [5]:
# drop non-useful columns
# df = df.drop(["EIN","NAME","STATUS","SPECIAL_CONSIDERATIONS"],axis=1)
df = df.drop(["EIN","NAME"],axis=1)
df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [6]:
# generate list of categorical variables
cat = df.dtypes[df.dtypes=="object"].index.tolist()
cat

['APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS']

In [7]:
# check number of unique values in each category for grouping/encoding
df[cat].nunique()

APPLICATION_TYPE          17
AFFILIATION                6
CLASSIFICATION            71
USE_CASE                   5
ORGANIZATION               4
INCOME_AMT                 9
SPECIAL_CONSIDERATIONS     2
dtype: int64

In [8]:
# check counts of each unique value in the category
app_counts = df["APPLICATION_TYPE"].value_counts()
class_counts = df["CLASSIFICATION"].value_counts()
income_counts = df["INCOME_AMT"].value_counts()
class_counts

## Application Types with less than 500 entries will be grouped
## Classification with less than 200 entries will be grouped
## Income will not be grouped at this time

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C1248        1
C2170        1
C1728        1
C4120        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64

In [9]:
# Trying Ordinal encoding for Income Amount
from sklearn.preprocessing import OrdinalEncoder
ord_list = ['0','1-9999','10000-24999','25000-99999','100000-499999', \
           '1M-5M','5M-10M','10M-50M','50M+']
ord_enc = OrdinalEncoder()

ord_enc.fit(df["INCOME_AMT"].values.reshape(-1,1))

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [10]:
ord_dict = {
    '0':1,
    '1-9999':2,
    '10000-24999':3,
    '25000-99999':4,
    '100000-499999':5,
    '1M-5M':6,
    '5M-10M':7,
    '10M-50M':8,
    '50M+':9
}

# Testing a different way to encode income amt

# df["INCOME_AMT_ORD"] = df["INCOME_AMT"].map(ord_dict)
# df = df.drop("INCOME_AMT", axis=1)

In [11]:
# Determine which values to replace
replace_apps = list(app_counts[app_counts < 100].index)
replace_class = list(class_counts[class_counts < 100].index)

# Replace in DataFrame
for i in replace_apps:
    df["APPLICATION_TYPE"] = df["APPLICATION_TYPE"].replace(i,"Other")
for i in replace_class:
    df["CLASSIFICATION"] = df["CLASSIFICATION"].replace(i,"Other")

In [12]:
df["IS_SUCCESSFUL"].value_counts()

1    18261
0    16038
Name: IS_SUCCESSFUL, dtype: int64

In [13]:
# encode all categorical variables with OneHotEncoders
enc = OneHotEncoder(sparse=False)

# rerun the category definer
cat = df.dtypes[df.dtypes=="object"].index.tolist()

# fit and then produce the encoder
encode_df = pd.DataFrame(enc.fit_transform(df[cat]))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(cat)
encode_df.head()

Unnamed: 0,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,APPLICATION_TYPE_T7,APPLICATION_TYPE_T8,APPLICATION_TYPE_T9,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Merge the encoded columns then drop the original
df_merge = df.merge(encode_df,left_index=True,right_index=True)
df_merge = df_merge.drop(cat,1)
df_merge.head()

Unnamed: 0,STATUS,ASK_AMT,IS_SUCCESSFUL,APPLICATION_TYPE_Other,APPLICATION_TYPE_T10,APPLICATION_TYPE_T19,APPLICATION_TYPE_T3,APPLICATION_TYPE_T4,APPLICATION_TYPE_T5,APPLICATION_TYPE_T6,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,1,5000,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,108590,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,5000,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,6692,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,142590,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
df_merge.columns

Index(['STATUS', 'ASK_AMT', 'IS_SUCCESSFUL', 'APPLICATION_TYPE_Other',
       'APPLICATION_TYPE_T10', 'APPLICATION_TYPE_T19', 'APPLICATION_TYPE_T3',
       'APPLICATION_TYPE_T4', 'APPLICATION_TYPE_T5', 'APPLICATION_TYPE_T6',
       'APPLICATION_TYPE_T7', 'APPLICATION_TYPE_T8', 'APPLICATION_TYPE_T9',
       'AFFILIATION_CompanySponsored', 'AFFILIATION_Family/Parent',
       'AFFILIATION_Independent', 'AFFILIATION_National', 'AFFILIATION_Other',
       'AFFILIATION_Regional', 'CLASSIFICATION_C1000', 'CLASSIFICATION_C1200',
       'CLASSIFICATION_C1270', 'CLASSIFICATION_C1700', 'CLASSIFICATION_C2000',
       'CLASSIFICATION_C2100', 'CLASSIFICATION_C2700', 'CLASSIFICATION_C3000',
       'CLASSIFICATION_C4000', 'CLASSIFICATION_C5000', 'CLASSIFICATION_C7000',
       'CLASSIFICATION_Other', 'USE_CASE_CommunityServ', 'USE_CASE_Heathcare',
       'USE_CASE_Other', 'USE_CASE_Preservation', 'USE_CASE_ProductDev',
       'ORGANIZATION_Association', 'ORGANIZATION_Co-operative',
       'ORGANIZATION

In [16]:
# split features and targets
y = df_merge["IS_SUCCESSFUL"]

X = df_merge.copy()
X = X.drop(["IS_SUCCESSFUL"],axis=1)

In [17]:
# split training and test
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=24)

In [18]:
# scale all the encoded data for both train and test sets
scaler = StandardScaler()
# scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [26]:
# Start the configuration of the Deep Learning Model
X_train_scaled[0].shape

(50,)

In [28]:
# define the model
num_input = len(X_train_scaled[0])
neurons_layer1 = len(X_train_scaled[0]) * 1.5
neurons_layer2 = len(X_train_scaled[0]) 
neurons_layer3 = len(X_train_scaled[0]) / 1.5

# Start creating the model inputs, layers and outputs

inputs = Input(shape=X_train_scaled[0].shape)
branchA = Dense(neurons_layer1, activation="relu")(inputs)

In [None]:
# test changing different hyperparameters

opt = SGD(lr=0.01, momentum=0.9, decay=0.01)
opt = Adam()

# rlrop = ReduceLROnPlateau(monitor='loss',factor=0.1,patience=25)

In [None]:
# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=200)
# fit_model = nn.fit(X_train_scaled, y_train, epochs=300, callbacks=[rlrop])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")