In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns 

# Feature selection 
<a class="anchor" id="#1"></a>

In [4]:
target = pd.read_csv('target.csv')
num = pd.read_csv('numerical.csv')
cat = pd.read_csv('categorical.csv')

In [6]:
all_data = pd.concat((num,target),axis=1)
# all_data.head()
donations_data = all_data[all_data['TARGET_B']==1] #numerical data for the people that have donated 
X = donations_data.drop(columns=['TARGET_B','TARGET_D'])
y = donations_data['TARGET_D']


In [7]:
num = num[['RFA_2F','CARDGIFT','HVP1','ETH2','RP1','NGIFTALL','HV1'
           ,'ETHC5','CARDPROM','NUMPROM','RP3','CLUSTER2','INCOME',
           'IC15','CONTROLN','HHAS4','HC6','POBC2','MHUC1']]

In [8]:
data_imbalanced = pd.concat([num, target.TARGET_B],axis=1)

In [9]:
data_imbalanced_2 = pd.concat([num, target.TARGET_D],axis=1)

# Imbalanced target
<a class="anchor" id="#3"></a>

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

####
# train - test - split
#####
X_train, X_test, y_train, y_test = train_test_split(num, target.TARGET_B, test_size=0.25)

####
# scaling Xtrain in scaler
#####
transformer = MinMaxScaler().fit(X_train)
## transforming Xtrain, Xtest
num_train_minmax = transformer.transform(X_train)
num_test_minmax = transformer.transform(X_test)

#####
# fitting train set in model
######
classifier = LogisticRegression(max_iter = 1000)
classifier.fit(X_train, y_train)

####
# metrics
#####
display(classifier.score(X_train, y_train))
display(classifier.score(X_test, y_test))

0.9501110971366286

0.9465895275227435

# Treating imbalance
<a class="anchor" id="#4"></a>

In [12]:
data = pd.concat([num,target.TARGET_B],axis=1)
data.TARGET_B.value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [14]:
data.head()

Unnamed: 0,RFA_2F,CARDGIFT,HVP1,ETH2,RP1,NGIFTALL,HV1,ETHC5,CARDPROM,NUMPROM,RP3,CLUSTER2,INCOME,IC15,CONTROLN,HHAS4,HC6,POBC2,MHUC1,TARGET_B
0,4,14,2,1,2,31,479,0,27,74,17,39,5,4,95515,1,31,74,6,0
1,2,1,97,0,91,3,5468,0,12,32,91,1,6,2,148535,3,97,39,20,0
2,4,14,0,2,0,27,497,1,26,63,9,60,3,20,15078,11,50,84,9,0
3,4,7,10,0,1,16,1000,0,27,66,17,41,1,35,172556,20,39,67,16,0
4,2,8,0,98,25,37,576,58,43,113,74,26,3,21,7112,14,6,65,6,0


In [15]:
from sklearn.utils import resample
####
# splitting target values
#####
category_0 = data[data.TARGET_B == 0]  # would be train set, 0= majority
category_1 = data[data.TARGET_B == 1]  # minority

####
# downsampling
######
category_0_undersampled = resample(category_0, 
                                   replace=False, 
                                   n_samples = len(category_1))
print(category_0_undersampled.shape)
print(category_1.shape)
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

#####
# upsampling
######
category_1_oversampled = resample(category_1, 
                                  replace=True, # copying existing column to reach max columns
                                  n_samples = len(category_0))
print(category_1_oversampled.shape)
print(category_0.shape)
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

(4843, 20)
(4843, 20)
(90569, 20)
(90569, 20)


# Logistic Regression with balanced target
<a class="anchor" id="#5"></a>

In [16]:
def log_reg(df):
    
    ####
    # x - y - split
    #####
    features = df.drop(['TARGET_B'],axis=1)
    target = df['TARGET_B']
    
    ####
    # train - test - split
    #####
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25)
    
    ####
    # scaling Xtrain in scaler
    #####
    transformer = MinMaxScaler().fit(X_train)
    num_train_minmax = transformer.transform(X_train)
    num_test_minmax = transformer.transform(X_test)
    
    #####
    # fitting train set in model
    ######
    classifier = LogisticRegression(max_iter = 1337)
    classifier.fit(X_train, y_train)
    
    ####
    # metrics
    #####
    score_1 = classifier.score(X_train, y_train)
    score_2 = classifier.score(X_test, y_test)
    
    return print('train',score_1,'\n''test',score_2,'\n')

In [17]:
log_reg(data_imbalanced)
log_reg(data_downsampled)
log_reg(data_upsampled)

train 0.9489372405986668 
test 0.9498176330021381 

train 0.5404735682819384 
test 0.555739058629232 

train 0.5347912817530713 
test 0.5343049574914431 



In [211]:
# from sklearn.model_selection import RandomizedSearchCV

# max_iter = [1000,1337,2000,10000,20000]
# solver =  ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# random_grid = {'max_iter': max_iter, 'solver': solver}
# model = LogisticRegression()
# random_search = RandomizedSearchCV(estimator = model,
#                                    param_distributions = random_grid,
#                                    n_iter=10, # number of random combinations
#                                    cv = 5,
#                                    n_jobs = 10) # multithreading cpus



In [212]:
# random_search.fit(X_train,y_train)
# print(random_search.best_params_)
# print(random_search.best_score_)

# Random Forests
<a class="anchor" id="#6"></a>

In [18]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import confusion_matrix

data_upsampled.head()

Unnamed: 0,RFA_2F,CARDGIFT,HVP1,ETH2,RP1,NGIFTALL,HV1,ETHC5,CARDPROM,NUMPROM,RP3,CLUSTER2,INCOME,IC15,CONTROLN,HHAS4,HC6,POBC2,MHUC1,TARGET_B
0,4,14,2,1,2,31,479,0,27,74,17,39,5,4,95515,1,31,74,6,0
1,2,1,97,0,91,3,5468,0,12,32,91,1,6,2,148535,3,97,39,20,0
2,4,14,0,2,0,27,497,1,26,63,9,60,3,20,15078,11,50,84,9,0
3,4,7,10,0,1,16,1000,0,27,66,17,41,1,35,172556,20,39,67,16,0
4,2,8,0,98,25,37,576,58,43,113,74,26,3,21,7112,14,6,65,6,0


In [19]:
def forest(df):
    
    ####
    # x - y - split
    #####
    features = df.drop(['TARGET_B'],axis=1)
    target = df['TARGET_B']
    
    ####
    # train - test - split
    #####
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25)
    
    ####
    # scaling Xtrain in scaler
    #####
    transformer = MinMaxScaler().fit(X_train)
    num_train_minmax = transformer.transform(X_train)
    num_test_minmax = transformer.transform(X_test)
    
    #####
    # fitting train set in model
    ######
    classifier = RandomForestClassifier(max_depth=5, # max number of questions
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8) # fraction of original dataset
                             
    classifier.fit(X_train, y_train)
    
    ####
    # metrics
    #####
    score_1 = classifier.score(X_train, y_train)
    score_2 = classifier.score(X_test, y_test)
    
    ####
    # predictions
    ####
    y_pred = classifier.predict(X_test)
    
    return print('train',score_1,'\n''test',score_2,'\n''\n','confunsion matrix','\n''\n',confusion_matrix(y_test, y_pred),'\n')

In [20]:
forest(data_upsampled)

train 0.6045357850029075 
test 0.5995583526554047 

 confunsion matrix 

 [[13777  8812]
 [ 9322 13374]] 



## Random Forest Regressor

In [21]:
data_imbalanced_2.head()

Unnamed: 0,RFA_2F,CARDGIFT,HVP1,ETH2,RP1,NGIFTALL,HV1,ETHC5,CARDPROM,NUMPROM,RP3,CLUSTER2,INCOME,IC15,CONTROLN,HHAS4,HC6,POBC2,MHUC1,TARGET_D
0,4,14,2,1,2,31,479,0,27,74,17,39,5,4,95515,1,31,74,6,0.0
1,2,1,97,0,91,3,5468,0,12,32,91,1,6,2,148535,3,97,39,20,0.0
2,4,14,0,2,0,27,497,1,26,63,9,60,3,20,15078,11,50,84,9,0.0
3,4,7,10,0,1,16,1000,0,27,66,17,41,1,35,172556,20,39,67,16,0.0
4,2,8,0,98,25,37,576,58,43,113,74,26,3,21,7112,14,6,65,6,0.0


In [22]:
donor = data_imbalanced_2[data_imbalanced_2.TARGET_D>0]
len(donor)

4843

In [23]:
def forest_2(df):
    
    ####
    # x - y - split
    #####
    features = df.drop(['TARGET_D'],axis=1)
    target = df['TARGET_D']
    
    ####
    # train - test - split
    #####
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25)
    
    ####
    # scaling Xtrain in scaler
    #####
    transformer = MinMaxScaler().fit(X_train)
    num_train_minmax = transformer.transform(X_train)
    num_test_minmax = transformer.transform(X_test)
    
    #####
    # fitting train set in model
    ######
    classifier = RandomForestRegressor(max_depth=5, # max number of questions
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.8) # fraction of original dataset
                             
    classifier.fit(X_train, y_train)
    
    ####
    # metrics
    #####
    score_1 = classifier.score(X_train, y_train)
    score_2 = classifier.score(X_test, y_test)
    
    ####
    # predictions
    ####
    pred = classifier.predict(X_test)
    
    return print('train',score_1,'\n''test',score_2,'\n''\n','predicted amount''\n',np.mean(pred))

In [24]:
forest_2(donor)

train 0.23203407969698564 
test 0.16884112160575082 

 predicted amount
 15.470438744343241


In [25]:
mailing_cost = 0.18
all_pop = len(target)
only_donors = len(donor)
av_donation_donors = 15.628022799265295

In [26]:
revenue_all = (only_donors * av_donation_donors) - (all_pop * mailing_cost)
print(round(revenue_all,2),'USD DOLLARS')


58512.35 USD DOLLARS


In [27]:
from sklearn.model_selection import cross_val_score
def vali(df):
    
    features = df.drop(['TARGET_D'],axis=1)
    target = df['TARGET_D']
    
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25)

    
    clf = RandomForestRegressor(max_depth=5,
                                 min_samples_split=20,
                                 min_samples_leaf =20)
    
    cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
    return print(np.mean(cross_val_scores))


In [28]:
vali(donor)

0.17208259504166645


In [None]:
# num_2