In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

#### import the data

In [2]:
numerical = pd.read_csv('./files_for_lab/numerical.csv')
categorical = pd.read_csv('./files_for_lab/categorical.csv')
target = pd.read_csv('./files_for_lab/target.csv')

In [3]:
#numerical

In [4]:
#categorical

In [5]:
#categorical.dtypes

In [6]:
categorical = categorical.astype(object)

In [7]:
#categorical.dtypes

In [8]:
#target

#### Apply the Random Forests algorithm but this time only by upscaling the data.

##### first do the train test split

In [9]:
all_data = pd.concat([numerical, categorical], axis=1)

In [99]:
#all_data

In [10]:
X = all_data
y = target['TARGET_B']

In [11]:
#X

In [12]:
#y

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [14]:
X_train_num = X_train.select_dtypes(np.number)
X_test_num = X_test.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object)
X_test_cat = X_test.select_dtypes(object)

##### Scale and OneHot Encode the data

In [15]:
#Usage of MinMax Scaler
scaler = MinMaxScaler().fit(X_train_num)

In [16]:
X_train_num_scaled = scaler.transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

In [17]:
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns=list(X_train_num.columns))
X_test_num_scaled = pd.DataFrame(X_test_num_scaled, columns=list(X_test_num.columns))

In [18]:
#Usage of One-Hot Encoding
encoder = OneHotEncoder(drop='first', handle_unknown = 'ignore').fit(X_train_cat)
all_columns = encoder.get_feature_names_out(input_features=X_train_cat.columns)
X_train_cat_encode = pd.DataFrame(encoder.transform(X_train_cat).toarray(), columns=all_columns)
X_test_cat_encode = pd.DataFrame(encoder.transform(X_test_cat).toarray(), columns=all_columns)



In [19]:
X_train = pd.concat((X_train_num_scaled,X_train_cat_encode),axis=1)
X_test = pd.concat((X_test_num_scaled,X_test_cat_encode),axis=1)

In [20]:
y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 

##### Oversampling the train data

In [21]:
trainset = pd.concat([X_train, y_train], axis=1)

In [22]:
# separate majority/minority classes
category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1]

In [23]:
from sklearn.utils import resample
category_1_oversampled = resample(category_1, 
                                    replace=True,            
                                    n_samples = len(category_0))  

In [24]:
print(category_1_oversampled.shape)
print(category_0.shape)

(67928, 629)
(67928, 629)


In [25]:
train_oversampled = pd.concat([category_1_oversampled, category_0], axis=0)

In [26]:
trainset_new = train_oversampled.sample(frac =1)

In [27]:
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [28]:
clf_first = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             max_samples=0.2) # fraction of X-train to use in each tree
clf_first.fit(X_train, y_train)
print(clf_first.score(X_train, y_train))
print(clf_first.score(X_test, y_test))

y_pred = clf_first.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.627164056059357
0.6021464805265585


0    22641
1     1212
Name: TARGET_B, dtype: int64

array([[13698,  8943],
       [  547,   665]], dtype=int64)

In [29]:
# For cross validation
clf_first = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2)
cross_val_scores = cross_val_score(clf_first, X_train, y_train, cv=5)
print(np.mean(cross_val_scores))

0.6192806827349127


In [30]:
cross_val_scores

array([0.62284705, 0.6184903 , 0.61433146, 0.62194987, 0.61878473])

#### Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

##### First try: RFE

In [31]:
X_RFE = numerical
y_RFE = target['TARGET_B']

from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=25, verbose=False)
rfe.fit(X_RFE, y_RFE)

RFE(estimator=LinearRegression(), n_features_to_select=25, verbose=False)

In [32]:
# After we run the algorithm, it labels the top features as 1 and the rest are marked in an increasing order of importance.
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(numerical).columns
df[df['Rank']==1]
#df['Rank'].value_counts()

Unnamed: 0,Rank,Column_name
2,1,INCOME
16,1,POP90C1
17,1,POP90C2
18,1,POP90C3
30,1,ETH10
76,1,DW3
77,1,DW4
78,1,DW5
92,1,HHD1
99,1,HHD8


In [33]:
# For building a new model, I just use these numerical columns + my categorical ones:
num_list = ['INCOME', 'POP90C1', 'ETH10', 'DW3', 'HHD1', 'RHP3', 'HUPA1', 'MC1', 'TPE3', 'OCC6', 'HC15', 'CARDGIFT', 'HPHONE_D'
           , 'RFA_2F']

In [34]:
X_train_rfe = X_train[num_list]
display(X_train_rfe)

Unnamed: 0,INCOME,POP90C1,ETH10,DW3,HHD1,RHP3,HUPA1,MC1,TPE3,OCC6,HC15,CARDGIFT,HPHONE_D,RFA_2F
19361,0.833333,0.969697,0.000000,0.030303,0.474747,0.262295,0.030303,0.424242,0.000000,0.000000,0.0,0.195122,1.0,0.000000
66281,0.666667,0.959596,0.000000,0.010101,0.656566,0.311475,0.010101,0.434343,0.030303,0.000000,0.0,0.146341,0.0,0.333333
7646,0.166667,1.000000,0.000000,0.020202,0.232323,0.213115,0.030303,0.131313,0.000000,0.000000,0.0,0.097561,1.0,0.333333
34223,0.833333,1.000000,0.043478,0.000000,0.212121,0.213115,0.060606,0.525253,0.060606,0.000000,0.0,0.024390,1.0,0.000000
28413,0.666667,0.797980,0.000000,0.010101,0.414141,0.229508,0.080808,0.595960,0.000000,0.000000,0.0,0.195122,0.0,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37172,0.666667,1.000000,0.043478,0.000000,0.363636,0.229508,0.202020,0.636364,0.010101,0.000000,0.0,0.048780,0.0,0.000000
47985,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
4982,0.666667,0.000000,0.000000,0.020202,0.393939,0.229508,0.070707,0.424242,0.000000,0.023256,0.0,0.121951,0.0,0.333333
26629,0.666667,0.000000,0.000000,0.000000,0.434343,0.245902,0.000000,0.313131,0.000000,0.000000,0.0,0.073171,1.0,0.000000


In [35]:
X_test_rfe = X_test[num_list]
display(X_test_rfe)

Unnamed: 0,INCOME,POP90C1,ETH10,DW3,HHD1,RHP3,HUPA1,MC1,TPE3,OCC6,HC15,CARDGIFT,HPHONE_D,RFA_2F
0,0.666667,1.0,0.043478,0.000000,0.323232,0.229508,0.000000,0.393939,0.030303,0.000000,0.000000,0.097561,0.0,1.000000
1,0.666667,1.0,0.000000,0.030303,0.292929,0.213115,0.050505,0.585859,0.020202,0.000000,0.000000,0.390244,0.0,0.000000
2,0.166667,0.0,0.000000,0.030303,0.414141,0.245902,0.050505,0.454545,0.000000,0.023256,0.000000,0.121951,1.0,0.333333
3,0.166667,0.0,0.000000,0.000000,0.393939,0.229508,0.010101,0.373737,0.000000,0.000000,0.000000,0.097561,0.0,0.000000
4,0.666667,0.0,0.000000,0.000000,0.404040,0.245902,0.000000,0.202020,0.020202,0.000000,0.000000,0.146341,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23848,0.666667,1.0,0.021739,0.010101,0.181818,0.196721,0.111111,0.575758,0.010101,0.023256,0.000000,0.121951,0.0,0.000000
23849,0.500000,1.0,0.000000,0.121212,0.343434,0.213115,0.383838,0.363636,0.020202,0.000000,0.000000,0.097561,0.0,1.000000
23850,0.333333,0.0,0.000000,0.010101,0.292929,0.229508,0.010101,0.494949,0.000000,0.000000,0.000000,0.292683,0.0,0.666667
23851,0.333333,0.0,0.000000,0.010101,0.424242,0.229508,0.060606,0.333333,0.000000,0.046512,0.033333,0.268293,0.0,0.000000


In [36]:
#Append categorical columns
X_train_rfe = pd.concat([X_train_rfe, X_train_cat_encode], axis = 1)
X_test_rfe = pd.concat([X_test_rfe, X_test_cat_encode], axis = 1)

In [37]:
X_train_rfe

Unnamed: 0,INCOME,POP90C1,ETH10,DW3,HHD1,RHP3,HUPA1,MC1,TPE3,OCC6,...,FIRSTDATE_MM_3,FIRSTDATE_MM_4,FIRSTDATE_MM_5,FIRSTDATE_MM_6,FIRSTDATE_MM_7,FIRSTDATE_MM_8,FIRSTDATE_MM_9,FIRSTDATE_MM_10,FIRSTDATE_MM_11,FIRSTDATE_MM_12
19361,0.833333,0.969697,0.000000,0.030303,0.474747,0.262295,0.030303,0.424242,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66281,0.666667,0.959596,0.000000,0.010101,0.656566,0.311475,0.010101,0.434343,0.030303,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7646,0.166667,1.000000,0.000000,0.020202,0.232323,0.213115,0.030303,0.131313,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
34223,0.833333,1.000000,0.043478,0.000000,0.212121,0.213115,0.060606,0.525253,0.060606,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
28413,0.666667,0.797980,0.000000,0.010101,0.414141,0.229508,0.080808,0.595960,0.000000,0.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37172,0.666667,1.000000,0.043478,0.000000,0.363636,0.229508,0.202020,0.636364,0.010101,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47985,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4982,0.666667,0.000000,0.000000,0.020202,0.393939,0.229508,0.070707,0.424242,0.000000,0.023256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26629,0.666667,0.000000,0.000000,0.000000,0.434343,0.245902,0.000000,0.313131,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
X_test_rfe

Unnamed: 0,INCOME,POP90C1,ETH10,DW3,HHD1,RHP3,HUPA1,MC1,TPE3,OCC6,...,FIRSTDATE_MM_3,FIRSTDATE_MM_4,FIRSTDATE_MM_5,FIRSTDATE_MM_6,FIRSTDATE_MM_7,FIRSTDATE_MM_8,FIRSTDATE_MM_9,FIRSTDATE_MM_10,FIRSTDATE_MM_11,FIRSTDATE_MM_12
0,0.666667,1.0,0.043478,0.000000,0.323232,0.229508,0.000000,0.393939,0.030303,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.666667,1.0,0.000000,0.030303,0.292929,0.213115,0.050505,0.585859,0.020202,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.166667,0.0,0.000000,0.030303,0.414141,0.245902,0.050505,0.454545,0.000000,0.023256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.166667,0.0,0.000000,0.000000,0.393939,0.229508,0.010101,0.373737,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666667,0.0,0.000000,0.000000,0.404040,0.245902,0.000000,0.202020,0.020202,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23848,0.666667,1.0,0.021739,0.010101,0.181818,0.196721,0.111111,0.575758,0.010101,0.023256,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23849,0.500000,1.0,0.000000,0.121212,0.343434,0.213115,0.383838,0.363636,0.020202,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
23850,0.333333,0.0,0.000000,0.010101,0.292929,0.229508,0.010101,0.494949,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23851,0.333333,0.0,0.000000,0.010101,0.424242,0.229508,0.060606,0.333333,0.000000,0.046512,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


##### Rebuild RandomForest Model

In [39]:
clf_RFE_class1 = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20) # ultimate answer based on at least this many rows
clf_RFE_class1.fit(X_train_rfe, y_train)
print(clf_RFE_class1.score(X_train_rfe, y_train))
print(clf_RFE_class1.score(X_test_rfe, y_test))
y_pred = clf_RFE_class1.predict(X_test_rfe)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6062890118949475
0.6003018488240472


0    22641
1     1212
Name: TARGET_B, dtype: int64

array([[13639,  9002],
       [  532,   680]], dtype=int64)

In [40]:
# For cross validation
clf_RFE_class1_cross = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf_RFE_class1_cross, X_train_rfe, y_train, cv=5)
print(np.mean(cross_val_scores))

0.6037347866844571


In [41]:
#Did not improve the model. Next try: Just use the selected 14 numerical features without the categorical ones
X_train_rfe_num = X_train[num_list]
X_test_rfe_num = X_test[num_list]

In [42]:
clf_RFE_class_num = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20) # ultimate answer based on at least this many rows
clf_RFE_class_num.fit(X_train_rfe_num, y_train)
print(clf_RFE_class_num.score(X_train_rfe_num, y_train))
print(clf_RFE_class_num.score(X_test_rfe_num, y_test))

y_pred = clf_RFE_class_num.predict(X_test_rfe_num)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.5915380991638205
0.6215151134029263


0    22641
1     1212
Name: TARGET_B, dtype: int64

array([[14186,  8455],
       [  573,   639]], dtype=int64)

In [43]:
#y_train.value_counts()

In [44]:
# For cross validation
clf_RFE_class_num_cross = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf_RFE_class_num_cross, X_train_rfe_num, y_train, cv=5)
print(np.mean(cross_val_scores))

0.5940112734391031


In [45]:
#No improvement by just using the 14 numericals selected with RFE as features.

##### to be continued

### Lab Final Regression Model

In [46]:
donated_all = pd.concat([all_data, target], axis = 1)

In [47]:
donated_all

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.000000,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,96,2,96,2,96,2,96,2,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,96,3,96,3,96,3,96,3,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,96,3,95,1,96,10,94,10,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,90,11,96,8,97,1,86,12,1,18.0


In [48]:
donated = donated_all[donated_all['TARGET_B'] == 1]

In [49]:
donated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
20,2,62.000000,3,8,10,2,25,40,27,11,...,88,1,94,4,96,3,87,1,1,4.0
30,0,61.611649,5,9,0,1,37,58,16,8,...,90,4,93,1,95,12,90,4,1,7.0
45,0,66.000000,5,9,5,0,33,24,39,6,...,93,12,94,4,96,2,87,4,1,5.0
78,0,69.000000,6,9,0,0,34,20,54,2,...,90,1,95,3,95,11,90,1,1,13.0
93,1,73.000000,1,7,10,0,21,53,8,5,...,92,9,95,9,95,9,92,9,1,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,2,45.000000,5,9,0,0,45,28,37,9,...,89,6,96,1,96,1,86,8,1,20.0
95309,0,51.000000,5,6,1,1,32,43,24,7,...,93,10,94,2,95,12,93,10,1,15.0
95398,0,86.000000,5,9,0,1,32,21,26,9,...,89,6,95,11,96,2,87,11,1,3.0
95403,0,58.000000,4,9,0,0,24,46,20,6,...,90,3,93,12,96,1,90,3,1,10.0


In [50]:
donated_numerical = donated.select_dtypes(np.number)
donated_cat = donated.select_dtypes(object)

In [51]:
donated_numerical = donated_numerical.drop(['TARGET_B', 'TARGET_D'], axis = 1)
donated_numerical

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
20,2,62.000000,3,8,10,2,25,40,27,11,...,10,2.00,7.0,5.0,12,4.066667,82943,1,3,3
30,0,61.611649,5,9,0,1,37,58,16,8,...,6,2.00,10.0,7.0,9,6.181818,190313,1,3,14
45,0,66.000000,5,9,5,0,33,24,39,6,...,14,3.00,6.0,5.0,3,4.857143,76585,1,3,11
78,0,69.000000,6,9,0,0,34,20,54,2,...,5,5.00,17.0,10.0,21,11.000000,156378,0,2,2
93,1,73.000000,1,7,10,0,21,53,8,5,...,8,5.00,12.0,12.0,6,9.400000,25641,1,3,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,2,45.000000,5,9,0,0,45,28,37,9,...,16,0.07,17.0,17.0,7,7.935667,154544,0,1,52
95309,0,51.000000,5,6,1,1,32,43,24,7,...,2,5.00,15.0,15.0,4,11.666667,171302,1,1,20
95398,0,86.000000,5,9,0,1,32,21,26,9,...,4,5.00,25.0,20.0,15,14.400000,78831,0,3,3
95403,0,58.000000,4,9,0,0,24,46,20,6,...,6,3.00,20.0,20.0,10,11.583333,84678,0,1,56


In [52]:
#donated_cat

#### Apply RFE for feature selection for numericals

In [53]:
X = donated_numerical
y = donated['TARGET_D']

from sklearn.feature_selection import RFE
from sklearn import linear_model
lm = linear_model.LinearRegression()
rfe = RFE(lm, n_features_to_select=25, verbose=False)
rfe.fit(X, y)

RFE(estimator=LinearRegression(), n_features_to_select=25, verbose=False)

In [54]:
# After we run the algorithm, it labels the top features as 1 and the rest are marked in an increasing order of importance.
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(donated_numerical).columns
df[df['Rank']<26]
#df['Rank'].value_counts()

Unnamed: 0,Rank,Column_name
16,1,POP90C1
17,1,POP90C2
18,1,POP90C3
19,24,POP90C4
20,23,POP90C5
21,1,ETH1
22,1,ETH2
25,1,ETH5
33,1,ETH13
34,1,ETH14


In [55]:
numerical_features_selected = ['POP90C1', 'ETH1', 'HHN2', 'DW9', 'HU2', 'HHD1', 'ETHC1', 'HUPA7', 'MC1', 'TPE7', 'ANC15', 
                              'HC15', 'MINRAMNT', 'LASTGIFT', 'AVGGIFT', 'RFA_2F']

In [56]:
#donated_num_feat_RFE = donated_numerical[numerical_features_selected].copy()

In [57]:
#donated_num_feat_RFE

#### Building Regressor Model

In [58]:
X_RFE = donated_numerical
y_RFE = donated['TARGET_D']

X_train, X_test, y_train, y_test = train_test_split(X_RFE, y_RFE, test_size=0.25)

#Usage of MinMax Scaler

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=list(X_train.columns))
X_test_scaled = pd.DataFrame(X_test_scaled, columns=list(X_test.columns))

In [59]:
X_train = X_train_scaled[numerical_features_selected].copy()
X_test = X_test_scaled[numerical_features_selected].copy()

y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 

In [60]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=10,
                             criterion = 'squared_error',
                             min_samples_split=10,
                             min_samples_leaf = 10)
regr.fit(X_train, y_train)
print(regr.score(X_train, y_train))
print(regr.score(X_test, y_test))

y_pred = regr.predict(X_test)

0.6128269242609163
0.535630560710773


In [61]:
y_pred

array([17.61853207, 16.55190785, 11.81663979, ...,  8.57598076,
       15.39876175, 15.82407865])

In [62]:
# assume X_test is the test set for the classification model
# assume clf is the fitted classification model
# assume model is the fitted regression model

# predict probability of donating for each person in X_test
#proba = clf.predict_proba(X_test_scaled)[:, 1]  # use positive class probability

# filter people who are predicted to donate
#donors = X_test_scaled[proba > 0.5]

# predict how much they will donate using the regression model
#X_donor = donors.drop(columns=['TARGET_B', 'TARGET_D'])
#X_donor_const = sm.add_constant(X_donor)
#predicted_donations = model.predict(X_donor_const)

#### Using now both models (classifier and regressor on complete dataset)

In [63]:
#Classifier Model
#clf_RFE_class_num

#Regressor Model
#regr

#Scaler
#scaler

In [64]:
#num_list # for classifier

In [65]:
#numerical_features_selected # for regression

In [66]:
X = all_data
y_B = target['TARGET_B']
y_D = target['TARGET_D']

In [67]:
X_numerical = X.select_dtypes(np.number)
X_scaled = scaler.transform(X_numerical)
X_scaled = pd.DataFrame(X_scaled, columns=list(X_numerical.columns))

In [68]:
#X_scaled

In [69]:
X_class = X_scaled[num_list].copy()
#X_regr = X_scaled[numerical_features_selected].copy()

#### Predict if donation or not

In [70]:
pred_donation = clf_RFE_class_num.predict(X_class)

In [71]:
pred_donation

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

#### append prediction if donation yes or no to complete dataset

In [74]:
X_scaled_donated = X_scaled.copy()

In [75]:
X_scaled_donated['pred_TARGET_B'] = list(pred_donation)

In [76]:
X_scaled_donated

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,pred_TARGET_B
0,0.000000,0.608247,0.666667,1.000000,0.000000,0.000000,0.393939,0.343434,0.181818,0.140845,...,0.005,0.001401,0.010,0.003676,0.006387,0.498045,0.0,1.000000,0.622951,1
1,0.000014,0.463918,0.833333,1.000000,0.066390,0.000000,0.151515,0.555556,0.111111,0.084507,...,0.010,0.004004,0.025,0.016544,0.014323,0.774510,0.0,0.333333,0.000000,0
2,0.000014,0.624862,0.333333,0.111111,0.008299,0.000000,0.202020,0.292929,0.333333,0.084507,...,0.002,0.002202,0.005,0.011029,0.006126,0.078617,1.0,1.000000,0.967213,1
3,0.000000,0.711340,0.000000,0.444444,0.008299,0.000000,0.232323,0.141414,0.313131,0.042254,...,0.002,0.001201,0.010,0.008272,0.005456,0.899764,1.0,1.000000,0.655738,1
4,0.000000,0.793814,0.333333,0.222222,0.248963,0.010101,0.282828,0.090909,0.535354,0.366197,...,0.003,0.002002,0.015,0.012868,0.005509,0.037079,1.0,0.333333,0.409836,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.000014,0.624862,0.666667,1.000000,0.000000,0.141414,0.363636,0.474747,0.111111,0.098592,...,0.025,0.004004,0.025,0.008272,0.023669,0.962399,0.0,0.000000,0.180328,0
95408,0.000014,0.484536,1.000000,1.000000,0.004149,0.000000,0.313131,0.434343,0.191919,0.056338,...,0.020,0.003003,0.020,0.008272,0.018662,0.639828,1.0,0.000000,0.016393,0
95409,0.000014,0.608247,0.666667,1.000000,0.000000,0.000000,0.181818,0.464646,0.202020,0.098592,...,0.003,0.001001,0.010,0.002757,0.006932,0.988852,1.0,0.666667,0.540984,1
95410,0.000000,0.587629,1.000000,1.000000,0.000000,0.000000,0.282828,0.353535,0.202020,0.126761,...,0.005,0.003203,0.018,0.003676,0.010797,0.024466,1.0,1.000000,0.163934,1


#### Filter for people who are predicted to donate

In [77]:
people_pred_donate = X_scaled_donated[X_scaled_donated['pred_TARGET_B'] == 1]

In [79]:
#people_pred_donate

#### Predict how much these people will donate

In [80]:
X_regr = people_pred_donate[numerical_features_selected].copy()

In [82]:
pred_amount = regr.predict(X_regr)

In [83]:
pred_amount

array([11.83678773,  8.0990692 ,  8.96134234, ...,  3.72559147,
       10.4475895 , 18.08504928])

#### append predicted amount to dataset of people who are predicted to donate

In [84]:
people_pred_donate['pred_amount'] = list(pred_amount)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  people_pred_donate['pred_amount'] = list(pred_amount)


In [85]:
people_pred_donate

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,pred_TARGET_B,pred_amount
0,0.000000,0.608247,0.666667,1.000000,0.000000,0.000000,0.393939,0.343434,0.181818,0.140845,...,0.001401,0.010,0.003676,0.006387,0.498045,0.0,1.000000,0.622951,1,11.836788
2,0.000014,0.624862,0.333333,0.111111,0.008299,0.000000,0.202020,0.292929,0.333333,0.084507,...,0.002202,0.005,0.011029,0.006126,0.078617,1.0,1.000000,0.967213,1,8.099069
3,0.000000,0.711340,0.000000,0.444444,0.008299,0.000000,0.232323,0.141414,0.313131,0.042254,...,0.001201,0.010,0.008272,0.005456,0.899764,1.0,1.000000,0.655738,1,8.961342
4,0.000000,0.793814,0.333333,0.222222,0.248963,0.010101,0.282828,0.090909,0.535354,0.366197,...,0.002002,0.015,0.012868,0.005509,0.037079,1.0,0.333333,0.409836,1,17.028023
7,0.000000,0.624862,0.166667,1.000000,0.004149,0.000000,0.343434,0.363636,0.121212,0.042254,...,0.001201,0.011,0.005515,0.004843,0.570232,1.0,0.666667,0.606557,1,9.621518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95402,0.000000,0.624862,0.666667,1.000000,0.000000,0.000000,0.202020,0.212121,0.636364,0.098592,...,0.001201,0.011,0.001838,0.008398,0.249997,0.0,0.666667,0.852459,1,11.485537
95404,0.000028,0.536082,0.000000,1.000000,0.000000,0.000000,0.242424,0.313131,0.282828,0.126761,...,0.001201,0.008,0.002757,0.004543,0.303356,0.0,1.000000,0.852459,1,7.775549
95406,0.000000,0.505155,0.833333,0.666667,0.041494,0.020202,0.444444,0.262626,0.464646,0.352113,...,0.000801,0.002,0.002757,0.002014,0.182956,0.0,1.000000,0.688525,1,3.725591
95409,0.000014,0.608247,0.666667,1.000000,0.000000,0.000000,0.181818,0.464646,0.202020,0.098592,...,0.001001,0.010,0.002757,0.006932,0.988852,1.0,0.666667,0.540984,1,10.447590


#### Calculate sum of predicted donations

In [86]:
sum_of_donations = people_pred_donate['pred_amount'].sum()

In [87]:
sum_of_donations

476011.93829966313

#### Calculate costs for sending just these people cards who are predicted to donate

In [88]:
costs_mails = 0.86 * len(people_pred_donate)

In [89]:
costs_mails

31207.68

In [90]:
diff_costs_donations = sum_of_donations - costs_mails

In [91]:
diff_costs_donations

444804.25829966314

#### Actual number of people who had a 1 as value in TARGET_B column

In [97]:
len(target[target['TARGET_B'] == 1])

4843

#### Actual amount of donations (values in TARGET_D)

In [98]:
target['TARGET_D'].sum()

75668.7

#### Conclusion
The classifier model predicted that from the original 95412 people, 36288 will likely gonna make a prediction (default probability of 50%). If the company just sends cards to these 36288 people, the amount of donations after substracting the costs for mail will be 444 804.25$ (amount of donation per person was predicted by the regression model). 

The interpretation above was made if one assumes that both models are acurate. But if one checks how many people actually donated compared to the number of people predicted who will donate (4843 persons vs 36288 persons), it is clear that the model greatly overestimates the number of people who will probably donate. 