In [173]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

### Function Definitions

In [174]:
def transform(X,y,random_state=0,verbose=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    
    X_train_num = X_train.select_dtypes(np.number)
    X_test_num = X_test.select_dtypes(np.number)

    X_train_cat = X_train.select_dtypes(object)
    X_test_cat = X_test.select_dtypes(object)

    transformer=MinMaxScaler().fit(X_train_num)
    
    X_train_num_scaled = transformer.transform(X_train_num)
    X_train_num_scaled = pd.DataFrame(X_train_num_scaled,columns=X_train_num.columns)

    X_test_num_scaled = transformer.transform(X_test_num)
    X_test_num_scaled = pd.DataFrame(X_test_num_scaled,columns=X_test_num.columns)

    encoder = OneHotEncoder().fit(X_train_cat)
    
    X_train_cat_encoded = encoder.transform(X_train_cat).toarray()
    cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)
    X_train_cat_encoded = pd.DataFrame(X_train_cat_encoded, columns=cols)

    X_test_cat_encoded = encoder.transform(X_test_cat).toarray()
    cols = encoder.get_feature_names_out(input_features=X_test_cat.columns)
    X_test_cat_encoded = pd.DataFrame(X_test_cat_encoded, columns=cols)

    X_train = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis=1)
    X_test = pd.concat([X_test_num_scaled, X_test_cat_encoded], axis=1)
    
    if verbose:
        print("X_train_num_scaled:")
        display(X_train_num_scaled.head())
        print("X_train_num_scaled:")
        display(X_test_num_scaled.head())
        print("X_train_cat_encoded:")
        display(X_train_cat_encoded.head())
        print("X_test_cat_encoded:")
        display(X_test_cat_encoded.head())
        print("X_train:")
        display(X_train.head())
        print("X_test:")
        display(X_test.head())
        
    return X_train, X_test, y_train, y_test, transformer, encoder

In [175]:
def model_pipeline(X_train, X_test, y_train, y_test, model_dict):
    
    cross_val_scores = {}
    accuracy_scores = {}
    
    for model_name in model_dict.keys():
        mean_score = np.mean(cross_val_score(model_dict[model_name], X_train, y_train, cv=10))
        cross_val_scores[model_name] = round(mean_score,2)
    print("Cross Validation Scores:")
    [print(model_name," : ",cross_val_scores[model_name]) for model_name in cross_val_scores.keys()]

    for model_name in model_dict.keys():
        model_dict[model_name].fit(X_train, y_train)
        accuracy_scores[model_name] = round(model_dict[model_name].score(X_test,y_test),2)
    print("Accuracy Scores:")
    [print(model_name," : ",accuracy_scores[model_name]) for model_name in accuracy_scores.keys()]

In [176]:
def disp_score_reg(reg,X_train, X_test, y_train, y_test):
    predictions = reg.predict(X_train)
    print("Training Score:")
    print('R2 Score:', round(r2_score(y_train,predictions),2))
    print('Mean Squared Error:', round(mean_squared_error(y_train,predictions),2))
    print('Mean aboslute Error:', round(mean_absolute_error(y_train,predictions),2))
    print('Root Mean Squared Error:',round(sqrt(mean_squared_error(y_train,predictions)),2))
    print()
    print("Test Score:")
    predictions = reg.predict(X_test)
    print('R2 Score:', round(r2_score(y_test,predictions),2))
    print('Mean Squared Error:', round(mean_squared_error(y_test,predictions),2))
    print('Mean aboslute Error:', round(mean_absolute_error(y_test,predictions),2))
    print('Root Mean Squared Error:',round(sqrt(mean_squared_error(y_test,predictions)),2))

Load Data

In [177]:
numerical=pd.read_csv("files_for_lab/numerical.csv")
categorical=pd.read_csv("files_for_lab/categorical.csv")
target=pd.read_csv("files_for_lab/target.csv")
display(numerical.head())
display(categorical.head())
display(target.head())

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [178]:
df_donors=pd.concat([numerical,categorical,target],axis=1)
df_donors.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0


Filter only those rows where donation was made

In [179]:
df_donors = df_donors[df_donors['TARGET_B'] == 1]

In [180]:
y=df_donors['TARGET_D']
X=df_donors.drop(['TARGET_B','TARGET_D'], axis=1)
display(X.head())
y.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
20,2,62.0,3,8,10,2,25,40,27,11,...,36,1,88,1,94,4,96,3,87,1
30,0,61.611649,5,9,0,1,37,58,16,8,...,0,2,90,4,93,1,95,12,90,4
45,0,66.0,5,9,5,0,33,24,39,6,...,31,10,93,12,94,4,96,2,87,4
78,0,69.0,6,9,0,0,34,20,54,2,...,28,7,90,1,95,3,95,11,90,1
93,1,73.0,1,7,10,0,21,53,8,5,...,24,10,92,9,95,9,95,9,92,9


20     4.0
30     7.0
45     5.0
78    13.0
93    10.0
Name: TARGET_D, dtype: float64

Train-Test Split and scale / transform the data

In [181]:
X_train, X_test, y_train, y_test, transformer, encoder = transform(X,y,229,True)

X_train_num_scaled:


Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,5.1e-05,0.61289,0.666667,1.0,0.0,0.0,0.341772,0.343434,0.323232,0.208333,...,0.0,0.090909,0.181818,0.818182,0.785714,0.545455,0.5,0.090909,0.571429,0.272727
1,5.1e-05,0.744681,0.666667,0.888889,0.004167,0.0,0.35443,0.353535,0.262626,0.145833,...,0.258065,0.0,0.545455,0.454545,0.928571,0.0,0.5,0.0,0.809524,0.0
2,0.0,0.787234,0.666667,1.0,0.0,0.0,0.481013,0.0,0.515152,0.0625,...,0.215054,0.0,0.818182,0.636364,0.857143,0.0,0.5,0.0,0.619048,0.090909
3,0.0,0.61289,0.666667,1.0,0.0,0.0,0.278481,0.242424,0.323232,0.083333,...,0.0,0.090909,0.545455,0.363636,0.571429,0.909091,0.0,0.909091,0.666667,0.818182
4,5.1e-05,0.61289,0.5,0.666667,0.0,0.020833,0.481013,0.515152,0.151515,0.125,...,0.0,0.090909,0.727273,0.363636,0.857143,0.454545,1.0,0.090909,0.52381,0.909091


X_train_num_scaled:


Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,5.1e-05,0.61289,0.333333,0.444444,0.091667,0.010417,0.139241,0.282828,0.323232,0.208333,...,0.0,0.090909,0.545455,0.272727,0.714286,0.636364,0.5,0.727273,0.809524,0.0
1,0.0,0.606383,0.666667,0.666667,0.029167,0.0,0.379747,0.363636,0.242424,0.145833,...,0.387097,0.909091,0.545455,0.636364,0.357143,0.181818,0.0,1.0,0.571429,0.272727
2,2.6e-05,0.404255,0.5,1.0,0.0,0.0,0.253165,0.292929,0.606061,0.104167,...,0.602151,0.0,0.818182,0.090909,0.857143,1.0,0.0,1.0,0.952381,0.090909
3,5.1e-05,0.61289,0.666667,1.0,0.0,0.208333,0.417722,0.393939,0.242424,0.104167,...,0.0,0.090909,0.363636,0.909091,0.642857,0.272727,0.5,0.0,0.571429,0.454545
4,0.000718,0.340426,0.833333,0.888889,0.0,0.0,0.329114,0.30303,0.323232,0.25,...,0.666667,0.0,0.818182,0.909091,0.857143,0.727273,0.5,0.090909,0.952381,0.727273


X_train_cat_encoded:


Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


X_test_cat_encoded:


Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


X_train:


Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,5.1e-05,0.61289,0.666667,1.0,0.0,0.0,0.341772,0.343434,0.323232,0.208333,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,5.1e-05,0.744681,0.666667,0.888889,0.004167,0.0,0.35443,0.353535,0.262626,0.145833,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.787234,0.666667,1.0,0.0,0.0,0.481013,0.0,0.515152,0.0625,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.61289,0.666667,1.0,0.0,0.0,0.278481,0.242424,0.323232,0.083333,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,5.1e-05,0.61289,0.5,0.666667,0.0,0.020833,0.481013,0.515152,0.151515,0.125,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


X_test:


Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,5.1e-05,0.61289,0.333333,0.444444,0.091667,0.010417,0.139241,0.282828,0.323232,0.208333,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.606383,0.666667,0.666667,0.029167,0.0,0.379747,0.363636,0.242424,0.145833,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.6e-05,0.404255,0.5,1.0,0.0,0.0,0.253165,0.292929,0.606061,0.104167,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,5.1e-05,0.61289,0.666667,1.0,0.0,0.208333,0.417722,0.393939,0.242424,0.104167,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.000718,0.340426,0.833333,0.888889,0.0,0.0,0.329114,0.30303,0.323232,0.25,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


Model Comparison



In [182]:
model_dict={
    'Regression Tree': DecisionTreeRegressor(),
    'Linear Regression': LinearRegression(),
    'Random Forest':RandomForestRegressor()
}

In [183]:
model_pipeline(X_train, X_test, y_train, y_test,model_dict)

Cross Validation Scores:
Regression Tree  :  -0.73
Linear Regression  :  -0.0
Random Forest  :  0.41
Accuracy Scores:
Regression Tree  :  -0.31
Linear Regression  :  0.51
Random Forest  :  0.57


In [184]:
len(y_test)

1211

Random Forest Regression Model (Since the Random Forest has a better accuracy score on the test data, we'll chose this model for prediction.

In [185]:
rfr =RandomForestRegressor().fit(X_train, y_train)

Average Donation Amount

In [186]:
disp_score_reg(rfr,X_train, X_test, y_train, y_test)

Training Score:
R2 Score: 0.92
Mean Squared Error: 12.28
Mean aboslute Error: 1.69
Root Mean Squared Error: 3.5

Test Score:
R2 Score: 0.56
Mean Squared Error: 64.66
Mean aboslute Error: 4.25
Root Mean Squared Error: 8.04


In [187]:
round(np.mean(predictions),2)

15.98

As it came down to be complicated for me to run the dataset for classifier developed in the previous lab in this notebook (due to transformer/encoder inconsistency), I wasn't able to run the classfier here, which eventually prevented me to get the classification stats here to be able to calculate the overall donation cost (with inclusion of total postal cost, donations received) using TP, FP, TN, FN. However, I have already did a similar calculation in form of an explanatory example in the previous lab towards the end section.