In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report



# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the three input tables (numerical features, categorical features and
# the target columns) from CSV files in the working directory.
numericals, categoricals, target = (
    pd.read_csv(csv_name)
    for csv_name in ('numerical.csv', 'categorical.csv', 'target.csv')
)

In [3]:
# Preview the first rows of the numerical feature table.
numericals.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,7,2.0,11.0,10.0,9,6.8125,172556,1,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26


In [4]:
# Preview the first rows of the categorical feature table.
categoricals.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,...,37,12,92,8,94,2,95,12,89,11
1,CA,14,H,M,3,L,G,A,S,1,...,52,2,93,10,95,12,95,12,93,10
2,NC,43,U,M,3,L,E,C,R,2,...,0,2,91,11,92,7,95,12,90,1
3,CA,44,U,F,3,L,E,C,R,2,...,28,1,87,11,94,11,95,12,87,2
4,FL,16,H,F,3,L,F,A,S,2,...,20,1,93,10,96,1,96,1,79,3


In [5]:
# Preview the first rows of the target table (TARGET_B and TARGET_D).
target.head()

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [6]:
# Place the target, numerical and categorical tables side by side;
# rows are matched on the index of each table.
data = pd.concat(
    objs=[target, numericals, categoricals],
    axis='columns',
)

In [7]:
# Preview the first rows of the combined table.
data.head()

Unnamed: 0,TARGET_B,TARGET_D,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0,0.0,0,60.0,5,9,0,0,39,34,...,37,12,92,8,94,2,95,12,89,11
1,0,0.0,1,46.0,6,9,16,0,15,55,...,52,2,93,10,95,12,95,12,93,10
2,0,0.0,1,61.611649,3,1,2,0,20,29,...,0,2,91,11,92,7,95,12,90,1
3,0,0.0,0,70.0,1,4,2,0,23,14,...,28,1,87,11,94,11,95,12,87,2
4,0,0.0,0,78.0,3,2,60,1,28,9,...,20,1,93,10,96,1,96,1,79,3


In [8]:
# Percentage of missing values in each column of the combined table,
# listed from the most to the least incomplete (capped at 315 rows).
nulls = (data.isna().sum() * 100 / len(data)).to_frame(name='percentage')
nulls.sort_values(by='percentage', ascending=False).head(315)

Unnamed: 0,percentage
TARGET_B,0.0
OEDC2,0.0
EC3,0.0
EC2,0.0
EC1,0.0
...,...
HU5,0.0
HU4,0.0
HU3,0.0
HU2,0.0


In [14]:
# Separate the label from the features: TARGET_B is the label to predict,
# and both target columns are removed from the feature table.
y = data['TARGET_B']
X = data.drop(columns=['TARGET_B', 'TARGET_D'])

In [15]:
# One-hot encode the feature table. With default arguments pandas only expands
# object/string/category columns, so fields stored as numbers (CLUSTER and
# DOB_YR appear un-encoded among the selected features further down) are
# passed through unchanged.
X = pd.get_dummies(X)
X

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_G,GEOCODE2_A,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,0,60.000000,5,9,0,0,39,34,18,10,...,0,0,0,1,0,0,0,0,1,0
1,1,46.000000,6,9,16,0,15,55,11,6,...,1,1,0,0,0,0,0,1,0,0
2,1,61.611649,3,1,2,0,20,29,33,6,...,0,0,0,1,0,0,1,0,0,0
3,0,70.000000,1,4,2,0,23,14,31,3,...,0,0,0,1,0,0,1,0,0,0
4,0,78.000000,3,2,60,1,28,9,53,26,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,1,0,0,1,0,1,0,0,0,0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,0,1,0,0,0,1,0,0,0,0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,0,0,1,0,0,1,0,0,0,0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,0,1,0,0,0,1,0,0,0,0


In [16]:
# Display the label series (TARGET_B).
y

0        0
1        0
2        0
3        0
4        0
        ..
95407    0
95408    0
95409    0
95410    1
95411    0
Name: TARGET_B, Length: 95412, dtype: int64

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn import linear_model

In [21]:
from sklearn.feature_selection import RFE
from sklearn import linear_model

# Hold out 30% of the rows; feature selection below is fitted on the
# training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Recursive feature elimination: refit the forest and discard 30 features per
# iteration until 50 remain. The forest is seeded so that the selected feature
# set is identical on every run (an unseeded forest yields a different
# selection each time the cell is executed).
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=50, step=30, verbose=0)
rfe.fit(X_train, y_train)

# Pair each rank with its column name, taken from the frame that was actually
# fitted; rank 1 marks the features RFE retained.
df = pd.DataFrame(data=rfe.ranking_, columns=['Rank'])
df['Column_name'] = X_train.columns
cols_to_keep = df.loc[df['Rank'] == 1, 'Column_name']

In [22]:
# Display the names of the columns that received rank 1 from RFE.
cols_to_keep

1           AGE
6       MALEVET
7      VIETVETS
8      WWIIVETS
13       POP901
14       POP902
15       POP903
44        CHIL1
72         HHP1
73         HHP2
83          HV1
84          HV2
105       ETHC2
117        HUR2
131         RP3
134         ADI
135         DMA
136         IC1
137         IC2
138         IC3
139         IC4
140         IC5
153        IC18
161       HHAS3
166        TPE1
176        PEC2
178       TPE11
180       TPE13
184        LFC4
185        LFC5
186        LFC6
187        LFC7
207        EIC4
224       OEDC5
230         EC4
248         VC3
266       POBC2
273        VOC3
286        HC13
299    CARDPROM
300     NUMPROM
302    NUMPRM12
303    RAMNTALL
304    NGIFTALL
308    LASTGIFT
309     TIMELAG
310     AVGGIFT
311    CONTROLN
315     CLUSTER
320      DOB_YR
Name: Column_name, dtype: object

0        TARGET_B
20        POP90C4
21        POP90C5
22           ETH1
45          CHIL1
46          CHIL2
47          CHIL3
55         CHILC1
56         CHILC2
57         CHILC3
58         CHILC4
59         CHILC5
69          MARR1
70          MARR2
71          MARR3
72          MARR4
77            DW3
78            DW4
79            DW5
81            DW7
82            DW8
83            DW9
97           HHD5
98           HHD6
105         ETHC1
106         ETHC2
107         ETHC3
169          TPE3
170          TPE4
171          TPE5
172          TPE6
289          HC15
314        RFA_2F
320     ODATEW_MM
331      STATE_CA
333      STATE_GA
335      STATE_IN
341      STATE_WI
343    HOMEOWNR_H
344    HOMEOWNR_U
349      RFA_2A_D
350      RFA_2A_E
351      RFA_2A_F
352      RFA_2A_G
356    GEOCODE2_D
357    DOMAIN_A_C
358    DOMAIN_A_R
359    DOMAIN_A_S
360    DOMAIN_A_T
361    DOMAIN_A_U
Name: Column_name, dtype: object

1           AGE
6       MALEVET
7      VIETVETS
8      WWIIVETS
13       POP901
14       POP902
15       POP903
44        CHIL1
72         HHP1
73         HHP2
83          HV1
84          HV2
105       ETHC2
117        HUR2
131         RP3
134         ADI
135         DMA
136         IC1
137         IC2
138         IC3
139         IC4
140         IC5
153        IC18
161       HHAS3
166        TPE1
176        PEC2
178       TPE11
180       TPE13
184        LFC4
185        LFC5
186        LFC6
187        LFC7
207        EIC4
224       OEDC5
230         EC4
248         VC3
266       POBC2
273        VOC3
286        HC13
299    CARDPROM
300     NUMPROM
302    NUMPRM12
303    RAMNTALL
304    NGIFTALL
308    LASTGIFT
309     TIMELAG
310     AVGGIFT
311    CONTROLN
315     CLUSTER
320      DOB_YR
Name: Column_name, dtype: object

In [23]:
# Split again, this time keeping only the columns selected by RFE, then run
# 10-fold cross-validation of a seeded random forest on the training part.
# cross_val_score uses the estimator's default score, which is accuracy for a
# classifier.
X_train, X_test, y_train, y_test = train_test_split(
    X[cols_to_keep],
    y,
    test_size=0.3,
    random_state=42,
)

model = RandomForestClassifier(random_state=42)
cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)

array([0.94879473, 0.94879473, 0.94864501, 0.94879473, 0.94879473,
       0.94864501, 0.94879473, 0.94879473, 0.94878706, 0.94848757])

In [24]:
# Train on the whole training split and report accuracy on the held-out rows.
# fit() returns the fitted estimator, so the score can be chained onto it.
model.fit(X_train, y_train).score(X_test, y_test)

0.9500768585802124

In [25]:
# Predicted labels for the held-out rows, used by the report below.
y_pred = model.predict(X_test)

In [26]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     27200
           1       0.00      0.00      0.00      1424

    accuracy                           0.95     28624
   macro avg       0.48      0.50      0.49     28624
weighted avg       0.90      0.95      0.93     28624



# SMOTE

In [27]:
# Rebuild the train/test split on the selected columns before oversampling.
# The arguments (30% test share, seed 42) are the same as in the previous split.
X_train, X_test, y_train, y_test = train_test_split(X[cols_to_keep], y, test_size=0.3, random_state=42)

In [28]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left as it is.
# The sampler is seeded so the synthetic rows are reproducible between runs.
smote = SMOTE(random_state=42)

# X_train is passed as a DataFrame (not converted to a bare array) so the
# resampled features keep their column names, matching the named columns
# that X_test carries when the fitted model is scored.
X_sm, y_sm = smote.fit_resample(X_train, y_train)

# Class counts after resampling.
y_sm.value_counts()

0    63369
1    63369
Name: TARGET_B, dtype: int64

In [None]:
# Fit a fresh seeded forest on the resampled training data and report its
# accuracy on the test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_sm, y_sm).score(X_test, y_test)

In [None]:
# Predicted test labels from the model trained on the resampled data.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
print(classification_report(y_test, y_pred))