In [1]:
# Data manipulation & vizualisation tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [10]:
cat = pd.read_csv('data/categorical.csv')
num = pd.read_csv('data/numerical.csv')
target = pd.read_csv('data/target.csv')

In [33]:
data = pd.concat([cat, num, target], axis=1)

In [34]:
data.select_dtypes(object).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   STATE     95412 non-null  object
 1   HOMEOWNR  95412 non-null  object
 2   GENDER    95412 non-null  object
 3   RFA_2R    95412 non-null  object
 4   RFA_2A    95412 non-null  object
 5   GEOCODE2  95412 non-null  object
 6   DOMAIN_A  95412 non-null  object
dtypes: object(7)
memory usage: 5.1+ MB


In [35]:
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

# Imbalance management - Downsampling

In [36]:
category_0 = data[data['TARGET_B'] == 0].sample(len(data[data['TARGET_B'] == 1]))

In [37]:
category_1 = data[data['TARGET_B'] == 1]

In [40]:
data = pd.concat([category_0, category_1], axis=0).reset_index(drop=True)

In [42]:
data['TARGET_B'].value_counts()

0    4843
1    4843
Name: TARGET_B, dtype: int64

In [43]:
# set dependent variable
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis=1)

In [45]:
# split
X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)

In [46]:
X_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9686 entries, 0 to 9685
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   STATE     9686 non-null   object
 1   HOMEOWNR  9686 non-null   object
 2   GENDER    9686 non-null   object
 3   RFA_2R    9686 non-null   object
 4   RFA_2A    9686 non-null   object
 5   GEOCODE2  9686 non-null   object
 6   DOMAIN_A  9686 non-null   object
dtypes: object(7)
memory usage: 529.8+ KB


In [47]:
from sklearn.preprocessing import OneHotEncoder

In [66]:
encoder = OneHotEncoder(drop='first').fit(X_cat)
X_cat_encoded = encoder.transform(X_cat).toarray()

In [65]:
X_cat_en_df.head() # lost categorical feature names

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
X_cat_encoded_cols = encoder.get_feature_names_out(X_cat.columns) # recovering names

In [68]:
X_cat_en_df = pd.DataFrame(X_cat_encoded, columns=X_cat_encoded_cols)

In [70]:
X_full = pd.concat([X_num, X_cat_en_df], axis=1)

In [71]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_full, y, test_size=0.3, random_state=42)

In [72]:
from sklearn.ensemble import RandomForestClassifier

In [73]:
clf = RandomForestClassifier(random_state=42, max_depth=2, n_estimators=200)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.9817618719889883


In [74]:
X_full.shape

(9686, 355)

# With ensemble you should use more than one scoring metric

In [75]:
from sklearn.model_selection import cross_val_score

In [76]:
cross_val_scores = cross_val_score(clf, X_train, y_train)

In [77]:
cross_val_scores
# The closer this values are together, the more accurate and stable is

array([0.98451327, 0.9719764 , 0.97345133, 0.97492625, 0.98893805])

## Feature selection / scoring with RF

In [79]:
feature_importances = clf.feature_importances_

In [80]:
feature_names = X_full.columns

In [81]:
forest_importances = pd.Series(feature_importances, index=feature_names)

In [86]:
forest_importances.sort_values(ascending=False).head(20)

TARGET_D        0.164242
RFA_2F          0.074248
AVGGIFT         0.060382
MAXRAMNT        0.060336
CARDGIFT        0.046050
LASTGIFT        0.035944
MINRAMNT        0.033707
LASTDATE_YR     0.032809
NGIFTALL        0.030828
RFA_2A_E        0.029035
CONTROLN        0.018004
MAXRDATE_YR     0.017184
RFA_2A_G        0.015751
LASTDATE_MM     0.015032
ODATEW_YR       0.014653
RAMNTALL        0.013997
CARDPROM        0.013918
HVP6            0.010990
CARDPM12        0.010465
FIRSTDATE_YR    0.010020
dtype: float64