# Problem 2:
Use Random forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

## 1. Import libraries

In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,  classification_report

from imblearn.over_sampling import SMOTE

from sklearn.feature_selection import SelectFromModel

In [2]:
%matplotlib notebook

## 2. Load data

In [3]:
fraud_df = pd.read_csv('Fraud_check.csv')

In [4]:
fraud_df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


## 3. EDA

### 3.1 Data understanding

In [5]:
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [6]:
fraud_df['Undergrad'].unique()

array(['NO', 'YES'], dtype=object)

In [7]:
fraud_df['Marital.Status'].unique()

array(['Single', 'Divorced', 'Married'], dtype=object)

In [8]:
fraud_df['Work.Experience'].unique()

array([10, 18, 30, 15, 28,  0,  8,  3, 12,  4, 19,  6, 14, 16, 13, 29, 25,
       26,  7, 27,  5, 21, 23,  1, 22,  2, 11,  9, 24, 17, 20],
      dtype=int64)

In [9]:
fraud_df['Urban'].unique()

array(['YES', 'NO'], dtype=object)

#### 3.1.1 Constructing new column for classification of people who are likely to commit fraud.

In [10]:
fraud_df['Fraud.Risk'] = fraud_df['Taxable.Income'].apply(lambda x:'Risky' if(x<=30000) else 'Good')

In [11]:
fraud_df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Fraud.Risk
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good


In [12]:
fraud_df['Fraud.Risk'].unique()

array(['Good', 'Risky'], dtype=object)

In [13]:
fraud_df['Fraud.Risk'][fraud_df['Fraud.Risk']=='Risky'].count()

124

### 3.2 separating data into features and target

In [14]:
# Extracting column names and sorting them to appropriate categories.
def column_segregator(df, y_name=None):
    """ Returns  three lists of column headers for feature columns, 
    numeric columns and categorical columns
    Input
    ------
    df: Dataframe
    y_name: default None. Name(str) of target column if available
    
    Output
    ------
    features, numeric_cols, cat_cols"""   
    
    cols = df.columns # List of all columns in the input dataframe.
    numeric_cols = [col for col in cols if (df[col].dtypes != 'object') and col != y_name]
    cat_cols = [col for col in cols if (df[col].dtypes == 'object') and col != y_name]
    features = [col for col in cols if col != y_name]
    
    return features, numeric_cols, cat_cols 

In [15]:
def Xy_split(df, y_name=None, y_col=True):
    """Splits the input dataframe into features and target
    input
    -----
    df: Input dataframe
    y_name: default None. Name(str) of target column if available
    y_col: 'True' if y column is present in input dataframe, else 'False'.
    
    output
    ------
    X (features) , y (target) if y colum is present else only X"""
    
    target = y_name
    feature_col,_,_ = column_segregator(df, target)
    if y_col == True:
        # separating features and target.
        X = df.loc[:, feature_col]
        y = df.loc[:, target]
        return X,y
    else:
        X = df.loc[:, feature_col]
        return X

In [16]:
# Column segregation.
features, numeric_cols, cat_cols = column_segregator(fraud_df, y_name='Fraud.Risk')

In [17]:
# Splitting features and target.
X, y = Xy_split(fraud_df, y_name='Fraud.Risk', y_col=True)

In [18]:
X.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [19]:
y.head()

0    Good
1    Good
2    Good
3    Good
4    Good
Name: Fraud.Risk, dtype: object

### 3.3 Summary statistics:
a) Numeric features

In [20]:
X[numeric_cols].describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


b) Categorical features

In [21]:
X[cat_cols].describe()

Unnamed: 0,Undergrad,Marital.Status,Urban
count,600,600,600
unique,2,3,2
top,YES,Single,YES
freq,312,217,302


c) Target

In [22]:
y.describe()

count      600
unique       2
top       Good
freq       476
Name: Fraud.Risk, dtype: object

### 3.4 Visualizations
### 3.4.1 Feature distribution - numeric features:

In [23]:
fig, axes = plt.subplots(1,3, figsize=(10,4))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    sns.histplot(data=X, x=X[numeric_cols[idx]], ax=ax) 

fig.suptitle('Feature Distribution - numeric features', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### 3.4.2 Feature distribution - categorical features

In [24]:
fig, axes = plt.subplots(1,3, figsize=(10,4))
axes = axes.flatten()

for idx, ax in enumerate(axes):
    sns.countplot(data=X, x=X[cat_cols[idx]], ax=ax) 

fig.suptitle('Feature Distribution - categorical features', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### 3.4.3 Target distribution

In [25]:
fig, ax = plt.subplots(figsize=(5,4))
sns.countplot(data=y, x=y, ax=ax)
plt.title('Target distribution', ha='center', fontweight='bold')
plt.show()

<IPython.core.display.Javascript object>

## 3.5 Observations:
- 600 records, 6 features. Need to construct target column from 'Taxable.Income'.
    - If 'Taxable.Income' <= 30,000 - > classify as 'Risky'
    - If 'Taxable.Income' >  30,000 - > classify as 'Good'
- mix of categorical and continuous data.
- No null values.
- All columns have correct datatypes, but we need to encode the categorical data.
- Target distribution is imbalanced. Less 'Risky' class compared to good class.
- Both Numeric and categorical features have a somewhat uniform distribution.

## 4. Model building

### 4.1 Constructing a new dataset based on the sale category defined above.

In [26]:
fraud_df1 = fraud_df.copy()

In [27]:
fraud_df1['Fraud.Risk'] = fraud_df1['Fraud.Risk'].apply(lambda x:0 if(x=='Good') else 1)

In [28]:
fraud_df1.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Fraud.Risk
0,NO,Single,68833,50047,10,YES,0
1,YES,Divorced,33700,134075,18,YES,0
2,NO,Married,36925,160205,30,YES,0
3,YES,Single,50190,193264,15,YES,0
4,NO,Married,81002,27533,28,NO,0


#### 4.1.1 Probability of picking a member from each class  (for reference)

In [29]:
baseline_prob = dict.fromkeys(fraud_df['Fraud.Risk'].unique())
num_risk = fraud_df1['Fraud.Risk'][fraud_df1['Fraud.Risk']==1].count()
num_good = fraud_df1['Fraud.Risk'][fraud_df1['Fraud.Risk']==0].count()
num_records = fraud_df1['Fraud.Risk'].count()

In [30]:
baseline_prob['Good'] = [num_good, round(num_good/num_records, 2)]
baseline_prob['Risky'] = [num_risk, round(num_risk/num_records, 2)]

In [31]:
print('Table 4.1: Probability of finding members from each class')
pd.DataFrame(baseline_prob.values(), index=baseline_prob.keys(), columns=['members', 'probability'])

Table 4.1: Probability of finding members from each class


Unnamed: 0,members,probability
Good,476,0.79
Risky,124,0.21


In [32]:
features, numeric_cols, cat_cols= column_segregator(fraud_df1, y_name='Fraud.Risk')

In [33]:
X_1, y_1 = Xy_split(fraud_df1, y_name='Fraud.Risk', y_col=True)

### 4.2 Constructing a baseline model with cross validation

In [34]:
# Processing categorical variables
cat_transformer = Pipeline(steps=[
     ('enc', OrdinalEncoder())
])

In [35]:
preprocessor = ColumnTransformer(transformers=[
     ('cat_trf', cat_transformer, cat_cols),    
 ], remainder='passthrough')# remainder='passthrough' passes the rest of the dataframe unchanged.

In [36]:
rf_classifier = RandomForestClassifier(random_state=42)

In [37]:
# Decision tree classifier.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

In [38]:
# K fold cross validation(5 folds).
crv_scores_acc = cross_val_score(estimator=clf, X=X_1, y=y_1, cv=5, scoring='accuracy')
crv_scores_prec = cross_val_score(estimator=clf, X=X_1, y=y_1, cv=5, scoring='precision')
crv_scores_rec = cross_val_score(estimator=clf, X=X_1, y=y_1, cv=5, scoring='recall')

In [39]:
# Summarizing scores:
print("mean accuracy  :{:.4f} ".format(crv_scores_acc.mean()))
print("mean precision :{:.4f} ".format(crv_scores_prec.mean()))
print("mean recall    :{:.4f} ".format(crv_scores_rec.mean()))

mean accuracy  :0.9983 
mean precision :0.9923 
mean recall    :1.0000 


**Note:** 
- Although, the accuracy, precision and recall scores are very high, it may be that the model is overfitting since there are more 0 class compared to 1. This can be seen from the table 4.1 as well, where there are only 21% who belong to the 1 class.
- We need to oversample the minority and check the results again


### 4.3 simple train test split model for checking the classification

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_1,y_1, test_size=0.2, random_state=42, stratify=y_1)

In [41]:
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat_trf',
                                                  Pipeline(steps=[('enc',
                                                                   OrdinalEncoder())]),
                                                  ['Undergrad',
                                                   'Marital.Status',
                                                   'Urban'])])),
                ('classifier', RandomForestClassifier(random_state=42))])

#### 4.3.1 Model testing

In [42]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [43]:
def display_results(y_test, y_pred):
    """Displays model evaluation/performance report that includes
    accuracy_score, confusion_matrix, precision_score, and 
    recall_score.
    input
    -----
    y_test, y_pred
    
    output
    ------
    Model evaluation/performance report"""
    
    print(classification_report(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

#### 4.3.2 Model evaluation

In [44]:
# On train data:
display_results(y_train, y_pred_train)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       381
           1       1.00      1.00      1.00        99

    accuracy                           1.00       480
   macro avg       1.00      1.00      1.00       480
weighted avg       1.00      1.00      1.00       480

Confusion matrix:
 [[381   0]
 [  0  99]]


In [45]:
# On test data:
display_results(y_test, y_pred_test)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        95
           1       0.93      1.00      0.96        25

    accuracy                           0.98       120
   macro avg       0.96      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120

Confusion matrix:
 [[93  2]
 [ 0 25]]


### 4.4 Oversampling using  SMOTE and verifying the predictions

In [46]:
fraud_df2 = fraud_df1.copy()

In [47]:
X_2, y_2 = Xy_split(fraud_df2, y_name='Fraud.Risk', y_col=True)

In [48]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_2, y_2, test_size=0.2, stratify=y_2, random_state=42)

In [49]:
X_train2.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
517,NO,Single,19272,195078,26,NO
273,YES,Divorced,68404,58942,20,NO
170,NO,Divorced,98588,143402,10,NO
501,NO,Divorced,56628,55842,4,YES
295,YES,Married,52912,40089,10,NO


In [50]:
X_train2 = pd.DataFrame(preprocessor.fit_transform(X_train2), index=X_train2.index, columns=cat_cols + numeric_cols)

In [51]:
X_train2.head()

Unnamed: 0,Undergrad,Marital.Status,Urban,Taxable.Income,City.Population,Work.Experience
517,0.0,2.0,0.0,19272.0,195078.0,26.0
273,1.0,0.0,0.0,68404.0,58942.0,20.0
170,0.0,0.0,0.0,98588.0,143402.0,10.0
501,0.0,0.0,1.0,56628.0,55842.0,4.0
295,1.0,1.0,0.0,52912.0,40089.0,10.0


In [52]:
y_train2.head()

517    1
273    0
170    0
501    0
295    0
Name: Fraud.Risk, dtype: int64

In [53]:
oversample = SMOTE()

In [54]:
X_train2os, y_train2os = oversample.fit_resample(X_train2,y_train2)

In [55]:
# Distribution of target labels.
fig, ax = plt.subplots()
sns.countplot(x=y_train2os) #since on the x axis , we have y labels.
ax.set_xlabel('Fraud risk')
ax.set_ylabel('count')
ax.set_title('Output distribution after oversampling - Train data')
plt.show()

<IPython.core.display.Javascript object>

In [56]:
X_test2 = pd.DataFrame(preprocessor.transform(X_test2), index=X_test2.index, columns=cat_cols + numeric_cols)

In [57]:
X_test2.head()

Unnamed: 0,Undergrad,Marital.Status,Urban,Taxable.Income,City.Population,Work.Experience
447,0.0,1.0,1.0,83061.0,166606.0,5.0
474,1.0,0.0,0.0,99128.0,80729.0,0.0
462,0.0,0.0,0.0,16690.0,149327.0,17.0
406,0.0,1.0,0.0,54345.0,107688.0,2.0
526,0.0,0.0,0.0,26494.0,82565.0,12.0


In [58]:
# Distribution of target labels.
fig, ax = plt.subplots()
sns.countplot(x=y_test2) #since on the x axis , we have y labels.
ax.set_xlabel('Fraud risk')
ax.set_ylabel('count')
ax.set_title('Test data distribution(y_test2)')
plt.show()

<IPython.core.display.Javascript object>

In [59]:
clf_os = clf

In [60]:
clf_os.fit(X_train2os, y_train2os)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat_trf',
                                                  Pipeline(steps=[('enc',
                                                                   OrdinalEncoder())]),
                                                  ['Undergrad',
                                                   'Marital.Status',
                                                   'Urban'])])),
                ('classifier', RandomForestClassifier(random_state=42))])

#### 4.4.1 Model testing

In [61]:
y_tr_pred = clf_os.predict(X_train2os)
y_ts_pred = clf_os.predict(X_test2)

#### 4.4.2 Model evaluation

In [62]:
# On train data (after oversampling):
display_results(y_train2os, y_tr_pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       381
           1       1.00      1.00      1.00       381

    accuracy                           1.00       762
   macro avg       1.00      1.00      1.00       762
weighted avg       1.00      1.00      1.00       762

Confusion matrix:
 [[381   0]
 [  0 381]]


In [63]:
# On test data (passing to the model made from oversampled data):
display_results(y_test2, y_ts_pred)

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        95
           1       0.89      1.00      0.94        25

    accuracy                           0.97       120
   macro avg       0.95      0.98      0.96       120
weighted avg       0.98      0.97      0.98       120

Confusion matrix:
 [[92  3]
 [ 0 25]]


### 4.5 Observations
- Our case, **False positive**( predicting 'Good' as 'risk') is **OK** ,but **false negative** (predicting 'Risk' as 'Good') **not OK**. 
- This model is performing well in predicting both 'Good' and 'Risk' classes.
- Despite the high accuracy, the real test of the model is when it is made to predict the classes from a yet unseen data. Both models developed above (in sections 4.4 and 4.5) are performing equally well, hence we can choose either to solve the problem.

## 5. Conclusion
A decision tree model was applied to a fraud dataset and an attempt was made to infer if a person would commit a fraud. It was found that decision tree is identifying the classes very well when both train and test data are passed.