In [1]:
# Imports

# data manipulation
import pandas as pd

# graphs 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('transfusion.data')

In [3]:
df.shape

(748, 5)

In [4]:
df.sample(10)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
206,2,2,500,16,0
137,2,12,3000,98,0
536,2,7,1750,32,1
463,21,1,250,21,0
522,4,13,3250,39,1
446,23,2,500,26,0
135,2,3,750,19,0
315,4,2,500,29,0
514,4,16,4000,38,1
396,21,2,500,21,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [6]:
df.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [7]:
df['whether he/she donated blood in March 2007'].value_counts()

0    570
1    178
Name: whether he/she donated blood in March 2007, dtype: int64

In [8]:
# Desbalanced Data
df['whether he/she donated blood in March 2007'].value_counts() / len(df['whether he/she donated blood in March 2007']) * 100

0    76.203209
1    23.796791
Name: whether he/she donated blood in March 2007, dtype: float64

In [9]:
df.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'whether he/she donated blood in March 2007'],
      dtype='object')

In [10]:
X = df.drop('whether he/she donated blood in March 2007', axis=1)
y = df['whether he/she donated blood in March 2007']

In [11]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
tree = DecisionTreeClassifier()

In [25]:
tree.fit(X_train, y_train)

In [26]:
predictions = tree.predict(X_test)
predictions_train = tree.predict(X_train)

In [27]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [28]:
accuracy_score(y_test, predictions)

0.6711111111111111

In [29]:
accuracy_score(y_train, predictions_train)

0.9369024856596558

In [30]:
confusion_matrix(y_test, predictions)

array([[134,  34],
       [ 40,  17]], dtype=int64)

In [31]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.77      0.80      0.78       168
           1       0.33      0.30      0.31        57

    accuracy                           0.67       225
   macro avg       0.55      0.55      0.55       225
weighted avg       0.66      0.67      0.66       225



In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
model = RandomForestClassifier()

In [34]:
model.fit(X_train, y_train)

In [35]:
pred_random = model.predict(X_test)

In [36]:
accuracy_score(y_test, pred_random)

0.7155555555555555

In [37]:
print(classification_report(y_test, pred_random))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       168
           1       0.43      0.37      0.40        57

    accuracy                           0.72       225
   macro avg       0.61      0.60      0.61       225
weighted avg       0.70      0.72      0.71       225



In [38]:
confusion_matrix(y_test, pred_random)

array([[140,  28],
       [ 36,  21]], dtype=int64)

In [39]:
help(confusion_matrix)

Help on function confusion_matrix in module sklearn.metrics._classification:

confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)
    Compute confusion matrix to evaluate the accuracy of a classification.
    
    By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
    is equal to the number of observations known to be in group :math:`i` and
    predicted to be in group :math:`j`.
    
    Thus in binary classification, the count of true negatives is
    :math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
    :math:`C_{1,1}` and false positives is :math:`C_{0,1}`.
    
    Read more in the :ref:`User Guide <confusion_matrix>`.
    
    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values.
    
    y_pred : array-like of shape (n_samples,)
        Estimated targets as returned by a classifier.
    
    labels : array-like of shape (n_classes), default=