In [35]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [3]:
# Load the ad / sales data straight from a remote CSV (needs network access).
# NOTE(review): the URL is a link shortener served over plain http, so the file
# it resolves to can change without notice -- consider pinning the final URL
# or a local copy.
df = pd.read_csv('http://bit.ly/digital_ad_sales_csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,audience_id,search,display,social,sales
0,370202,230100.0,37800.0,69200,22100
1,172422,44500.0,39300.0,45100,10400
2,323144,17200.0,45900.0,69300,9300
3,422335,151500.0,41300.0,58500,18500
4,194985,180800.0,10800.0,58400,12900


In [6]:
# Bin sales into 4 quantile-based buckets; labels=False stores each bucket's
# integer position (0-3) instead of an interval label. This column is the
# classification target used further down.
df['sales_level'] = pd.qcut(df['sales'], 4, labels=False)

In [7]:
# Confirm the new sales_level column is present.
df.head()

Unnamed: 0,audience_id,search,display,social,sales,sales_level
0,370202,230100.0,37800.0,69200,22100,3
1,172422,44500.0,39300.0,45100,10400,1
2,323144,17200.0,45900.0,69300,9300,0
3,422335,151500.0,41300.0,58500,18500,3
4,194985,180800.0,10800.0,58400,12900,1


In [13]:
# Count missing values per column before imputing.
df.isnull().sum()

audience_id    0
search         1
display        3
social         0
sales          0
sales_level    0
dtype: int64

In [15]:
# Mean of the search column, used to fill its missing value.
# NOTE(review): the mean is taken over every row, including rows that later
# land in the test split, so the imputed value is informed by test data.
mean_search = df['search'].mean()
mean_search

146788.44221105528

In [16]:
# Replace missing search values with the column mean held in mean_search.
df['search'] = df['search'].fillna(mean_search)

In [17]:
# Mean of the display column, used to fill its missing values.
# NOTE(review): the mean is taken over every row, including rows that later
# land in the test split, so the imputed value is informed by test data.
mean_display = df['display'].mean()
mean_display

23335.02538071066

In [18]:
# Replace missing display values with the column mean held in mean_display.
df['display'] = df['display'].fillna(mean_display)

In [19]:
# Re-check: every column should now report zero missing values.
df.isnull().sum()

audience_id    0
search         0
display        0
social         0
sales          0
sales_level    0
dtype: int64

# Decision Tree

In [20]:
# Features are the search, display and social columns; the target is the
# binned sales_level.
X = df[['search','display','social']]
y = df['sales_level']

In [28]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Fit a single decision tree on the training rows.
# The tree is seeded so the fit, and every metric derived from it, is
# reproducible: scikit-learn permutes candidate features at each split, so
# ties between equally good splits are otherwise broken differently from run
# to run. The seed matches the one used for train_test_split and for the
# random forest later in the notebook.
# NOTE: outputs recorded below this cell came from an unseeded tree; re-run
# this cell and the ones that follow to refresh them.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [30]:
# Predict a sales_level class for each held-out row and display the labels.
predictions = model.predict(X_test)
predictions

array([2, 3, 3, 0, 3, 2, 3, 0, 1, 2, 0, 0, 1, 0, 0, 1, 0, 2, 1, 3, 3, 2,
       1, 3, 0, 0, 3, 1, 0, 0, 1, 1, 3, 0, 2, 3, 2, 3, 1, 0])

In [31]:
# Fraction of held-out rows whose predicted class equals the actual class.
metrics.accuracy_score(y_test, predictions)

0.875

In [33]:
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.89      0.67      0.76        12
           2       0.57      0.80      0.67         5
           3       1.00      1.00      1.00        11

   micro avg       0.88      0.88      0.88        40
   macro avg       0.85      0.87      0.85        40
weighted avg       0.89      0.88      0.87        40



precision: 
 - Ability of a classification model to return only relevant instances
 - When it predicts 1, how often is it correct?
 - True Positives / (True Positives + False Positives)


recall aka sensitivity: 
- Ability of a classification model to identify all relevant instances  
- When it's actually 1, how often does it predict 1?
- True Positives / (True Positives + False Negatives)


f1-score: 
- Balance between precision and recall  
- 2 * (Precision * Recall) / (Precision + Recall)


In [34]:
# Confusion matrix as a labelled table: rows are the actual class, columns
# the predicted class, with an "All" row and column of totals.
pd.crosstab(
    y_test,
    predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,0,1,2,3,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,12,0,0,0,12
1,1,8,3,0,12
2,0,1,4,0,5
3,0,0,0,11,11
All,13,9,7,11,40


# Random Forest

In [49]:
# Same feature columns and target as in the Decision Tree section.
X = df[['search','display','social']]
y = df['sales_level']

In [50]:
# Repeat the 80/20 split with the same seed as in the Decision Tree section,
# so both models are scored on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [51]:
# Fit a random forest on the training rows; random_state=42 makes the fit
# repeatable. This rebinds `model`, replacing the decision tree.
# NOTE(review): the number of trees is left at the library default (the repr
# recorded below shows n_estimators=10). That default reportedly differs
# between scikit-learn releases, so a re-run elsewhere may give different
# numbers -- confirm, and consider setting n_estimators explicitly.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [52]:
# Predict a sales_level class for each held-out row with the forest and
# display the labels.
predictions = model.predict(X_test)
predictions

array([3, 3, 3, 0, 3, 1, 3, 0, 1, 2, 0, 0, 1, 0, 1, 1, 0, 2, 1, 2, 3, 1,
       1, 3, 0, 0, 3, 2, 0, 0, 1, 1, 3, 0, 1, 3, 0, 3, 2, 0])

In [53]:
# Fraction of held-out rows the forest classified correctly.
metrics.accuracy_score(y_test, predictions)

0.825

In [33]:
# Per-class precision, recall and F1 for the random forest predictions.
# NOTE(review): the report recorded below is stale -- its execution count (33)
# predates this section, and its 0.88 micro average contradicts the 0.825
# accuracy computed in the previous cell. Re-run this cell to refresh it.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       0.89      0.67      0.76        12
           2       0.57      0.80      0.67         5
           3       1.00      1.00      1.00        11

   micro avg       0.88      0.88      0.88        40
   macro avg       0.85      0.87      0.85        40
weighted avg       0.89      0.88      0.87        40



precision: 
 - Ability of a classification model to return only relevant instances
 - When it predicts 1, how often is it correct?
 - True Positives / (True Positives + False Positives)


recall aka sensitivity: 
- Ability of a classification model to identify all relevant instances  
- When it's actually 1, how often does it predict 1?
- True Positives / (True Positives + False Negatives)


f1-score: 
- Balance between precision and recall  
- 2 * (Precision * Recall) / (Precision + Recall)


In [34]:
# Confusion matrix for the random forest: rows are the actual class, columns
# the predicted class, with an "All" row and column of totals.
# NOTE(review): the table recorded below is stale (execution count 34). Its
# predicted-class totals (13/9/7/11) do not match the forest predictions shown
# earlier in this section (13/11/5/11). Re-run this cell to refresh it.
pd.crosstab(y_test, predictions, rownames = ['Actual'], colnames = ['Predicted'], margins = True)

Predicted,0,1,2,3,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,12,0,0,0,12
1,1,8,3,0,12
2,0,1,4,0,5
3,0,0,0,11,11
All,13,9,7,11,40


In [54]:
# Raw importance scores from the fitted forest, one per feature column of X.
model.feature_importances_

array([0.44037906, 0.35691508, 0.20270585])

In [56]:
# Label each importance score with its feature name and list the features
# from most to least important.
importance_table = pd.DataFrame(
    model.feature_importances_,
    index=X.columns,
    columns=['Importance'],
)
importance_table.sort_values(by=['Importance'], ascending=False)

Unnamed: 0,Importance
search,0.440379
display,0.356915
social,0.202706


In [57]:
# List the individual fitted trees that make up the forest.
# NOTE(review): this prints the full repr of every tree; slicing the list or
# showing len(model.estimators_) would keep the output short.
model.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=1608637542, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=1273642419, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_we