### Build reusable functions

In [1]:
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report

In [2]:
def build_model(classifier_fn,
                features,
                label,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Train a classifier on a random split of ``dataset`` and report on it.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return an object
        exposing ``predict``.
    features : list
        Column labels selected from ``dataset`` as model inputs.
    label : str
        Column label selected from ``dataset`` as the target.
    dataset : DataFrame-like
        Table indexed with ``dataset[features]`` and ``dataset[label]``.
    test_frac : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different split on every call); an integer makes the
        split, and therefore the printed report, repeatable.

    Returns
    -------
    dict
        Keys ``'model'``, ``'x_train'``, ``'y_train'``, ``'x_test'``,
        ``'y_test'`` and ``'y_pred'``.
    """
    X = dataset[features]
    Y = dataset[label]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)

    print("Features used: ", features)
    # Print the classification report for the held-out rows.
    evaluate_classification(y_test, y_pred)

    return {'model': model,
            'x_train': x_train,
            'y_train': y_train,
            'x_test': x_test,
            'y_test': y_test,
            'y_pred': y_pred}

In [3]:
# Evaluation model
def evaluate_classification(y_test, y_pred):
    """Print a titled classification report for a set of predictions.

    ``y_test`` and ``y_pred`` are handed to ``classification_report``
    unchanged; the result is printed under a heading and a divider line.
    Nothing is returned.
    """
    divider = "------" * 10
    summary = classification_report(y_test, y_pred)

    for line in ('Classification report', divider, summary):
        print(line)

#### Apply to models

In [4]:
# Logistic Regression
def logistic_fn(x_train, y_train, penalty='l2', C=1.0, max_iter=1000):
    """Fit a LogisticRegression estimator and return it.

    ``penalty``, ``C`` and ``max_iter`` are forwarded to the estimator's
    constructor; the solver is always ``'lbfgs'``.
    """
    hyperparams = {
        'penalty': penalty,
        'C': C,
        'max_iter': max_iter,
        'solver': 'lbfgs',
    }
    classifier = LogisticRegression(**hyperparams)
    classifier.fit(x_train, y_train)
    return classifier

In [5]:
# DecisionTree Classifier
def decision_tree_fn(x_train, y_train, max_depth=3):
    """Fit a DecisionTreeClassifier built with ``max_depth`` and return it."""
    tree = DecisionTreeClassifier(max_depth=max_depth)
    tree.fit(x_train, y_train)
    return tree

In [6]:
def main(argv=None):
    """Command-line entry point: train the classifier named in the arguments.

    Expected arguments: ``MODEL [FEATURE ...]`` where ``MODEL`` is
    ``logistic_regression`` or ``decision_tree``. Any further arguments
    replace the default feature columns. The target column is ``'Clicked'``.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector to parse, program name first. Defaults to
        ``sys.argv``.

    Returns
    -------
    None
        A message is printed and the function returns early when the model
        name is missing or not recognised.
    """
    if argv is None:
        argv = sys.argv

    features = ['TimeSpent', 'Age',
                'AreaIncome', 'DailyInternetUsage',
                'Male']

    # Check the length up front: with no model name there is nothing to run,
    # so stop here instead of continuing with an unbound model_type.
    if len(argv) < 2:
        print("Classifier model not specified!")
        return

    model_type = argv[1]
    if len(argv) > 2:
        features = argv[2:]

    print("Running classifier: ", model_type)

    if model_type == "logistic_regression":
        classifier_fn = logistic_fn
    elif model_type == "decision_tree":
        classifier_fn = decision_tree_fn
    else:
        print("Invalid classifier model")
        return

    # Read the dataset only once the request is known to be valid.
    data = pd.read_csv('./datasets/advertising_cleaned.csv')
    build_model(classifier_fn, features, 'Clicked', data)

In [7]:
# Call main() only when this file is run as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
    

Running classifier:  -f
Invalid classifier model


#### Remove the cells below when exporting this notebook as a .py file

In [8]:
# Load the cleaned advertising dataset and preview its first rows.
data = pd.read_csv('./datasets/advertising_cleaned.csv')
data.head()

Unnamed: 0,TimeSpent,Age,AreaIncome,DailyInternetUsage,AdHeadline,City,Male,Country,Timestamp,Clicked
0,80.23,31.0,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
1,69.47,26.0,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
2,74.15,29.0,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
3,68.37,35.0,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0
4,59.99,23.0,59761.56,226.74,Sharable client-driven software,Jamieberg,1,Norway,2016-05-19 14:30:17,0


In [9]:
# Fit logistic regression on five feature columns to predict 'Clicked'.
# The returned dict is the cell's last expression, so it is displayed in full.
build_model(logistic_fn,
            ['TimeSpent', 'Age',
             'AreaIncome','DailyInternetUsage', 
             'Male'],
            'Clicked',
            data)

Features used:  ['TimeSpent', 'Age', 'AreaIncome', 'DailyInternetUsage', 'Male']
Classification report
------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.96      0.97      0.96        98
           1       0.97      0.96      0.97       103

    accuracy                           0.97       201
   macro avg       0.97      0.97      0.97       201
weighted avg       0.97      0.97      0.97       201



{'model': LogisticRegression(max_iter=1000),
 'x_train':      TimeSpent   Age  AreaIncome  DailyInternetUsage  Male
 558      78.64  31.0    60283.47              235.28     1
 507      80.30  58.0    49090.51              173.43     0
 116      61.88  42.0    60223.52              112.19     1
 488      80.29  31.0    49457.48              244.87     1
 298      65.80  25.0    60843.32              231.49     1
 ..         ...   ...         ...                 ...   ...
 88       56.89  37.0    37334.78              109.29     1
 569      50.32  40.0    27964.60              125.65     0
 220      75.83  27.0    67516.07              200.59     0
 773      32.60  45.0    48206.04              185.47     0
 517      59.22  55.0    39131.53              126.39     1
 
 [801 rows x 5 columns],
 'y_train': 558    0
 507    1
 116    1
 488    0
 298    0
       ..
 88     1
 569    1
 220    0
 773    1
 517    1
 Name: Clicked, Length: 801, dtype: int64,
 'x_test':      TimeSpent   Age  

In [10]:
# Fit a decision tree on the same five feature columns to predict 'Clicked'.
# The returned dict is the cell's last expression, so it is displayed in full.
build_model(decision_tree_fn,
           ['TimeSpent', 'Age',
            'AreaIncome','DailyInternetUsage', 
            'Male'],
           'Clicked',
           data)

Features used:  ['TimeSpent', 'Age', 'AreaIncome', 'DailyInternetUsage', 'Male']
Classification report
------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.97      0.93      0.95        98
           1       0.93      0.97      0.95       103

    accuracy                           0.95       201
   macro avg       0.95      0.95      0.95       201
weighted avg       0.95      0.95      0.95       201



{'model': DecisionTreeClassifier(max_depth=3),
 'x_train':      TimeSpent   Age  AreaIncome  DailyInternetUsage  Male
 424      60.07  42.0    65963.37              120.75     1
 925      59.64  51.0    71455.62              153.12     1
 105      72.23  25.0    46557.92              241.03     1
 194      71.86  32.0    51633.34              116.53     0
 907      37.01  50.0    48826.14              216.01     0
 ..         ...   ...         ...                 ...   ...
 819      79.60  39.0    73392.28              194.23     0
 488      80.29  31.0    49457.48              244.87     1
 544      42.60  55.0    55121.65              168.29     0
 786      59.96  33.0    77143.61              197.66     0
 830      42.51  30.0    54755.71              144.77     1
 
 [801 rows x 5 columns],
 'y_train': 424    1
 925    1
 105    0
 194    1
 907    1
       ..
 819    0
 488    0
 544    1
 786    1
 830    1
 Name: Clicked, Length: 801, dtype: int64,
 'x_test':      TimeSpent   Age