In [25]:
import pandas as pd

uri = 'https://s3.amazonaws.com/caelum-online-public/356-intro-machine-learning/busca.csv'

data = pd.read_csv(uri, sep =',')

data.rename({'busca': 'search', 'logado': 'logged', 'comprou': 'bought'}, axis='columns', inplace=True)

In [26]:
X = data[['home', 'search', 'logged']]
Y = data['bought']


X = pd.get_dummies(X)

In [56]:
# With Multinomial naive bayes algorithm

from sklearn.naive_bayes import MultinomialNB
from collections import Counter

train_percentage = 0.9
train_size = int(train_percentage * len(data))

train_X = X[:train_size]
train_Y = Y[:train_size]

test_size = int(len(data) - train_size)

test_X = X[-test_size:]
test_Y = Y[-test_size:]

model = MultinomialNB()
model.fit(train_X, train_Y)

result = model.predict(test_X)
differences = result == test_Y

hits = sum(differences.values)
total_elements = len(test_X)
accuracy = 100.0 * hits / total_elements

baseline = (100 * max(Counter(test_Y).values()) / len(test_Y))

print(accuracy)
print(total_elements)
print(f'Baseline: {baseline}')

82.0
100
Baseline: 82.0


In [57]:
# With Adaboost algorithm

from sklearn.ensemble import AdaBoostClassifier
from collections import Counter

train_percentage = 0.9
train_size = int(train_percentage * len(data))

train_X = X[:train_size]
train_Y = Y[:train_size]

test_size = int(len(data) - train_size)

test_X = X[-test_size:]
test_Y = Y[-test_size:]

model = AdaBoostClassifier()
model.fit(train_X, train_Y)

result = model.predict(test_X)
differences = result == test_Y

hits = sum(differences.values)
total_elements = len(test_X)
accuracy = 100.0 * hits / total_elements

baseline = (100 * max(Counter(test_Y).values()) / len(test_Y))

print(accuracy)
print(total_elements)
print(f'Baseline: {baseline}')

85.0
100
Baseline: 82.0


In [81]:
# Creating function to reuse code easily

def fit_and_predict(name, model, train_X, train_Y, test_X, test_Y):
    model.fit(train_X, train_Y)

    result = model.predict(test_X)
    differences = result == test_Y

    hits = sum(differences.values)
    total_elements = len(test_X)
    accuracy = 100.0 * hits / total_elements

    baseline = (100 * max(Counter(test_Y).values()) / len(test_Y))

    print(f'Name: {name} : Accuracy of {accuracy} | Total elements: {total_elements}  Baseline: {baseline}')    
    return accuracy

In [83]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from collections import Counter

X = data[['home', 'search', 'logged']]
Y = data['bought']

X = pd.get_dummies(X)

train_percentage = 0.8
test_percentage = 0.1

train_size = int(train_percentage * len(Y))
test_size = int(test_percentage * len(Y))
validation_size = len(Y) - train_size - test_size

train_X = X[0:train_size]
train_Y = Y[0:train_size]

end_test = train_size + test_size

test_X = X[train_size:end_test]
test_Y = Y[train_size:end_test]

validation_X = X[end_test:]
validation_Y = Y[end_test:]

model_multinomial_nb = MultinomialNB()
model_adaBoost = AdaBoostClassifier()

result_multinomial = fit_and_predict('MultinomialNB', model_multinomial_nb, train_X, train_Y, test_X, test_Y)

result_adaboost = fit_and_predict('AdaBoost', model_adaBoost, train_X, train_Y, test_X, test_Y)


if result_multinomial > result_adaboost:
    winner = model_multinomial_nb
else:
    winner = model_adaBoost

result = winner.predict(validation_X)
accuracy = (result - validation_Y)

hits = sum(differences.values)
total_elements = len(test_X)
accuracy = 100.0 * hits / total_elements


print(f'Accuracy of winner model with validation data: {accuracy}')

Name: MultinomialNB : Accuracy of 82.0 | Total elements: 100  Baseline: 82.0
Name: AdaBoost : Accuracy of 84.0 | Total elements: 100  Baseline: 82.0
Accuracy of winner model with validation data: 85.0
