references: 
- http://cs229.stanford.edu/proj2013/ShiraniMehr-SMSSpamDetectionUsingMachineLearningApproach.pdf
- https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/

In [1]:
import pandas as pd
import collections
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
import math

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../dataset.csv')
print(df.shape)
df.head()

(4611, 2)


Unnamed: 0,Teks,label
0,Gimana dek...dah baikan blm...,0.0
1,Ikuti Seminar Inspiratif Cara Mudah Sukses Bis...,1.0
2,"Anda Terpilih\nSbgi pemenang\nCEK. Rp.100 ,jt\...",1.0
3,Punya masalah keuangan?\ncukup jaminkan Bpkb M...,1.0
4,DISK0N T0GEL hingga 65% hanya di J0KERBET888 b...,1.0


### Data Cleaning

In [3]:
df = df.dropna()
print(df.shape)

(4605, 2)


In [4]:
df['label'] = df['label'].astype('int64')
df.dtypes

Teks     object
label     int64
dtype: object

### Feature Extraction

In [5]:
len_values = []

for i in range(0, len(df)):
    len_values.append(len(df.iloc[i].Teks))
    
len_col = pd.Series(len_values)
df['length'] = len_col.values

In [6]:
df_iklan = df[df['label'] == 1]
df_non_iklan = df[df['label'] == 0]

print("Iklan shape", df_iklan.shape)
print("Non Iklan shape", df_non_iklan.shape)

Iklan shape (2459, 3)
Non Iklan shape (2146, 3)


In [7]:
cap_list = []

for i in range(0,len(df)):
    words = df.Teks.iloc[i].split()
    count = 0
    for j in range(0,len(words)):
        if(words[j].isupper()):
            count = count + 1;
    cap_list.append(count)    
    #print(cap_list)
        
cap_col = pd.Series(cap_list)
df['CAP'] = cap_col.values

In [8]:
digits_list = []

for i in range(0,len(df)):
    if(sum(c.isdigit() for c in df.Teks.iloc[i]) == 0):
        digits_list.append(0)
    else:
        digits_list.append(sum(c.isdigit() for c in df.Teks.iloc[i]))

digits_col = pd.Series(digits_list)
df['DIGITS'] = digits_col.values

In [9]:
df.head()

Unnamed: 0,Teks,label,length,CAP,DIGITS
0,Gimana dek...dah baikan blm...,0,30,0,0
1,Ikuti Seminar Inspiratif Cara Mudah Sukses Bis...,1,128,5,17
2,"Anda Terpilih\nSbgi pemenang\nCEK. Rp.100 ,jt\...",1,142,4,12
3,Punya masalah keuangan?\ncukup jaminkan Bpkb M...,1,149,1,25
4,DISK0N T0GEL hingga 65% hanya di J0KERBET888 b...,1,159,3,10


## Training & Test
### Utility function

In [10]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0


def removearray(L,arr):
    ind = 0
    size = len(L)
    while ind != size and not np.array_equal(L[ind],arr):
        ind += 1
    if ind != size:
        L.pop(ind)
    else:
        raise ValueError('array not found in list.')


# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini

### Split function

In [11]:
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


# Select the best split point for a dataset
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}


# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)


# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

### Building Tree

In [12]:
# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root


# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']


# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    tree = build_tree(train, max_depth, min_size)
    predictions = list()
    for row in test:
        prediction = predict(tree, row)
        predictions.append(prediction)
    return(predictions)

In [13]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        removearray(train_set,fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [14]:
X = df[['DIGITS','length','CAP']].copy()
y = df[['label']].copy()
X.head()

Unnamed: 0,DIGITS,length,CAP
0,0,30,0
1,17,128,5
2,12,142,4
3,25,149,1
4,10,159,3


In [15]:
dataset = np.hstack((X, y))
dataset.shape

(4605, 4)

In [16]:
from random import seed
from random import randrange
import time

seed(1)
start = time.time()

# evaluate algorithm
n_folds = 5
max_depth = 5
min_size = 10
scores = evaluate_algorithm(dataset, decision_tree, n_folds, max_depth, min_size)

end = time.time()
print("Execution time: ", end - start, " second")
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Execution time:  300.2542769908905  second
Scores: [82.95331161780673, 85.66775244299674, 83.49619978284474, 83.17046688382193, 85.23344191096635]
Mean Accuracy: 84.104%
