# sklearn을 이용한 의사결정 트리



In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV


# 데이터셋 설명
인터넷 광고 데이터셋으로 3279개의 이미지가 들어 있습니다. 459개는 광고이고 2820개는 광고가 아닌 이미지입니다.

Data Set Information:

This dataset represents a set of possible advertisements on Internet pages. The features encode the geometry of the image (if available) as well as phrases occurring in the URL, the image's URL and alt text, the anchor text, and words occurring near the anchor text. The task is to predict whether an image is an advertisement ("ad") or not ("nonad").


4. This dataset represents a set of possible advertisements on
   Internet pages.  The features encode the geometry of the image (if
   available) as well as phrases occurring in the URL, the image's URL and
   alt text, the anchor text, and words occurring near the anchor text.
   The task is to predict whether an image is an advertisement ("ad") or
   not ("nonad").

5. Number of Instances: 3279 (2821 nonads, 458 ads)

6. Number of Attributes: 1558 (3 continuous; others binary; this is the
   "STANDARD encoding" mentioned in [Kushmerick, 99].)
   One or more of the three continuous features are missing in 28%
   of the instances; missing values should be interpreted as "unknown".


In [9]:
# Load the raw advertisement table. The file has no header row, so pandas
# labels the columns with consecutive integers; low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
df = pd.read_csv(
    './data/ad.data',
    low_memory=False,
    header=None,
)
# Show the last rows as a quick sanity check of the parsed data.
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
3274,170,94,0.5529,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
3275,101,140,1.3861,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
3276,23,120,5.2173,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
3277,?,?,?,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.
3278,40,40,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,nonad.


In [16]:
# With a header-less read the column labels are integer positions: the last
# column holds the class label and every other column is a feature.
label_column = len(df.columns.values) - 1
explanatory_variable_columns = set(df.columns.values)
explanatory_variable_columns.remove(label_column)
response_variable_column = df[label_column]

# Encode the target as 1 for an advertisement ('ad.') and 0 for anything else.
y = [1 if e == 'ad.' else 0 for e in response_variable_column]

# Select the features in sorted label order so the column layout does not
# depend on set iteration order.
X = df[sorted(explanatory_variable_columns)].copy()

# Unknown values are stored as '?' (the pattern also tolerates leading
# spaces). The question mark MUST be escaped: in an unescaped ' *?' it is a
# lazy quantifier on ' *', so the pattern matches the empty string in every
# string cell instead of singling out the '?' placeholders.
X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)

# Hold out a test split using train_test_split's default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# A single-step pipeline: an entropy-based decision tree named 'clf', so the
# grid below addresses its hyper-parameters with the 'clf__' prefix.
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy'))
])

# Hyper-parameter grid: 3 depths x 2 split sizes x 3 leaf sizes = 18 candidates.
parameters = {
    'clf__max_depth': (150, 200, 300),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}

In [17]:
# Cross-validate every combination in `parameters`, ranking candidates by F1
# score. n_jobs=-1 runs the fits in parallel on all available cores and
# verbose=1 prints progress while fitting.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='f1',
    n_jobs=-1,
    verbose=1,
)
# Fit on the training split only; the test split stays untouched for the
# final evaluation.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  46 out of  54 | elapsed:    3.3s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    3.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('clf', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__max_depth': (150, 200, 300), 'clf__min_samples_split': (2, 3), 'clf__min_samples_leaf': (1, 2, 3)},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=1)

In [20]:
# Report the best cross-validated score and, for each tuned hyper-parameter,
# the value chosen by the search.
best_parameters = grid_search.best_estimator_.get_params()
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
for param_name in sorted(parameters):
    # '\t' indents each entry under the heading; without the backslash every
    # line was printed with a stray literal 't' in front of the name.
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

Best score: 0.877
Best parameters set:
tclf__max_depth: 300
tclf__min_samples_leaf: 1
tclf__min_samples_split: 3


In [19]:
# Predict the held-out test split with the fitted grid search, then print
# per-class precision, recall and F1 against the true labels.
predictions = grid_search.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98       724
          1       0.89      0.86      0.88        96

avg / total       0.97      0.97      0.97       820

