## Clasificación avanzada

#### SVM

El siguiente ejemplo fue tomado de la documentación oficial de scikit-learn 

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian', 
              'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train', 
                                  categories=categories, 
                                  shuffle=True, random_state=42)

twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories, 
                                 shuffle=True, random_state=42)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
X_train_tf = TfidfVectorizer(use_idf=False).fit_transform(twenty_train.data)
X_train_tf.shape


(2257, 35788)

In [3]:
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', SVC(kernel='linear'))])

In [4]:
import numpy as np
text_clf.fit(twenty_train.data, twenty_train.target)  
predicted = text_clf.predict(twenty_test.data)
# Calculamos accuracy:
np.mean(predicted == twenty_test.target)

0.9207723035952063

In [5]:
tf_idf = TfidfVectorizer()
X_train = tf_idf.fit_transform(twenty_train.data)
X_test = tf_idf.transform(twenty_test.data)
y_train = twenty_train.target
y_test = twenty_test.target

#### Voting

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

svc_clf = SVC(kernel='linear', probability=True) #  para soft
sgd_clf = SGDClassifier(loss='log') #  para soft
voting_clf = VotingClassifier(
estimators=[('svc', svc_clf), ('sgd', sgd_clf)],voting='soft')


for clf in (svc_clf, sgd_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

SVC 0.9207723035952063
SGDClassifier 0.9154460719041279
VotingClassifier 0.9274300932090546


#### Bagging

In [8]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [9]:
accuracy_score(y_test, y_pred)

0.7190412782956058

In [10]:
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=200, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [11]:
accuracy_score(y_test, y_pred)

0.8029294274300932

#### RandomForest

In [12]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8129161118508655

#### Boosting

#### AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
     DecisionTreeClassifier(max_depth=1), n_estimators=1500,
     algorithm="SAMME.R"
 )
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=1500, random_state=None)

In [14]:
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7723035952063915

#### XgBoost

In [19]:
# pip install xgboost

from xgboost import XGBClassifier

In [20]:
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [21]:
y_pred = xgb_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8681757656458056

#### LightGBM

In [23]:
# conda install -c conda-forge lightgbm
import lightgbm as lgb

In [24]:
gbm = lgb.LGBMClassifier()
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)])

[1]	valid_0's multi_logloss: 1.27964
[2]	valid_0's multi_logloss: 1.19555
[3]	valid_0's multi_logloss: 1.12404
[4]	valid_0's multi_logloss: 1.06169
[5]	valid_0's multi_logloss: 1.00366
[6]	valid_0's multi_logloss: 0.952655
[7]	valid_0's multi_logloss: 0.909478
[8]	valid_0's multi_logloss: 0.869212
[9]	valid_0's multi_logloss: 0.8343
[10]	valid_0's multi_logloss: 0.80072
[11]	valid_0's multi_logloss: 0.771599
[12]	valid_0's multi_logloss: 0.743307
[13]	valid_0's multi_logloss: 0.71649
[14]	valid_0's multi_logloss: 0.691686
[15]	valid_0's multi_logloss: 0.667471
[16]	valid_0's multi_logloss: 0.64493
[17]	valid_0's multi_logloss: 0.625319
[18]	valid_0's multi_logloss: 0.607054
[19]	valid_0's multi_logloss: 0.589882
[20]	valid_0's multi_logloss: 0.573306
[21]	valid_0's multi_logloss: 0.557896
[22]	valid_0's multi_logloss: 0.545648
[23]	valid_0's multi_logloss: 0.531141
[24]	valid_0's multi_logloss: 0.518963
[25]	valid_0's multi_logloss: 0.50745
[26]	valid_0's multi_logloss: 0.495493
[27]	v

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [25]:
y_pred = gbm.predict(X_test)
accuracy_score(y_test, y_pred)

0.8808255659121171

#### Stacking

In [28]:
# pip install vecstack

In [29]:
from vecstack import StackingTransformer


estimators = [('xgb', xgb_clf),
              ('ada', ada_clf)]
              
# StackingTransformer
stack = StackingTransformer(estimators, regression=False, verbose=2)

# Fit
stack = stack.fit(X_train, y_train)

# stacked features
S_train = stack.transform(X_train)
S_test = stack.transform(X_test)


task:         [classification]
n_classes:    [4]
metric:       [accuracy_score]
variant:      [A]
n_estimators: [2]

estimator  0: [xgb: XGBClassifier]
    fold  0:  [0.90619469]
    fold  1:  [0.92730496]
    fold  2:  [0.90780142]
    fold  3:  [0.90070922]
    ----
    MEAN:     [0.91050257] + [0.01005092]

estimator  1: [ada: AdaBoostClassifier]
    fold  0:  [0.86017699]
    fold  1:  [0.84219858]
    fold  2:  [0.79609929]
    fold  3:  [0.85460993]
    ----
    MEAN:     [0.83827120] + [0.02520274]

Train set was detected.
Transforming...

estimator  0: [xgb: XGBClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [ada: AdaBoostClassifier]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

Transforming...

estimator  0: [xgb: XGBClassifier]
    model from fold  0: done
    model from fold  

In [30]:
gbm = lgb.LGBMClassifier()
gbm.fit(S_train, y_train,
        eval_set=[(S_test, y_test)])

[1]	valid_0's multi_logloss: 1.24098
[2]	valid_0's multi_logloss: 1.12795
[3]	valid_0's multi_logloss: 1.03503
[4]	valid_0's multi_logloss: 0.957364
[5]	valid_0's multi_logloss: 0.891688
[6]	valid_0's multi_logloss: 0.835695
[7]	valid_0's multi_logloss: 0.787682
[8]	valid_0's multi_logloss: 0.746343
[9]	valid_0's multi_logloss: 0.710652
[10]	valid_0's multi_logloss: 0.679854
[11]	valid_0's multi_logloss: 0.65313
[12]	valid_0's multi_logloss: 0.629988
[13]	valid_0's multi_logloss: 0.609343
[14]	valid_0's multi_logloss: 0.591476
[15]	valid_0's multi_logloss: 0.576034
[16]	valid_0's multi_logloss: 0.562712
[17]	valid_0's multi_logloss: 0.551245
[18]	valid_0's multi_logloss: 0.541315
[19]	valid_0's multi_logloss: 0.532812
[20]	valid_0's multi_logloss: 0.52556
[21]	valid_0's multi_logloss: 0.519414
[22]	valid_0's multi_logloss: 0.514224
[23]	valid_0's multi_logloss: 0.509793
[24]	valid_0's multi_logloss: 0.506097
[25]	valid_0's multi_logloss: 0.503039
[26]	valid_0's multi_logloss: 0.500537


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [32]:
y_pred = gbm.predict(S_test)
accuracy_score(y_test, y_pred)

0.8601864181091877