In [1]:
import pandas as pd
import numpy as np
import dill
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.model_selection import KFold, RandomizedSearchCV

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler

import xgboost as xgb

from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix
import itertools
import warnings
warnings.simplefilter("ignore")

In [2]:
data = pd.read_csv('winequality-red.csv')

In [3]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
data.rename(columns=
            {'free sulfur dioxide':'free_sulfur_dioxide',
             'total sulfur dioxide':'total_sulfur_dioxide'},
            inplace=True)

In [5]:
features = ['sulphates', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'pH']

In [6]:
bins = (2, 6.5, 8)
group_names = [0, 1]
data['quality'] = pd.cut(data['quality'], bins = bins, labels = group_names)

In [7]:
data['quality'].value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [8]:
target = data['quality']
data = data.drop('quality', axis = 1)

In [9]:
numerical_features = data.select_dtypes(include=[np.number])
print(f"count of numeric_features {numerical_features.shape[1]}")

numerical_features = numerical_features.columns.tolist()

count of numeric_features 11


In [10]:
X_train, X_test, y_train, y_test = train_test_split(data[features], target, random_state=21)

X_test.to_csv("X_test.csv", index=None)
y_test.to_csv("y_test.csv", index=None)

X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)

In [11]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [12]:
numerical_transformers = []
for num_feature in features:
    transfomer =  Pipeline([
                ('selector', NumberSelector(key=num_feature)),
                ('standard', StandardScaler())
        ])
    numerical_transformers.append((num_feature, transfomer))



In [13]:
feats = FeatureUnion(numerical_transformers)
feature_processing = Pipeline([('feats', feats)])

feature_processing.fit_transform(X_train)

array([[-0.3407383 ,  0.47913753,  0.27070792,  1.3594117 ],
       [-0.51225183,  0.76120917,  2.99440463, -0.96436346],
       [-0.74093653, -0.64914901, -0.26218927, -1.02891277],
       ...,
       [-0.62659418,  0.66718529,  2.87598303, -0.25432105],
       [-0.22639595,  3.01778226,  1.27729148,  0.45572136],
       [-0.28356712, -0.83719677, -0.73587565,  0.97211584]])

In [14]:
params = {
    "booster": "gblinear",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.01,
    "n_estimators": 1000,
    "reg_lambda": 100,
    "nthread": 6,
    "seed": 27
}

classifier = Pipeline([
    ('features',feats),
    ('classifier', xgb.XGBClassifier(**params)),
])

classifier.fit(X_train, y_train)
y_score = classifier.predict_proba(X_test)[:, 1]

In [15]:
params = {'classifier__n_estimators':[50, 100, 200, 500, 700, 1000, 1200, 1500],
          'classifier__max_depth':[3, 5, 7]}

In [16]:
cv=KFold(n_splits=3, random_state=21, shuffle=True)

In [17]:
rs = RandomizedSearchCV(classifier, params, scoring='roc_auc', cv=cv, n_jobs=-1)

In [18]:
rs = rs.fit(X_train, y_train)

Parameters: { max_depth } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [19]:
rs.best_params_

{'classifier__n_estimators': 50, 'classifier__max_depth': 3}

In [20]:
rs.best_score_

0.7269237359744523

In [21]:
classifier = Pipeline([
    ('features',feats),
    ('classifier', xgb.XGBClassifier(max_depth=7, n_estimators=50, random_state=42, reg_alpha=0.1, reg_lambda=1e-3)),
])

In [22]:
classifier.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('sulphates',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  NumberSelector(key='sulphates')),
                                                                 ('standard',
                                                                  StandardScaler(copy=True,
                                                                                 with_mean=True,
                                                                                 with_std=True))],
                                                          verbose=False)),
                                                ('free_sulfur_dioxide',
                                                 Pipeline(memory=None,
           

In [23]:
with open("xgb_pipeline.dill", "wb") as f:
    dill.dump(classifier, f)