In [1]:
%load_ext blackcellmagic

## Covariate shift
Having train and test datasets with a different distribution can lead to a poor performance of an algorithm. One of the types of dataset shift is called covariate shift. It consists on having a different distribution between the test/train datasets of a specific variable. 
<br> Here I want to evaluate the variables of winequality.csv with respect to covariate shift. To do so, I want to split the dataset using the temporal data colected (not using random split) in 80% train and 20% test. Each feature is going to be evaluated separately and the two models (Logistic Regression and Gradient Boosting Classifier) are going to classify the examples with respect to the binary target (0, if the example came from the train distribution and 1 if it came from the test ditribution). Features with a higher roc_auc scores can be considered drifting features, influencing the poor performance of the algorithm.
<br> After identifying drifting features, I want to evaluate the influence of the performance of our models when the drifting features are excluded.
<br> Using the general threshold of 0.8 not drifting feature was found. However, I found 0.7 auc score for 'fixed acidity' and decided to remove it from the dataset in order to compare each model's results to the results using the previous dataset. After excluding it, the accuracy, precision and recall of LR were slightly better for splits equal 0.3 to 0.4. For GBC, the accuracy, precision and recall values increased for the splits from 0.05 to 0.4.
### Conclusion
Removing "fixed acidity" increased less than 1% of the model's performance.

In [2]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    confusion_matrix,
    f1_score,
    recall_score,
    precision_score,
)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Ignoring the warnings
import warnings
warnings.filterwarnings("ignore")


# Loading the data
data = pd.read_csv("winequality.csv")
data["recommend"] = data["recommend"].astype(int)

# Dropping 'quality' from data
data = data.drop(["quality"], axis=1)

# Scaling the data
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data.values)
data_scaled = pd.DataFrame(scaled_features, index=data.index, columns=data.columns)
data_scaled["recommend"] = data["recommend"]

# Setting the models to be used
models = []
models.append(("LR", LogisticRegression(solver="lbfgs", random_state=0, max_iter=800)))
models.append(
    (
        "GBC",
        GradientBoostingClassifier(
            n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0
        ),
    )
)

# Creating a list of features
columns_list = []
for column in data_scaled:
    columns_list.append(column)

# Creating a list of splits
splits = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

In [3]:
def drifting_variable(data_scaled, columns_list, models, split):
    data_train = data_scaled.loc[0:4418, :]
    data_test = data_scaled.loc[4419:4898, :]

    drifts = []
    models_lista = []
    feature_name = []
    global df
    df = pd.DataFrame()

    for column in columns_list:
        for name, model in models:

            X_train = pd.DataFrame()
            X_test = pd.DataFrame()

            X_train[column] = data_train[column]
            X_test[column] = data_test[column]

            X_train["target"] = 0
            X_test["target"] = 1

            X_merged = pd.concat((X_train, X_test), ignore_index=True).drop(
                "target", axis=1
            )
            y_merged = pd.concat((X_train, X_test), ignore_index=True).drop(
                column, axis=1
            )

            score = cross_val_score(model, X_merged, y_merged, cv=2, scoring="roc_auc")

            drifts.append(np.mean(score))
            models_lista.append(str(name))
            feature_name.append(column)

    df["model"] = models_lista
    df["feature"] = feature_name
    df["score"] = drifts

    return df.groupby(["model", "feature"]).first()

In [4]:
drifting_variable(data_scaled, columns_list, models, 0.25)

Unnamed: 0_level_0,Unnamed: 1_level_0,score
model,feature,Unnamed: 2_level_1
GBC,alcohol,0.534268
GBC,chlorides,0.541684
GBC,citric acid,0.436007
GBC,density,0.542224
GBC,fixed acidity,0.71395
GBC,free sulfur dioxide,0.489027
GBC,pH,0.481151
GBC,recommend,0.529658
GBC,residual sugar,0.541251
GBC,sulphates,0.550507


In [5]:
# Defining X and Y
X = data_scaled.iloc[:, 0:-1]
y = data_scaled.iloc[:, -1]

In [6]:
def model_split(models, X, y, splits):
    model_name = []
    splits_values = []
    accuracies = []
    precision_scores = []
    recall_scores = []
    global df
    df = pd.DataFrame()

    for name, model in models:
        for value in splits:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=value, random_state=0
            )
            model_name.append(name)
            splits_values.append(value)
            current_model = model.fit(X_train, y_train)
            prediction = current_model.predict(X_test)
            accuracy = accuracy_score(y_test, prediction)
            accuracies.append(accuracy)
            precision = precision_score(y_test, prediction, average="binary")
            precision_scores.append(precision)
            recall = recall_score(y_test, prediction, average="binary")
            recall_scores.append(recall)

    df["model_name"] = model_name
    df["splits"] = splits_values
    df["accuracies"] = accuracies
    df["precision_scores"] = precision_scores
    df["recall_scores"] = recall_scores

    return df.groupby(["model_name", "splits"]).first()

In [7]:
model_split(models, X, y, splits)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracies,precision_scores,recall_scores
model_name,splits,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GBC,0.05,0.816327,0.804878,0.471429
GBC,0.1,0.826531,0.802817,0.445312
GBC,0.2,0.853061,0.757143,0.490741
GBC,0.3,0.855782,0.737089,0.501597
GBC,0.4,0.846429,0.711475,0.504651
GBC,0.5,0.846468,0.717172,0.518248
GBC,0.6,0.839741,0.690632,0.490712
GBC,0.7,0.834937,0.695122,0.451187
GBC,0.8,0.819342,0.627826,0.422222
GBC,0.9,0.802223,0.547464,0.445503


In [8]:
X = X.drop(["fixed acidity"], axis=1)

In [9]:
model_split(models, X, y, splits)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracies,precision_scores,recall_scores
model_name,splits,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GBC,0.05,0.8,0.744186,0.457143
GBC,0.1,0.830612,0.78481,0.484375
GBC,0.2,0.864286,0.778523,0.537037
GBC,0.3,0.857143,0.73516,0.514377
GBC,0.4,0.847959,0.721477,0.5
GBC,0.5,0.846468,0.731183,0.49635
GBC,0.6,0.835318,0.679204,0.475232
GBC,0.7,0.82998,0.678208,0.439314
GBC,0.8,0.8247,0.64,0.449123
GBC,0.9,0.802903,0.549096,0.449735
