# Imports

In [1]:
import pandas as pd
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import RareLabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

sys.path.append('../src/')
from utils import name_category_features

# Loading data

In [2]:
df = pd.read_csv('../data/processed/train.csv')

## Prepare data

In [3]:
df[name_category_features()] = df[name_category_features()].astype(object)

## Splitting data

In [4]:
X = df.drop('Revenue', axis=1)
y = df['Revenue'].ravel()

X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.3,
            random_state=42,
            stratify=df[['to_split']],
        )
X_train.drop('to_split', axis=1, inplace=True)
X_test.drop('to_split', axis=1, inplace=True)

# Pipeline

In [5]:
mm = MinMaxScaler()
rare_enc = RareLabelEncoder(n_categories = 5,variables=['Month',
                                       'OperatingSystems',	'Browser',	'Region',	'TrafficType'])
onehot_enc = OneHotEncoder(variables=['Month','OperatingSystems',	'Browser',	'Region',	'TrafficType',	'VisitorType'	,'Weekend'])

minmax_scaler = SklearnTransformerWrapper(transformer=mm, variables=['Administrative',
'Administrative_Duration',
'Informational',
'Informational_Duration',
'ProductRelated'	,
'ProductRelated_Duration',
'BounceRates'	,
'ExitRates',
'PageValues'	,
'SpecialDay'])

prep_pipe = Pipeline([
    ('rare_enc', rare_enc),
     ('onehot', onehot_enc),
     ('minmax', minmax_scaler)
])

# Fitting model

In [6]:
X_train_transformed = prep_pipe.fit_transform(X_train, y_train)

In [7]:
X_test_transformed = prep_pipe.transform(X_test)

In [54]:
model = LogisticRegression(max_iter=1000)

# Evaluate model with cross validation

In [55]:
scores = cross_val_score(model, X_train_transformed, y_train, cv= 10)

In [56]:
scores.mean()

0.871048929998358

# Final pipeline

In [57]:
pipe = Pipeline([
    ('rare_enc', rare_enc),
     ('onehot', onehot_enc),
     ('minmax', minmax_scaler),
     ('lr', model),
])
pipe.fit(X_train, y_train)

In [58]:
y_train_predict = pipe.predict(X_train)
y_test_predict = pipe.predict(X_test)

In [62]:
print(classification_report(y_train, y_train_predict))

              precision    recall  f1-score   support

       False       0.88      0.99      0.93      5107
        True       0.78      0.24      0.37       934

    accuracy                           0.87      6041
   macro avg       0.83      0.62      0.65      6041
weighted avg       0.86      0.87      0.84      6041



In [63]:
print(classification_report(y_test, y_test_predict))

              precision    recall  f1-score   support

       False       0.87      0.98      0.93      2189
        True       0.73      0.22      0.34       401

    accuracy                           0.87      2590
   macro avg       0.80      0.60      0.63      2590
weighted avg       0.85      0.87      0.83      2590

