In [1]:
from multiprocessing import Pool, cpu_count
import gc; gc.enable()

import pandas as pd
import numpy as np
from sklearn import *
from datetime import datetime
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

<h2>Feature Engineering

<h3>Der Adaptive-Domain-Labeling-Ansatz setzt ab hier auf. Die Dateien train.csv und test.csv werden in SQLite importiert und manipuliert und anschließend in den folgenden Dateien bereitgestellt:

In [4]:
# Load the feature-engineered data

## Read the finished feature export file #1
wsdm_dataset = pd.read_csv(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\export_wsdm.csv")

## Read the labels
wsdm_dataset_labels = pd.read_csv(r"D:\Work_Masterarbeit\wsdm\train_v2.csv")

In [5]:
# Merge the training features with the labels on 'msno'.
# The inner join keeps only the msno values that occur in both frames.
wsdm_dataset = pd.merge(wsdm_dataset, wsdm_dataset_labels, on='msno', how='inner')

In [6]:
# Move the identifier and the label to the front of the training frame:
# each column is popped from its current position and re-inserted at the
# target index, so 'msno' ends up first and 'is_churn' second.
for target_position, column_name in enumerate(['msno', 'is_churn']):
    wsdm_dataset.insert(target_position, column_name, wsdm_dataset.pop(column_name))

In [2]:
# reference: https://medium.com/@vincentteyssier/optimizing-the-size-of-a-pandas-dataframe-for-low-memory-environment-5f07db3d72e
# reference: https://numpy.org/doc/stable/reference/generated/numpy.iinfo.html
# reference: https://numpy.org/doc/stable/reference/generated/numpy.finfo.html

def datatype_changer(dataset):
    """Shrink the memory footprint of ``dataset`` by downcasting its columns.

    The frame is modified in place and the same object is returned.

    * NumPy integer columns (signed or unsigned, any width) are cast to the
      narrowest integer type of the same signedness whose range contains the
      column's minimum and maximum (bounds inclusive).
    * NumPy float columns (any width) are cast to the narrowest of
      float16 / float32 / float64 whose finite range contains the column's
      minimum and maximum. Only the value *range* is checked, not the
      precision, so values may be rounded by the narrower type.
    * Every other column (object, bool, datetime, pandas extension dtypes, ...)
      is converted to ``category``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to optimise.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` itself, with downcast columns.
    """
    integer_targets = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_targets = (np.float16, np.float32, np.float64)

    for col in dataset.columns:
        col_type = dataset[col].dtype
        # Classify by dtype *kind* instead of comparing with the builtins
        # ``int`` / ``float``: those only equal the platform-default width, so
        # e.g. int32 or float32 columns were previously turned into categories.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in integer_targets:
            candidates, limits = integer_targets[kind], np.iinfo
        elif kind == 'f':
            candidates, limits = float_targets, np.finfo
        else:
            # keep everything non-numeric as category instead of object
            dataset[col] = dataset[col].astype('category')
            continue

        min_val = dataset[col].min()
        max_val = dataset[col].max()

        # Fall back to the widest type when no narrower one fits; this also
        # covers NaN / inf extremes, for which every comparison below is False.
        target = candidates[-1]
        for candidate in candidates:
            bounds = limits(candidate)
            if min_val >= bounds.min and max_val <= bounds.max:
                target = candidate
                break
        dataset[col] = dataset[col].astype(target)

    # return the (in-place modified) dataframe
    return dataset

In [8]:
# Remove duplicated features (disabled: the statement below is commented out)
# wsdm_dataset = wsdm_dataset.loc[:,~wsdm_dataset.T.duplicated(keep='first')]

In [9]:
# Drop the column that, per the author's note, holds null values.
# drop() returns a new frame, which replaces the old one.
wsdm_dataset = wsdm_dataset.drop(columns=['DiffMAXdateMAXdate'])

In [10]:
## Fix the data types: downcast numeric columns and turn the rest into categories
wsdm_dataset = datatype_changer(wsdm_dataset)

In [11]:
# Number of missing values per column
wsdm_dataset.isnull().sum()

msno                    0
is_churn                0
AVGregistered_via       0
AVGpayment_plan_days    0
AVGgender               0
                       ..
MAXnum_unqstock         0
MAXnum_25stock          0
MAXnum_100stock         0
MAXtotal_secsstock      0
msno.1                  0
Length: 142, dtype: int64

In [12]:
# saving wsdm file (without the index column)
# NOTE(review): CSV is plain text, so the downcast / category dtypes set above
# are presumably not preserved when the file is read back - confirm.
wsdm_dataset.to_csv(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\wsdm_FA_adaptive_dl.csv", index=False)

<h1>3. Prepare Data for Modelling

In [3]:
# loading the wsdm dataset that the feature-engineering section wrote to disk
wsdm_dataset = pd.read_csv(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\wsdm_FA_adaptive_dl.csv")

In [4]:
# saving actual train dataset
# (in the visible cells nothing modifies the frame between the load above and
# this save, so this writes the same data under a second file name)
wsdm_dataset.to_csv(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_train.csv", index=False)

<h2> 3.2 Train Test Split

In [5]:
# loading the actual wsdm training data written by the previous section
wsdm_data = pd.read_csv(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_train.csv")

In [6]:
# Load the raw data (train.csv from the preprocessed_2 folder)
wsdm_dataset_raw = pd.read_csv(r"D:\Work_Masterarbeit\wsdm\preprocessed_2\train.csv")

In [7]:
# Merging the engineered features onto the raw data.
# The left join keeps every raw row; raw rows whose msno has no engineered
# features get NaN there. Non-key columns present in both frames receive the
# pandas default '_x' (raw) / '_y' (engineered) suffixes.
wsdm_dataset = pd.merge(wsdm_dataset_raw, wsdm_data, on='msno', how='left')

In [8]:
# (rows, columns) of the merged frame
wsdm_dataset.shape

(16887877, 164)

In [9]:
## Sample 50% of the data due to memory restrictions.
# A fixed random_state makes the subsample - and every array, model and metric
# derived from it - reproducible across kernel restarts. 42 is the same seed
# the train/test split further down uses.
wsdm_data = wsdm_dataset.sample(frac=.5, random_state=42)

In [10]:
# Type conversion: strip the '-' separators from the four date-like string
# columns and store the remaining digits as floats.
for date_column in ('transaction_date', 'date', 'registration_init_time', 'membership_expire_date'):
    digits_only = wsdm_data[date_column].str.replace('-', '')
    wsdm_data[date_column] = digits_only.astype(float)

In [11]:
# Remove the unneeded 'msno.1' column; drop() returns a new frame.
wsdm_data = wsdm_data.drop(columns=['msno.1'])

In [12]:
# https://stackoverflow.com/a/29651514
def normalize(df, exclude=('msno', 'is_churn')):
    """Min-max scale the numeric columns of ``df`` to the range [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    exclude : str or iterable of str, optional
        Column names that are copied through unchanged. Defaults to the
        identifier and label names ('msno', 'is_churn').

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every numeric, non-excluded column holds
        ``(x - min) / (max - min)``.

    Notes
    -----
    * A constant column has ``max == min``; it is mapped to 0.0 instead of the
      0/0 = NaN the plain formula would produce (missing values stay NaN).
    * Non-numeric and boolean columns cannot be scaled and are left as they
      are instead of raising a TypeError.
    """
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = {str(name) for name in exclude}

    result1 = df.copy()
    for feature_name in df.columns:
        if str(feature_name) in skipped:
            continue

        column = df[feature_name]
        if not pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
            continue

        max_value = column.max()
        min_value = column.min()
        value_range = max_value - min_value
        if value_range == 0:
            # Multiplying by 0.0 yields float zeros and keeps NaN as NaN.
            result1[feature_name] = column * 0.0
        else:
            result1[feature_name] = (column - min_value) / value_range
    return result1

# Min-max scale the columns of the sampled frame.
# NOTE(review): normalize() only skips 'msno' and 'is_churn'; the merged frame
# carries the label as 'is_churn_x' / 'is_churn_y', so those columns are scaled too.
wsdm_data = normalize(wsdm_data)

In [13]:
# checking whether the frame contains any null value at all (single boolean)
wsdm_data.isnull().values.any()

True

In [14]:
# Downcast the numeric columns and convert the remaining ones to category
wsdm_data = datatype_changer(wsdm_data)

In [15]:
# getting columns and converting them to list
cols = wsdm_data.columns.tolist()
# saving them for further use; passed positionally, so the archive stores them under the key 'arr_0'
np.savez_compressed(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\columns_incraw.npz", cols)

In [16]:
# replacing all nan and infinity (positive AND negative) with 0;
# the previous version left -inf values untouched
wsdm_data = wsdm_data.replace(np.nan, 0)
wsdm_data = wsdm_data.replace([np.inf, -np.inf], 0)

In [17]:
# Preview the first rows of the cleaned frame
wsdm_data.head()

Unnamed: 0,msno,is_churn_x,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,...,MAXcitystock,MAXbdstock,MAXpayment_method_idstock,MAXis_auto_renewstock,MAXplan_list_pricestock,MAXactual_amount_paidstock,MAXnum_unqstock,MAXnum_25stock,MAXnum_100stock,MAXtotal_secsstock
4130337,4J7qpOlq4Ph73NKRa8A6467pI8kZL/yS7b4P/9lLe94=,0.0,0.227295,0.528809,1.0,0.692383,0.234497,0.975586,1.0,0.827637,...,0.227295,0.528809,0.975586,0.5,0.827637,0.827637,0.671875,0.466553,0.864746,0.883301
10276491,TXmElz+ayCjss0TB3Rkx7t9jng1aDlKA1MREpdbIE4I=,1.0,0.59082,0.399902,1.0,0.230713,0.691406,0.951172,1.0,0.827637,...,0.59082,0.399902,0.951172,0.5,0.827637,0.827637,0.955078,1.0,0.905273,0.916992
16861633,u7srW/4bbLDuKgVTyZcNuS0dsvUVvAxyk8D876AyFUI=,0.0,0.045441,0.385742,0.0,0.538574,0.925293,1.0,1.0,0.549805,...,0.045441,0.385742,1.0,0.5,0.549805,0.549805,0.582031,0.733398,0.5,0.508789
4403731,Tcp+Hw+sddFv3KUqrIWgUrCCILehpM4OK1vVK/L9w7s=,0.0,0.045441,0.385742,0.0,0.538574,0.849121,1.0,1.0,0.549805,...,0.045441,0.385742,1.0,0.5,0.549805,0.549805,0.910645,0.333252,0.878418,0.791016
11842115,kJJKLqnz+Exi/diWLSSGlBZn5Q/p2HY0cvynl+WfKlM=,0.0,0.045441,0.385742,0.0,0.538574,0.695801,1.0,1.0,0.827637,...,0.045441,0.385742,1.0,0.5,0.827637,0.827637,0.746094,0.466553,0.5,0.472656


In [18]:
# getting labels for y as a numpy array
# ('is_churn_x' is the copy of the label that came from the left, raw frame of the merge)
labels = wsdm_data['is_churn_x'].values
labels

array([0., 1., 0., ..., 0., 0., 0.], dtype=float16)

In [19]:
# getting the features for X: everything except the identifier and both label copies
data = wsdm_data.drop(['msno', 'is_churn_x','is_churn_y'], axis=1, inplace=False)
data

Unnamed: 0,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,...,MAXcitystock,MAXbdstock,MAXpayment_method_idstock,MAXis_auto_renewstock,MAXplan_list_pricestock,MAXactual_amount_paidstock,MAXnum_unqstock,MAXnum_25stock,MAXnum_100stock,MAXtotal_secsstock
4130337,0.227295,0.528809,1.0,0.692383,0.234497,0.975586,1.0,0.827637,0.827637,0.5,...,0.227295,0.528809,0.975586,0.5,0.827637,0.827637,0.671875,0.466553,0.864746,0.883301
10276491,0.590820,0.399902,1.0,0.230713,0.691406,0.951172,1.0,0.827637,0.827637,0.5,...,0.590820,0.399902,0.951172,0.5,0.827637,0.827637,0.955078,1.000000,0.905273,0.916992
16861633,0.045441,0.385742,0.0,0.538574,0.925293,1.000000,1.0,0.549805,0.549805,0.5,...,0.045441,0.385742,1.000000,0.5,0.549805,0.549805,0.582031,0.733398,0.500000,0.508789
4403731,0.045441,0.385742,0.0,0.538574,0.849121,1.000000,1.0,0.549805,0.549805,0.5,...,0.045441,0.385742,1.000000,0.5,0.549805,0.549805,0.910645,0.333252,0.878418,0.791016
11842115,0.045441,0.385742,0.0,0.538574,0.695801,1.000000,1.0,0.827637,0.827637,0.5,...,0.045441,0.385742,1.000000,0.5,0.827637,0.827637,0.746094,0.466553,0.500000,0.472656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2533911,0.227295,0.428467,1.0,0.692383,0.077698,0.951172,1.0,0.827637,0.827637,0.5,...,0.227295,0.428467,0.951172,0.5,0.827637,0.827637,0.984863,1.000000,0.892090,0.906738
1383048,0.772949,0.300049,1.0,0.538574,0.852051,1.000000,1.0,0.549805,0.549805,0.5,...,0.772949,0.300049,1.000000,0.5,0.549805,0.549805,0.656738,1.000000,0.716309,0.679199
15743692,0.772949,0.357178,0.5,0.692383,0.077515,0.975586,1.0,0.827637,0.827637,0.5,...,0.772949,0.357178,0.975586,0.5,0.827637,0.827637,0.806152,0.866699,0.945801,0.905762
15183433,0.227295,0.242798,0.5,0.230713,0.846680,0.926758,1.0,0.827637,0.827637,0.0,...,0.227295,0.242798,0.926758,0.0,0.827637,0.827637,0.850586,0.533203,0.526855,0.645508


<h2> Daten aufteilen in 70% Training und 30% Test

In [20]:
# train test split: 70% training / 30% test, seeded for reproducibility
# NOTE(review): the split is done per row; if one msno spans several rows, the
# same user can end up in both sets - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)

# sanity check: feature and label row counts must match within each split
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(5910756, 160) (5910756,)
(2533182, 160) (2533182,)


In [21]:
# converting X_train to a numpy array (column names and index are dropped)
wsdm_arr = X_train.values
wsdm_arr

array([[0.22729492, 0.24279785, 1.        , ..., 0.46655273, 0.91894531,
        0.97753906],
       [0.68164062, 0.30004883, 0.5       , ..., 0.86669922, 0.98632812,
        0.96826172],
       [0.04544067, 0.38574219, 0.        , ..., 0.60009766, 0.47290039,
        0.45605469],
       ...,
       [0.1817627 , 0.30004883, 0.5       , ..., 0.73339844, 0.72949219,
        0.78759766],
       [0.27270508, 0.35717773, 0.5       , ..., 0.39990234, 0.44604492,
        0.51611328],
       [0.22729492, 0.57128906, 1.        , ..., 0.86669922, 0.72949219,
        0.7734375 ]])

In [22]:
# checking shape of the training array
wsdm_arr.shape

(5910756, 160)

In [23]:
# saving the training features for the modelling section (stored under key 'arr_0')
np.savez_compressed(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_train_incraw.npz", wsdm_arr)

In [24]:
# displaying y_train (already a numpy array, so no conversion is needed)
wsdm_labels_arr = y_train
wsdm_labels_arr

array([0., 0., 0., ..., 0., 0., 0.], dtype=float16)

In [25]:
# checking shape of the training labels array
wsdm_labels_arr.shape

(5910756,)

In [26]:
# saving the training labels for the modelling section (stored under key 'arr_0')
np.savez_compressed(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_wsdm_labels_incraw.npz", wsdm_labels_arr)

In [27]:
# converting X_test to a numpy array (column names and index are dropped)
X_arr = X_test.values
X_arr

array([[0.04544067, 0.38574219, 0.        , ..., 0.86669922, 0.59472656,
        0.59521484],
       [0.04544067, 0.38574219, 0.        , ..., 0.39990234, 0.09460449,
        0.11645508],
       [0.1817627 , 0.25708008, 0.5       , ..., 0.66650391, 0.58105469,
        0.64550781],
       ...,
       [0.59082031, 0.45703125, 1.        , ..., 1.        , 0.56738281,
        0.52148438],
       [0.77294922, 0.32861328, 1.        , ..., 0.66650391, 0.32421875,
        0.35253906],
       [0.59082031, 0.64306641, 0.5       , ..., 0.79980469, 0.43237305,
        0.3918457 ]])

In [28]:
# checking shape of the test feature array
X_arr.shape

(2533182, 160)

In [29]:
# saving the test features for the modelling section (stored under key 'arr_0')
np.savez_compressed(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_test_incraw.npz", X_arr)

In [30]:
# displaying y_test (already a numpy array, so no conversion is needed)
X_labels_arr = y_test
X_labels_arr

array([0., 0., 0., ..., 0., 0., 0.], dtype=float16)

In [31]:
# checking shape of the test labels array
X_labels_arr.shape

(2533182,)

In [32]:
# saving the test labels for the modelling section (stored under key 'arr_0')
np.savez_compressed(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_test_labels_incraw.npz", X_labels_arr)

<h1> 5. Modelling

In [41]:
# loading library
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
#import seaborn as sns
#from prettytable import PrettyTable

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#import lightgbm as lgb
#import xgboost as xgb
from sklearn import metrics
#import catboost as cb


In [42]:
# Open the four .npz archives written at the end of the data-preparation section.
# np.load returns an archive object for .npz files; the arrays themselves are
# pulled out of it by key in the next cell.
X_train = np.load(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_train_incraw.npz")
y_train = np.load(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_wsdm_labels_incraw.npz")
X_test = np.load(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_test_incraw.npz")
y_test = np.load(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\actual_test_labels_incraw.npz")

In [43]:
# At this point the four names still refer to the opened .npz archives.
# Each archive holds exactly one positionally saved array, stored as 'arr_0'.
opened_archives = (X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = [archive['arr_0'] for archive in opened_archives]

# The arrays are now in memory, so release the underlying file handles
# (otherwise the .npz files stay open for the lifetime of the kernel).
for archive in opened_archives:
    archive.close()
del opened_archives

# getting shapes
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(5910756, 160) (5910756,)
(2533182, 160) (2533182,)


In [44]:
# loading the column names saved during data preparation; the context manager
# closes the archive once the list has been extracted
with np.load(r"D:\Work_Masterarbeit\wsdm\preprocessed_4\columns_incraw.npz") as cols_archive:
    # converting numpy array to python list
    cols = cols_archive['arr_0'].tolist()

# The feature matrix was built by dropping 'msno', 'is_churn_x' AND 'is_churn_y',
# so all three names have to go; removing only the first two leaves one name too
# many and misaligns the names with the feature columns.
non_feature_cols = {'msno', 'is_churn_x', 'is_churn_y'}
cols = [col_name for col_name in cols if col_name not in non_feature_cols]
assert len(cols) == X_train.shape[1], "column names are out of sync with the feature matrix"

<h1> Logistic Regression

In [None]:
%%time

# fitting logistic regression model with best parameter values
# (the search that produced these values is not visible in this notebook)
# class_weight='balanced' asks scikit-learn to weight the classes inversely to their frequency
lr_model = LogisticRegression(penalty='l2', tol=0.0001, C=10, solver='newton-cg', class_weight='balanced', n_jobs=-1)
lr_model.fit(X_train, y_train)

In [None]:
# Reference: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/#:~:text=Saving%20Your%20Model-,Save%20Your%20Model%20with%20pickle,it%20to%20make%20new%20predictions.
# saving the trained logistic regression ML model to disk with joblib
filename = r"D:\Work_Masterarbeit\wsdm\preprocessed_4\finalized_model_lr.sav"
joblib.dump(lr_model, filename)

In [None]:
# loading the logistic regression model back from disk
# (joblib.load unpickles the file, so only load files you created yourself)
filename = r"D:\Work_Masterarbeit\wsdm\preprocessed_4\finalized_model_lr.sav"
loaded_model = joblib.load(filename)

In [None]:
## Logloss of the reloaded logistic regression on the test set
# predicting class probabilities for X_test
pred_test = loaded_model.predict_proba(X_test)
# keeping the second probability column only (class label 1 for the 0/1 labels used here)
pred_test = pred_test[:,1]
# calculating log loss
print('Test log-loss: ', log_loss(y_test, pred_test))

<h3>Model Evaluation

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Hard class predictions of the in-memory logistic regression (lr_model, not
# the reloaded copy) on the hold-out set
lr_predict = lr_model.predict(X_test)

# Every metric is paired with its printed label and emitted in a fixed order;
# recall and precision are the support-weighted averages over both classes.
lr_report = [
    ('Accuracy:', accuracy_score(y_test, lr_predict)),
    ('F1 score weighted:', f1_score(y_test, lr_predict, average='weighted')),
    ('F1 score macro:', f1_score(y_test, lr_predict, average='macro')),
    ('Recall:', recall_score(y_test, lr_predict, average='weighted')),
    ('Precision:', precision_score(y_test, lr_predict, average='weighted')),
    ('\n clasification report:\n', classification_report(y_test, lr_predict)),
    ('\n confussion matrix:\n', confusion_matrix(y_test, lr_predict)),
]
for report_label, report_value in lr_report:
    print(report_label, report_value)

<h2> Decision Tree

In [None]:
%%time

# training a decision tree classifier model with best parameters
dtc_model = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=15, min_samples_split=2, min_samples_leaf=1)
dtc_model.fit(X_train, y_train)

In [None]:
# https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/#:~:text=Saving%20Your%20Model-,Save%20Your%20Model%20with%20pickle,it%20to%20make%20new%20predictions.
# saving the decision tree classifier model to disk with joblib
filename = r"D:\Work_Masterarbeit\wsdm\preprocessed_4\finalized_model_dtc.sav"
joblib.dump(dtc_model, filename)

In [None]:
# loading the decision tree classifier model back from disk
# (joblib.load unpickles the file, so only load files you created yourself)
filename = r"D:\Work_Masterarbeit\wsdm\preprocessed_4\finalized_model_dtc.sav"
loaded_model = joblib.load(filename)

In [None]:
## Logloss of the reloaded decision tree on the test set
# predicting class probabilities for X_test
pred_test = loaded_model.predict_proba(X_test)
# keeping the second probability column only (class label 1 for the 0/1 labels used here)
pred_test = pred_test[:,1]
# calculating log loss
print('Test log-loss: ', log_loss(y_test, pred_test))

<h3> Model Evaluation

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Hard class predictions of the in-memory decision tree (dtc_model, not the
# reloaded copy) on the hold-out set
dtc_modelpred = dtc_model.predict(X_test)

# Every metric is paired with its printed label and emitted in a fixed order;
# recall and precision are the support-weighted averages over both classes.
dtc_report = [
    ('Accuracy:', accuracy_score(y_test, dtc_modelpred)),
    ('F1 score weighted:', f1_score(y_test, dtc_modelpred, average='weighted')),
    ('F1 score macro:', f1_score(y_test, dtc_modelpred, average='macro')),
    ('Recall:', recall_score(y_test, dtc_modelpred, average='weighted')),
    ('Precision:', precision_score(y_test, dtc_modelpred, average='weighted')),
    ('\n clasification report:\n', classification_report(y_test, dtc_modelpred)),
    ('\n confussion matrix:\n', confusion_matrix(y_test, dtc_modelpred)),
]
for report_label, report_value in dtc_report:
    print(report_label, report_value)

<h2> Random Forest

In [None]:
%%time

# training the random forest classifier for best hyper parameters
rf_model = RandomForestClassifier(n_estimators=1000, criterion='gini', max_depth=15, min_samples_split=5, min_samples_leaf=2, max_features='sqrt', bootstrap=True, n_jobs=-1)
rf_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Hard class predictions of the in-memory random forest (rf_model) on the
# hold-out set
rf_modelpred = rf_model.predict(X_test)

# Every metric is paired with its printed label and emitted in a fixed order;
# recall and precision are the support-weighted averages over both classes.
rf_report = [
    ('Accuracy:', accuracy_score(y_test, rf_modelpred)),
    ('F1 score weighted:', f1_score(y_test, rf_modelpred, average='weighted')),
    ('F1 score macro:', f1_score(y_test, rf_modelpred, average='macro')),
    ('Recall:', recall_score(y_test, rf_modelpred, average='weighted')),
    ('Precision:', precision_score(y_test, rf_modelpred, average='weighted')),
    ('\n clasification report:\n', classification_report(y_test, rf_modelpred)),
    ('\n confussion matrix:\n', confusion_matrix(y_test, rf_modelpred)),
]
for report_label, report_value in rf_report:
    print(report_label, report_value)

In [None]:
# https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/#:~:text=Saving%20Your%20Model-,Save%20Your%20Model%20with%20pickle,it%20to%20make%20new%20predictions.
# saving the random forest classifier model to disk with joblib
filename = r"D:\Work_Masterarbeit\wsdm\preprocessed_4\finalized_model_rf.sav"
joblib.dump(rf_model, filename)

In [None]:
# loading the random forest classifier model back from disk
# (joblib.load unpickles the file, so only load files you created yourself)
filename = r"D:\Work_Masterarbeit\wsdm\preprocessed_4\finalized_model_rf.sav"
loaded_model = joblib.load(filename)

In [None]:
## Logloss of the reloaded random forest on the test set
# predicting class probabilities for X_test
pred_test = loaded_model.predict_proba(X_test)
# keeping the second probability column only (class label 1 for the 0/1 labels used here)
pred_test = pred_test[:,1]
# calculating log loss
print('Test log-loss: ', log_loss(y_test, pred_test))