In [7]:
import re
import graphviz
import math

import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score,roc_auc_score, confusion_matrix
from sklearn.metrics import classification_report
from scipy import stats
from scipy.stats import ks_2samp
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
sns.set_theme(style="ticks", color_codes=True)

from sklearn.ensemble import RandomForestClassifier

In [217]:
# Load the raw applications dataset from a pickle stored relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
alloy_persons_data_path = '../data/raw/apps_data.pkl'
alloy_persons_data = pd.read_pickle(alloy_persons_data_path)

In [294]:
def _missing_mask(series):
    """Return a boolean mask of the entries treated as missing: null/NaN, plus '' for object/category columns."""
    mask = series.isna()
    if series.dtype.name in ('object', 'category'):
        # Empty strings count as missing for text-like columns only.
        mask = mask | (series == '')
    return mask


def impute_nulls(train, test=None, num_impute_type='median'):
    """
    Function to find the nulls/NaNs (and empty strings) and impute them column by column.
    'train' is a necessary parameter and 'test' is an optional parameter.

    Only columns whose dtype is int64, float64, object or category are processed;
    every other dtype is left untouched. The impute value is always computed from
    the non-missing values of 'train' and reused for 'test', so no statistic is
    derived from the test set. Columns with no usable value at all are skipped.

    Both dataframes are modified IN PLACE; the returned objects are the very
    same objects that were passed in.

        Parameters:
            train (dataframe): Dataset the impute values are learned from
            test (dataframe): Optional dataset; it is imputed only when it is non-empty
                and contains every column of 'train'
            num_impute_type (string): 'mean' or 'median' impute method for numerical columns
                (anything other than 'mean' means median);
                for categorical columns, the mode is used

        Returns:
            train (dataframe), test (dataframe): when 'test' was imputed as well,
                or when 'train' has no rows (both are then returned unchanged)
            train (dataframe): in every other case
    """
    if test is None:
        # A fresh frame per call instead of a shared mutable default argument.
        test = pd.DataFrame()

    if train.shape[0] == 0:
        # Nothing to learn impute values from; hand the inputs back untouched.
        if test.shape[0] == 0:
            print("Given train and test datasets are empty")
        return train, test

    # Loop-invariant: 'test' is imputed only if it has rows and every train column.
    impute_test = test.shape[0] > 0 and set(train.columns).issubset(test.columns)

    for col in train.columns:
        dtype_name = train[col].dtype.name
        if dtype_name in ('int64', 'float64'):
            is_numeric = True
        elif dtype_name in ('object', 'category'):
            is_numeric = False
        else:
            continue

        train_missing = _missing_mask(train[col])
        observed = train.loc[~train_missing, col]

        if is_numeric:
            if num_impute_type == 'mean':
                impute_value = observed.mean()
            else:
                impute_value = observed.median()
            if pd.isna(impute_value):
                # Column has no observed values at all; nothing sensible to fill in.
                continue
        else:
            # Mode over observed values only, so '' can never be chosen as the fill value.
            modes = observed.mode()
            if modes.empty:
                continue
            impute_value = modes.iloc[0]

        # Guard with .any() so columns without gaps are never written to
        # (avoids needless dtype changes, e.g. a float median into an int column).
        if train_missing.any():
            train.loc[train_missing, col] = impute_value

        if impute_test:
            test_missing = _missing_mask(test[col])
            if test_missing.any():
                test.loc[test_missing, col] = impute_value

    if impute_test:
        return train, test
    return train

ID columns and other columns with more than 20 unique values are ignored during encoding.

In [280]:
def feature_encoding(data=None, encode_columns=None, encode_type='onehot', max_unique_values=20):
    """
    Function to encode the categorical variables.
    'data' is a necessary parameter; the remaining parameters are optional.

    Candidate columns are 'encode_columns' when given, otherwise every column
    whose dtype is object, category or bool. A candidate is encoded only if it
    has at most 'max_unique_values' distinct non-null values; all other columns
    are passed through unchanged. The input dataframe is not modified.

    The result holds the pass-through columns first (in their original order),
    followed by the encoded columns.

        Parameters:
            data (dataframe): Dataframe dataset
            encode_columns (list): List of columns that require encoding;
                None or an empty list means "detect them from the dtypes"
            encode_type (string): 'onehot' (pd.get_dummies) or 'label' encoding method;
                any value other than 'onehot' selects label encoding.
                With 'onehot', pd.get_dummies' default column selection applies, so
                candidates that are not object/string/category stay as they are.
                With 'label', codes come from the sorted categories and missing
                values are encoded as -1.
            max_unique_values (int): Highest number of distinct values a column may
                have to still be encoded

        Returns:
            data (dataframe): Transformed dataframe (the input itself when it has no rows)
    """
    if data is None:
        # Mirrors the previous default of an empty dataframe.
        return pd.DataFrame()
    if data.shape[0] == 0:
        return data

    if encode_columns is None or len(encode_columns) == 0:
        candidates = [col for col in data.columns
                      if data[col].dtype.name in ('object', 'category', 'bool')]
    else:
        candidates = list(encode_columns)

    # Series.nunique() directly, rather than positional [0] on a label-indexed agg result.
    cat_columns = [col for col in candidates if data[col].nunique() <= max_unique_values]

    # Keep the original column order; a set difference would make it vary between runs.
    to_encode = set(cat_columns)
    rest_columns = [col for col in data.columns if col not in to_encode]

    if encode_type == 'onehot':
        encoded = pd.get_dummies(data[cat_columns])
    else:
        # Start from a copy so the frame already carries the right index.
        encoded = data[cat_columns].copy()
        for col in cat_columns:
            encoded[col] = data[col].astype('category').cat.codes

    if not rest_columns:
        return encoded
    return pd.concat([data[rest_columns], encoded], axis=1)

In [295]:
# impute_nulls fills the frame in place and returns it, so `data` and
# `alloy_persons_data` refer to the same (now imputed) DataFrame after this cell.
data = impute_nulls(train=alloy_persons_data)

In [297]:
# Label-encode the object/category/bool columns that have at most 60 distinct values.
# This reads `alloy_persons_data`, which is expected to have been imputed in place
# by the impute_nulls cell -- run that cell first.
data = feature_encoding(data=alloy_persons_data, encode_columns=[], max_unique_values=60, encode_type='label')

In [298]:
# Sanity check: (rows, columns) of the imputed and encoded dataset.
data.shape

(116433, 101)

In [299]:
def classification_models(x_train, y_train, params_lr=None, params_svc=None, params_dtc=None,
                          params_rfc=None, params_xgbc=None, models=None):
    """
    Function to train logistic regression, SVC, decision tree, random forest
    and XGBoost classifiers on the same training data.
    'x_train' and 'y_train' are necessary parameters; the remaining ones are optional.

    Each params dict is unpacked into the estimator's constructor as keyword
    arguments (e.g. params_rfc={'n_estimators': 200, 'random_state': 0}).
    Model keys in 'models' that are not recognised are ignored.

        Parameters:
            x_train (dataframe): Training features
            y_train (dataframe): Training labels
            params_lr (dict): logistic regression parameters (None/{} = library defaults)
            params_svc (dict): SVC parameters; None means {'kernel': 'linear'},
                an explicit {} means library defaults
            params_dtc (dict): decision tree parameters (None/{} = library defaults)
            params_rfc (dict): random forest classifier parameters (None/{} = library defaults)
            params_xgbc (dict): xgboost classifier parameters (None/{} = library defaults)
            models (list): subset of ['lr','svc','dtc','rfc','xgbc'];
                None or an empty list trains all five

        Returns:
            lr (object): trained model output
            svc (object): trained model output
            dtc (object): trained model output
            rfc (object): trained model output
            xgbc (object): trained model output
            A model that was not requested is returned as the empty string ''.
    """
    if params_svc is None:
        # Historical default of this function for the SVC.
        params_svc = {'kernel': 'linear'}

    train_all = models is None or len(models) == 0

    # (key, estimator class, constructor kwargs) in the order of the return tuple.
    registry = (
        ('lr', LogisticRegression, params_lr),
        ('svc', SVC, params_svc),
        ('dtc', DecisionTreeClassifier, params_dtc),
        ('rfc', RandomForestClassifier, params_rfc),
        ('xgbc', XGBClassifier, params_xgbc),
    )

    trained = []
    for key, estimator_cls, params in registry:
        if train_all or key in models:
            # Unpack with ** -- passing the dict positionally would bind it to the
            # estimator's first parameter (or fail for keyword-only constructors).
            estimator = estimator_cls(**(params or {}))
            trained.append(estimator.fit(x_train, y_train))
        else:
            trained.append('')

    return tuple(trained)

In [None]:
def outlier_treatment():
    
    
    