
![BTS](https://github.com/vfp1/bts-dsf-2020/raw/main/Logo-BTS.jpg)

# 3rd_Assignment-C

### Lenin Escobar <lenin.escobar@bts.tech> - Data Driven Business (2020-12-21)

Open this notebook in Google Colab: [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lveagithub/bts-ddb-2020/blob/master/3rdDatascienceFinance/code/3rd_Assigment-C_Helper.ipynb)

### Import libraries

In [1]:
#General purposes
import numpy as np
import pandas as pd
import time
from contextlib import contextmanager
import gc
import warnings
# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from imblearn.over_sampling import SMOTE, ADASYN
#Missing values
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
#Training
from sklearn.model_selection import train_test_split
#Models
from sklearn.linear_model import LogisticRegression
#Metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import jaccard_score
import itertools
# Notebook-wide preferences: warning noise and DataFrame display limits
# Ignore every FutureWarning (equivalent to a simple filter with no message/module pattern)
warnings.filterwarnings('ignore', category=FutureWarning)
# None lifts pandas' truncation limit, so all columns and rows are displayed
pd.options.display.max_columns = None
pd.options.display.max_rows = None

<h1 style="background-color:powderblue;">General helper Classes</h1>

In [None]:
class CleaningHelper():
    """Helper bundling the loading, cleaning, feature-engineering, modelling
    and scoring steps used by this notebook.

    Every method that deals with labels expects them in a column named
    ``'TARGET'``.  Most methods print shape/progress information as a side
    effect, in addition to returning their result.
    """

    # Default dataset locations, relative to the notebook's working directory.
    TRAIN_CSV_PATH = '../home-credit-default-risk/dataset/application_train.csv'
    TEST_CSV_PATH = '../home-credit-default-risk/dataset/application_test.csv'

    def __init__(self, version):
        """Store *version* and silence pandas' chained-assignment warning.

        NOTE: the pandas option is process-wide, not scoped to this instance.
        """
        pd.options.mode.chained_assignment = None  # default='warn'
        self.version = version
        # Scaled test matrix produced by the last scaling_value_treatment() call.
        self.scaled_test_ = None

    def __str__(self):
        return f"Cleaning helper version {self.version}"

    @staticmethod
    @contextmanager
    def timer(title):
        """Context manager printing how long the wrapped block took.

        Declared static so that both ``CleaningHelper.timer("x")`` and
        ``helper.timer("x")`` work; without it, the instance form would bind
        the instance to ``title`` and fail with a ``TypeError``.

        Nothing is printed when the wrapped block raises.
        """
        t0 = time.time()
        yield
        print("{} - done in {:.0f}s".format(title, time.time() - t0))

    @staticmethod
    def _legacy_global(name):
        """Return the module-level variable *name* (legacy notebook fallback).

        Raises:
            NameError: if no such module-level variable exists.
        """
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"name '{name}' is not defined; pass it as an argument"
            ) from None

    def get_nulls_data(self, df_):
        """Summarise data quality per column.

        Returns a DataFrame indexed by column name, sorted by descending
        number of nulls, with two columns:

        * ``Total``       - number of null values in the column
        * ``PercNotNull`` - percentage (0-100, 2 decimals) of non-null values
        """
        null_counts = df_.isnull().sum().sort_values(ascending=False)
        not_null_pct = 100 - (null_counts / len(df_) * 100).round(2)
        return pd.concat([null_counts, not_null_pct], axis=1,
                         keys=["Total", "PercNotNull"])

    def load_training_data(self, path=None):
        """Read the training CSV (``TRAIN_CSV_PATH`` unless *path* is given)."""
        df_app_train = pd.read_csv(self.TRAIN_CSV_PATH if path is None else path)
        # The original message mislabelled this as "Testing data shape".
        print('Training data shape: ', df_app_train.shape)
        return df_app_train

    def load_testing_data(self, path=None):
        """Read the testing CSV (``TEST_CSV_PATH`` unless *path* is given)."""
        df_app_test = pd.read_csv(self.TEST_CSV_PATH if path is None else path)
        print('Testing data shape: ', df_app_test.shape)
        return df_app_test

    def label_encoding(self,app_train,app_test):
        """Label-encode, IN PLACE, every object column with at most 2 unique values.

        The encoder is fitted on the training column only and then applied to
        the same column of both frames.  Nothing is returned; both arguments
        are mutated.

        Code taken from:
        https://github.com/LZhemin/home-credit-default-risk/blob/master/3_PreProcessed_Models/PreProcessed_Models.ipynb
        """
        le = LabelEncoder()
        le_count = 0

        for col in app_train:
            if app_train[col].dtype != 'object':
                continue
            # unique() counts NaN as a value, so a binary column with
            # missing entries is deliberately left for one-hot encoding.
            if len(app_train[col].unique()) > 2:
                continue

            # Fit on the training data only, then transform both frames
            le.fit(app_train[col])
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])

            # Keep track of how many columns were label encoded
            le_count += 1

        print('%d columns were label encoded in test and train dataframes.' % le_count)

    def one_hot_encoding(self,app_train,app_test):
        """Return one-hot-encoded copies of both frames (``pd.get_dummies``).

        The two frames are encoded independently, so their columns may differ
        afterwards; see ``align_train_test``.

        Code taken from:
        https://github.com/LZhemin/home-credit-default-risk/blob/master/3_PreProcessed_Models/PreProcessed_Models.ipynb
        """
        app_train_dummies = pd.get_dummies(app_train)
        app_test_dummies = pd.get_dummies(app_test)

        print('Training Features shape: ', app_train_dummies.shape)
        print('Testing Features shape: ', app_test_dummies.shape)
        return app_train_dummies, app_test_dummies

    def align_train_test(self,app_train,app_test):
        """Keep only the columns present in both frames, preserving ``TARGET``.

        Returns ``(train, test)``; the returned train frame has ``TARGET``
        appended as its last column.

        Raises:
            KeyError: if ``app_train`` has no ``'TARGET'`` column.

        Code taken from:
        https://github.com/LZhemin/home-credit-default-risk/blob/master/3_PreProcessed_Models/PreProcessed_Models.ipynb
        """
        # Saved up front because the inner join drops it when the test
        # frame has no TARGET column.
        train_labels = app_train['TARGET']

        # Align the training and testing data, keep only columns present in both dataframes
        app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

        # Add the target back in
        app_train['TARGET'] = train_labels

        print('Training Features shape: ', app_train.shape)
        print('Testing Features shape: ', app_test.shape)
        return app_train, app_test

    def missing_value_treatment(self,app_train,app_test):
        """Median-impute missing values in both frames.

        The imputer is fitted on the training features only.  ``TARGET`` is
        set aside before imputation and re-attached afterwards when present.

        Returns ``(new_app_train, new_app_test)``, both rebuilt with a fresh
        default index and the training feature names as columns.
        """
        has_target = 'TARGET' in app_train
        if has_target:
            trainY = app_train['TARGET']
            trainX = app_train.drop(columns = ['TARGET'])
        else:
            trainY = None
            trainX = app_train

        # Feature names
        features = list(trainX.columns)

        # Median imputation of missing values, fitted on the training data
        imputer = SimpleImputer(strategy = 'median')
        imputer.fit(trainX)

        # Transform both training and testing data
        trainX = imputer.transform(trainX)
        test = imputer.transform(app_test)

        new_app_train = pd.DataFrame(data=trainX, columns=features)
        if has_target:
            # Assign positionally: the rebuilt frame has a fresh RangeIndex,
            # so index-based alignment would yield NaN for any input whose
            # index is not 0..n-1.
            new_app_train['TARGET'] = trainY.to_numpy()

        new_app_test = pd.DataFrame(data=test, columns=features)

        print(trainX.shape)
        if has_target:
            print(trainY.shape)
        print(new_app_train.shape)
        print(new_app_test.shape)
        return new_app_train, new_app_test

    def get_corralation(self, df_corr):
        """Return the correlations of every column with ``TARGET``, ascending.

        Also prints the 10 most positive and 10 most negative correlations.
        """
        df_correlations = df_corr.corr()['TARGET'].sort_values()
        print('Most Positive Correlations:\n', df_correlations.tail(10))
        print('\n\nMost Negative Correlations:\n', df_correlations.head(10))
        return df_correlations

    def make_polynomial_features(self,app_train,app_test,degree=3):
        """Expand the features of both frames into polynomial terms.

        Args:
            app_train: training frame including a ``TARGET`` column.
            app_test: testing frame with the same feature columns, no target.
            degree: polynomial degree (default 3).

        Returns:
            ``(poly_features, poly_features_test)`` where the first is a
            DataFrame of named polynomial features plus ``TARGET`` and the
            second is the raw array returned by the transformer.
        """
        poly_target = app_train['TARGET']
        poly_features = app_train.drop(columns = ['TARGET'])
        poly_features_test = app_test

        # Name the output after the actual input columns instead of a
        # hard-coded list, which only matched one specific 5-column input.
        input_names = [str(col) for col in poly_features.columns]

        print('Polynomial Features shape - Before: ', poly_features.shape)
        print('Polynomial Target shape - Before: ', poly_target.shape)
        print('Polynomial Test shape - Before: ', poly_features_test.shape)

        # Create the polynomial object with specified degree
        poly_transformer = PolynomialFeatures(degree = degree)

        # Train the polynomial features
        poly_transformer.fit(poly_features)

        # Transform the features
        poly_features = poly_transformer.transform(poly_features)
        poly_features_test = poly_transformer.transform(poly_features_test)

        print('Polynomial Features shape - After: ', poly_features.shape)
        print('Polynomial Target shape: - After', poly_target.shape)
        print('Polynomial Test shape: - After', poly_features_test.shape)

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on old installations.
        if hasattr(poly_transformer, 'get_feature_names_out'):
            poly_names = poly_transformer.get_feature_names_out(input_names)
        else:
            poly_names = poly_transformer.get_feature_names(input_names)

        poly_features = pd.DataFrame(poly_features, columns = poly_names)

        # Add in the target positionally (the new frame has a fresh index)
        poly_features['TARGET'] = poly_target.to_numpy()

        # Find the correlations with the target
        poly_corrs = poly_features.corr()['TARGET'].sort_values()

        print('Polynomial Features shape: ', poly_features.shape)
        print('Polynomial Target shape: ', poly_target.shape)
        print('Polynomial Test shape: ', poly_features_test.shape)

        # Display most negative and most positive
        print("Most Negative Correlations:")
        print(poly_corrs.head(10))
        print("\nMost Positive Correlations:")
        print(poly_corrs.tail(10))
        return poly_features, poly_features_test

    def scaling_value_treatment(self, poly_features_train, poly_features_test,
                                test_size=0.33, seed=10):
        """Impute, scale to [0, 1] and split the training data.

        The imputer and scaler are fitted on the training features only.  The
        test features go through the same transforms; because the return
        value is limited to the train/validation split, the scaled test
        matrix is stored on ``self.scaled_test_`` instead of being discarded.

        Args:
            poly_features_train: training frame including ``TARGET``.
            poly_features_test: test features matching the training features.
            test_size: fraction held out for validation (default 0.33).
            seed: random state of the split (default 10).

        Returns:
            ``X_train, X_test, y_train, y_test`` from ``train_test_split``.

        Raises:
            KeyError: if ``poly_features_train`` has no ``'TARGET'`` column.
        """
        trainY_poly = poly_features_train['TARGET']
        trainX_poly = poly_features_train.drop(columns = ['TARGET'])

        # Median imputation of missing values
        imputer = SimpleImputer(strategy = 'median')

        # Scale each feature to 0-1
        scaler = MinMaxScaler(feature_range = (0, 1))

        # Fit on the training data, then transform both data sets
        imputer.fit(trainX_poly)
        trainX_poly = imputer.transform(trainX_poly)
        test_poly = imputer.transform(poly_features_test)

        # Repeat with the scaler
        scaler.fit(trainX_poly)
        trainX_poly = scaler.transform(trainX_poly)
        self.scaled_test_ = scaler.transform(test_poly)

        X_train, X_test, y_train, y_test = train_test_split(
            trainX_poly, trainY_poly, test_size=test_size, random_state=seed)

        print(trainX_poly.shape)
        print(trainY_poly.shape)

        print(X_train.shape)
        print(y_train.shape)

        return X_train, X_test, y_train, y_test

    def handling_imbalanced_data(self, X_train, y_train, random_state=2):
        """Oversample the training data with SMOTE.

        Returns ``(X_resampled, y_resampled)``.
        """
        sm = SMOTE(random_state=random_state)
        # np.ravel accepts both Series and arrays; Series.ravel() is deprecated.
        X_resampled, y_resampled = sm.fit_resample(X_train, np.ravel(y_train))

        print(X_resampled.shape)
        print(y_resampled.shape)
        return X_resampled, y_resampled

    def imp_logistic_regression_model(self, X_train, y_train, C=0.0001):
        """Fit and return a ``LogisticRegression`` with inverse regularisation strength *C*."""
        log_reg = LogisticRegression(C = C)
        log_reg.fit(X_train, y_train)
        return log_reg

    def accuracy_logistic_regression_model(self, X_train, y_train, log_reg,
                                           X_test=None, y_test=None):
        """Print the mean 5-fold cross-validation score and the test accuracy.

        Args:
            X_train, y_train: data used for cross-validation.
            log_reg: fitted estimator to evaluate.
            X_test, y_test: held-out data.  When omitted, the module-level
                variables of the same name are used, which is what this
                method silently relied on before they became parameters.

        Raises:
            NameError: if held-out data is neither passed nor defined globally.
        """
        if X_test is None:
            X_test = CleaningHelper._legacy_global('X_test')
        if y_test is None:
            y_test = CleaningHelper._legacy_global('y_test')

        # get prediction accuracy
        crossVal = cross_val_score(log_reg, X_train, y_train, cv=5)
        print("Training Cross Validation: %0.2f" % (sum(crossVal) / float(len(crossVal)) * 100), "%")
        accuracy = log_reg.score(X_test, y_test)
        print("Test Accuraccy: %0.2f" % (accuracy * 100), "%")

    def score_logistic_regression_model(self, y_test, y_predicted):
        """Print the F1 score and the Jaccard score of the predictions."""
        #F1 Score
        print("F1 Score: %0.2f" % f1_score(y_test, y_predicted))
        # Jaccard score: size of the intersection divided by the size of the
        # union of the true and predicted label sets.
        jaccard_score_ = jaccard_score(y_test, y_predicted)
        print("Jaccard score: %0.2f" % jaccard_score_)

    def roc_logistic_regression_model(self, X_test, y_test, log_reg=None):
        """Compute ROC curve data for a fitted classifier.

        Args:
            X_test, y_test: held-out features and true labels.
            log_reg: fitted estimator exposing ``predict_proba``.  When
                omitted, the module-level ``log_reg`` variable is used, which
                is what this method silently relied on before.

        Returns:
            ``(scores, fpr, tpr, roc_auc)`` where ``scores`` is the second
            column of ``predict_proba``.

        Raises:
            NameError: if no estimator is passed and none is defined globally.
        """
        if log_reg is None:
            log_reg = CleaningHelper._legacy_global('log_reg')

        scores = log_reg.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = metrics.roc_curve(y_test, scores)
        roc_auc = metrics.auc(fpr, tpr)
        return scores, fpr, tpr, roc_auc