In [3]:
#### Libraries ####
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import ipywidgets as widgets
import qgrid
import pandas_profiling
import time
import sklearn
import math
import chart_studio.plotly as py

#Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

#Model Evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit

#Performance Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.classifier import ROCAUC
from sklearn.metrics import confusion_matrix
from yellowbrick.classifier import ConfusionMatrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression

#clustering
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import InterclusterDistance
from yellowbrick.model_selection import LearningCurve
from sklearn import metrics
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering





In [17]:
#### Settings ####
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', 400) 
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_colwidth', -1)
%matplotlib inline
sns.set_style("whitegrid")
sns.set_context("notebook")


In [28]:
#### Function Library ####

def print_bold(string):
    """Print `string` in bold using ANSI escape codes.

    The bold-on code is prepended to `string`; the reset code is emitted on
    the following line, so the call always produces two output lines.

    Args:
        string: text to print; must be a `str` because it is concatenated
            to the escape code.
    """
    bold_on = '\033[1m'
    style_reset = '\033[0m'
    print(bold_on + string, style_reset, sep='\n')

### Initial steps ###
def examine_df(df):
    """Print a quick textual overview of `df` and build a profiling report.

    Prints, in order: the `info()` summary, the shape, the first rows and
    the descriptive statistics.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        The `pandas_profiling.ProfileReport` created from `df`.
    """
    # DataFrame.info() writes to stdout itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    df.info()
    print(df.shape)
    print(df.head())
    print(df.describe())

    return pandas_profiling.ProfileReport(df)

def write_result_to_csv(df, path='result.csv'):
    """Write `df` to a comma-separated file.

    Args:
        df: pandas DataFrame to export (the index is written as well).
        path: destination file; defaults to 'result.csv' in the current
            working directory, which is what the function always used before
            the parameter existed.
    """
    df.to_csv(path, sep=',')
    
def distribution_plot(df, column_name):
    """Plot the distribution of one column and return the figure holding it.

    Args:
        df: pandas DataFrame containing the data.
        column_name: name of the column to plot.

    Returns:
        The matplotlib Figure that contains the distribution plot.
    """
    # NOTE(review): distplot is deprecated in seaborn >= 0.11; consider
    # histplot/displot when the environment is upgraded.
    ax = sns.distplot(df[column_name])
    # Return the figure the plot was drawn on. plt.figure() would instead
    # create (and return) a brand-new empty figure.
    return ax.get_figure()
    
def joint_plot(df,x,y):
    """Draw a seaborn joint plot of `x` against `y` with a regression fit.

    Args:
        df: DataFrame passed to seaborn as `data`.
        x: column used for the horizontal axis.
        y: column used for the vertical axis.

    Returns:
        The object returned by `sns.jointplot`.
    """
    plot_options = {'x': x, 'y': y, 'data': df, 'kind': 'reg'}
    return sns.jointplot(**plot_options)

def pair_plot(df):
    """Draw a seaborn pair plot of `df` using regression panels.

    Args:
        df: DataFrame whose columns are plotted pairwise.

    Returns:
        The object returned by `sns.pairplot`.
    """
    grid = sns.pairplot(df, kind='reg')
    return grid

### Pre-Processing ###

def seperate_components(df, column_of_y):
    """Split a DataFrame into a feature matrix and a target vector.

    Args:
        df: pandas DataFrame holding the features and the target.
        column_of_y: positional index of the target column.

    Returns:
        Tuple ``(X, Y)`` of NumPy arrays: ``X`` contains every column except
        the target, ``Y`` is the target column.
    """
    print('Note: let x,y = function to define globally')
    time.sleep(2)
    array = df.values
    # Remove only the target column. Slicing 0:column_of_y would also discard
    # every feature located after the target, which is why the function used
    # to be limited to targets stored in the last column.
    X = np.delete(array, column_of_y, axis=1)
    Y = array[:, column_of_y]
    return X, Y

def rescale(x):
    """Rescale every feature of `x` to the range [0, 1] with a MinMaxScaler.

    Args:
        x: feature matrix accepted by scikit-learn.

    Returns:
        Tuple ``(rescaledX, x_scaled_fit)``: the transformed data and the
        fitted scaler (usable later for `inverse_transform`).
    """
    print('Note: let rescaledX,x_scaled_fit = rescale(X) to define globally')
    time.sleep(2)
    scaler = MinMaxScaler(feature_range=(0, 1))
    # One fit is enough: fit_transform fits the scaler and transforms in a
    # single pass, and fit() returns the scaler itself, so the second element
    # is the same object the previous double-fit version returned.
    rescaledX = scaler.fit_transform(x)
    return rescaledX, scaler

def standardize(x):
    """Standardize the features of `x` with scikit-learn's StandardScaler.

    Args:
        x: feature matrix accepted by scikit-learn.

    Returns:
        The transformed data produced by the fitted scaler.
    """
    print('Note: let standardizedX = standardize(X) to define globally')
    time.sleep(2)
    scaler = StandardScaler().fit(x)
    # The result used to be stored under a different name than the one
    # returned, which made every call fail with NameError.
    standardizedX = scaler.transform(x)
    return standardizedX

def normalize(x):
    """Apply scikit-learn's Normalizer (default settings) to `x`.

    Args:
        x: feature matrix accepted by scikit-learn.

    Returns:
        The output of `Normalizer().fit_transform(x)`.
    """
    print('Note: let normalizedX = normalize(X) to define globally')
    time.sleep(2)
    return Normalizer().fit_transform(x)

def binarize(x,threshold):
    """Binarize `x` with scikit-learn's Binarizer at the given threshold.

    Args:
        x: feature matrix accepted by scikit-learn.
        threshold: cut-off value handed to `Binarizer`.

    Returns:
        The output of the binarizer's `fit_transform`.
    """
    print('Note: let binarizedX = binarize(X) to define globally and set threshold to value required')
    time.sleep(2)
    binarizer = Binarizer(threshold=threshold)
    return binarizer.fit_transform(x)

def encode(df,name_of_column,new_name):
    """Label-encode one column of `df` into a (possibly new) column.

    Args:
        df: pandas DataFrame; it is modified in place.
        name_of_column: existing column whose values are encoded.
        new_name: column that receives the integer codes. Pass the same name
            as `name_of_column` to overwrite the source column.

    Returns:
        The same DataFrame object, with `new_name` added or replaced.
    """
    print('Note: let df = encode(X,name_of_column,new_name) to define globally')
    time.sleep(2)
    # Read from the existing column and write to the new one; the two names
    # were previously swapped, so a not-yet-existing `new_name` raised KeyError.
    df[new_name] = LabelEncoder().fit_transform(df[name_of_column])
    return df
    
def get_dummies(df, column_name):
    """Replace a categorical column by its dummy (indicator) columns.

    The unique values of the column are printed first. Indicator columns are
    named ``<column_name>_<value>``, the first category is dropped, and the
    original column is removed from the result. The input frame itself is
    not modified.

    Args:
        df: pandas DataFrame containing `column_name`.
        column_name: name of the column to expand.

    Returns:
        A new DataFrame with the indicator columns appended.
    """
    print('Note: let df = get_dummies(df,name_of_column) to define globally')
    time.sleep(2)
    source_column = df[column_name]
    print(source_column.unique())
    indicator_columns = pd.get_dummies(
        source_column,
        prefix=column_name,
        prefix_sep='_',
        drop_first=True,
    )
    widened = pd.concat([df, indicator_columns], axis=1)
    return widened.drop([column_name], axis=1)

### Feature Selection ###

def univariate_chi(x,y,df,target_var, k=4):
    """Print the `k` features with the highest chi-squared score.

    Args:
        x: feature matrix given to `SelectKBest` (chi2 scoring).
        y: target vector.
        df: DataFrame whose columns, once `target_var` is removed, name the
            columns of `x` in order.
        target_var: name of the target column in `df`.
        k: number of features to report; also passed to `SelectKBest`
            (so 'all' is accepted as well).
    """
    fit = SelectKBest(score_func=chi2, k=k).fit(x, y)
    print_bold('Univariate Scores')
    feature_names = list(df.columns.values)
    feature_names.remove(target_var)
    # An explicitly float-typed Series keeps the ranking independent of how
    # pandas infers dtypes when a row is appended to an empty frame.
    scores = pd.Series(fit.scores_, index=feature_names, dtype='float64')
    n_top = len(scores) if k == 'all' else k
    print(f'The {k} attributes with highest scores are: ')
    # nlargest keeps the first occurrence on ties, like repeated idxmax did.
    for rank, name in enumerate(scores.nlargest(n_top).index, start=1):
        print(f'{rank}: {name}')
    print('------------')

def recursive_elimination(x,y,df,target_var, k=3):
    """Print the `k` best-ranked features found by recursive feature elimination.

    A liblinear logistic regression is used as the estimator inside `RFE`.

    Args:
        x: feature matrix.
        y: target vector.
        df: DataFrame whose columns, once `target_var` is removed, name the
            columns of `x` in order.
        target_var: name of the target column in `df`.
        k: number of features to select and report.
    """
    model = LogisticRegression(solver='liblinear')

    # n_features_to_select must be passed by keyword: it is keyword-only in
    # current scikit-learn releases.
    rfe = RFE(model, n_features_to_select=k)
    fit = rfe.fit(x, y)
    print_bold('Recursive Scores')
    feature_names = list(df.columns.values)
    feature_names.remove(target_var)
    # Rank 1 marks a selected feature; lower rank is better.
    ranking = pd.Series(fit.ranking_, index=feature_names, dtype='float64')
    print(f'The {k} attributes with highest scores are: ')
    # nsmallest keeps the first occurrence on ties, like repeated idxmin did.
    for position, name in enumerate(ranking.nsmallest(k).index, start=1):
        print(f'{position}: {name}')
    print('------------')
    
def pca(x,k=3):
    """Fit a PCA with `k` components on `x` and print a short summary.

    Prints the explained-variance ratio of each component followed by the
    component matrix.

    Args:
        x: feature matrix.
        k: number of principal components to keep.
    """
    fitted = PCA(n_components=k).fit(x)
    print(f"Explained variance: {fitted.explained_variance_ratio_}", end='\n\n')
    print("Principal Components have little resemblance to the source data attributes", end='\n\n')
    print(fitted.components_)

def extra_trees(x,y,df,target_var,estimators=100):
    """Print feature importances from an ExtraTreesClassifier, highest first.

    Args:
        x: feature matrix.
        y: target vector.
        df: DataFrame whose columns, once `target_var` is removed, name the
            columns of `x` in order.
        target_var: name of the target column in `df`.
        estimators: number of trees in the ensemble.
    """
    forest = ExtraTreesClassifier(n_estimators=estimators)
    forest.fit(x, y)
    print_bold('Feature Importance Scores')

    feature_names = list(df.columns.values)
    feature_names.remove(target_var)
    importance_table = pd.DataFrame(columns=feature_names)
    importance_table.loc[''] = list(forest.feature_importances_)

    # Note: this changes NumPy's print options for the whole session.
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    print(f'The importance of attributes in descending order: ', end='\n\n')
    ranked = importance_table.max().sort_values(ascending=False)
    print(round(ranked, 3))
    print('------------')
    
### Model Evaluation ###

def similarity_of_split(train,test,target_var):
    """Placeholder for a train/test similarity check (not implemented yet).

    Currently only prints 'TBD'; all three arguments are ignored.
    """
    placeholder = 'TBD'
    print(placeholder)
    
def test_split(x,y,test_size, seed=7):
    """Split the data once, fit a logistic regression and print its test accuracy.

    Args:
        x: feature matrix.
        y: target vector.
        test_size: share (or count) of samples placed in the test set, as
            accepted by `train_test_split`.
        seed: random state of the split; the default 7 reproduces the split
            used when the value was hard-coded.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Honour the `seed` argument instead of a hard-coded random_state.
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=seed)

    # Fit the logistic regression on the training part.
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train, Y_train)

    # Accuracy on the held-out test part.
    result = model.score(X_test, Y_test)
    print(f'Test: {test_size} Train: {1-test_size}')
    print(f'Accuracy {result*100:5.3f}')
    print('-------------')
    print()

    return X_train, X_test, Y_train, Y_test

def k_fold_cross_val(x,y,splits=10,scoring = 'accuracy',add_info= False):
    """Cross-validate a liblinear logistic regression with (unshuffled) k-fold.

    Args:
        x: feature matrix.
        y: target vector.
        splits: number of folds.
        scoring: scorer name for `cross_val_score`; ignored when `add_info`
            is true.
        add_info: when true, report accuracy, precision, recall and F1
            through `cross_validate` instead of the single `scoring` metric.

    Returns:
        The LogisticRegression instance that was handed to the
        cross-validation helper.
    """
    # No random_state: the folds are not shuffled, so it had no effect, and
    # current scikit-learn raises ValueError for random_state without shuffle.
    kfold = KFold(n_splits=splits)
    model = LogisticRegression(solver="liblinear")
    if add_info:
        metric_names = {'accuracy': 'accuracy',
                        'recall': 'recall',
                        'precision': 'precision',
                        'f1': 'f1'}
        results = cross_validate(model, x, y, scoring=metric_names, cv=kfold)
        print(f'Logistic regression, k-fold {splits:d}')
        print(f'Accuracy {results["test_accuracy"].mean()*100:.3f}%')
        print(f'Precision {results["test_precision"].mean()*100:.3f}%')
        print(f'Recall {results["test_recall"].mean()*100:.3f}%')
        print(f'F1 {results["test_f1"].mean()*100:.3f}%')
    else:
        results = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
        print(f'Logistic regression, k-fold {splits:d} - {scoring}')
        if scoring == 'accuracy':
            # Accuracy is shown as a percentage with its spread across folds.
            print(f'{results.mean()*100:5.3f}% ({results.std()*100:5.3f}%)')
        else:
            print(f'{results.mean():5.3f}')
    return model

              
def leave_one_out(x,y):
    """Print leave-one-out cross-validated accuracy of a logistic regression.

    Args:
        x: feature matrix.
        y: target vector.
    """
    scores = cross_val_score(
        LogisticRegression(solver='liblinear'), x, y, cv=LeaveOneOut())
    mean_pct = scores.mean()*100
    std_pct = scores.std()*100
    print(f'Logistic regression, Leave one out - Accuracy {mean_pct:5.3f}% ({std_pct:5.3f}%)')

def repeated_test_train(x,y, test_size,repetitions=10):
    """Print the accuracy of a logistic regression over repeated random splits.

    Args:
        x: feature matrix.
        y: target vector.
        test_size: share (or count) of samples in each test split.
        repetitions: number of random train/test splits to evaluate.
    """
    shuffle = ShuffleSplit(n_splits=repetitions, test_size=test_size, random_state=7)
    model = LogisticRegression(solver="liblinear")
    res = cross_val_score(model, x, y, cv=shuffle)

    # Report the actual number of repetitions (the message used to reference
    # an undefined name and raised NameError).
    print(f'Log Regression - Repeated Test-Train {repetitions:d} - Accuracy {res.mean()*100:5.3f}% {res.std()*100:5.3f}%')
    

### Performance Metrics ###

def precision_recall_curve(x,y,test_size):
    """Report k-fold metrics, then draw a precision-recall curve.

    The estimator returned by `k_fold_cross_val` is given to yellowbrick's
    `PrecisionRecallCurve`, which is fitted on the training part of a single
    split (random_state 7) and scored on the test part.

    Args:
        x: feature matrix.
        y: target vector.
        test_size: share (or count) of samples placed in the test split.
    """
    estimator = k_fold_cross_val(x, y, add_info=True)

    split = train_test_split(x, y, test_size=test_size, random_state=7)
    X_train, X_test, Y_train, Y_test = split

    curve = PrecisionRecallCurve(estimator)
    curve.fit(X_train, Y_train)
    curve.score(X_test, Y_test)
    curve.show()

def area_under_roc(x,y,test_size):
    """Report the k-fold ROC AUC, then draw the ROC curves for classes 0 and 1.

    Args:
        x: feature matrix.
        y: target vector.
        test_size: share (or count) of samples placed in the test split.
    """
    estimator = k_fold_cross_val(x, y, scoring='roc_auc')
    split = train_test_split(x, y, test_size=test_size, random_state=7)
    X_train, X_test, Y_train, Y_test = split

    roc_plot = ROCAUC(estimator, classes=[0,1])
    roc_plot.fit(X_train, Y_train)
    roc_plot.score(X_test, Y_test)
    roc_plot.show()

def con_matrix(x,y,test_size=0.3):
    """Fit a logistic regression and show its confusion matrix on a test split.

    Prints the model accuracy and the same figure recomputed from the
    diagonal of the 2x2 confusion matrix, then renders yellowbrick's
    `ConfusionMatrix` labelled "Not present" / "Present".

    Args:
        x: feature matrix.
        y: target vector.
        test_size: share (or count) of samples placed in the test split.
    """
    split = train_test_split(x, y, test_size=test_size, random_state=7)
    X_train, X_test, y_train, y_test = split

    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    counts = confusion_matrix(y_test, predictions)
    print_bold("Confusion Matrix")

    print()
    print(f'Accuracy {classifier.score(X_test, y_test)*100:.5f}')
    # Correct predictions sit on the diagonal of the binary matrix.
    correct = counts[0,0] + counts[1,1]
    print(f'Accuracy check with conf. matrix {correct/counts.sum()*100:.5f}')

    # The classifier is already fitted, so the visualizer is scored directly
    # without calling its fit().
    visualizer = ConfusionMatrix(classifier, classes=["Not present","Present"])
    visualizer.score(X_test, y_test)
    visualizer.show()

def class_report(x,y, test_size=0.3):
    """Fit a logistic regression and print accuracy plus a classification report.

    Args:
        x: feature matrix.
        y: target vector.
        test_size: share (or count) of samples placed in the test split.
    """
    split = train_test_split(x, y, test_size=test_size, random_state=7)
    X_train, X_test, Y_train, Y_test = split

    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)

    summary = classification_report(Y_test, predictions, digits=5)

    print(f'Accuracy {classifier.score(X_test, Y_test)*100:.5f}', end='\n\n')
    print(summary)
              
def mean_abs_error(X,Y):
    """Print the 10-fold cross-validated mean absolute error of a linear regression.

    The scorer is "neg_mean_absolute_error", so the printed mean is negative.

    Args:
        X: feature matrix.
        Y: target vector.
    """
    # No random_state: the folds are not shuffled, and current scikit-learn
    # raises ValueError for random_state without shuffle.
    kfold = KFold(n_splits=10)
    model = LinearRegression()
    scoring = "neg_mean_absolute_error"
    res = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    print(f'Linear Regression, MAE: {res.mean():.3f} ({res.std():.3f})')

def mean_squared_error(X,Y):
    """Print the 10-fold cross-validated MSE and RMSE of a linear regression.

    The scorer is "neg_mean_squared_error", so the printed MSE mean is
    negative; the RMSE line uses the square root of its absolute value.

    Args:
        X: feature matrix.
        Y: target vector.
    """
    # No random_state: the folds are not shuffled, and current scikit-learn
    # raises ValueError for random_state without shuffle.
    kfold = KFold(n_splits=10)
    model = LinearRegression()
    scoring = "neg_mean_squared_error"
    res = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

    print(f'Linear Regression, MSE: {res.mean():.3f} ({res.std():.3f})')
    # This line reports square roots, i.e. the RMSE, so it is labelled as such.
    # NOTE(review): the bracketed value is sqrt(std of the fold MSEs), not the
    # std of the fold RMSEs - confirm this is the intended spread measure.
    print(f'Linear Regression, RMSE: {math.sqrt(abs(res.mean())):.3f} ({math.sqrt(res.std()):.3f})')

def r_2(X,Y):
    """Print the 10-fold cross-validated R2 score of a linear regression.

    Args:
        X: feature matrix.
        Y: target vector.
    """
    # No random_state: the folds are not shuffled, and current scikit-learn
    # raises ValueError for random_state without shuffle.
    kfold = KFold(n_splits=10)
    model = LinearRegression()

    scoring = "r2"
    res = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

    print(f'Linear Regression, R2: {res.mean():.3f} ({res.std():.3f})')

### Clustering ###

def scale_cluster_df(x):
    """Min-max scale a DataFrame via `rescale` and keep its column names.

    Args:
        x: pandas DataFrame of features.

    Returns:
        Tuple ``(X_scaled, x_scaled_fit)``: the scaled values as a DataFrame
        with the columns of `x`, and the fitted scaler returned by `rescale`.
    """
    scaled_values, fitted_scaler = rescale(x)
    scaled_frame = pd.DataFrame(scaled_values, columns=x.columns)
    return scaled_frame, fitted_scaler
              
def elbow(x_scaled):
    """Show a yellowbrick elbow plot for KMeans with k given as (1, 8).

    Args:
        x_scaled: scaled feature matrix to cluster.
    """
    plt.figure(figsize=(12,9))

    elbow_plot = KElbowVisualizer(KMeans(), k=(1,8))
    elbow_plot.fit(x_scaled)
    elbow_plot.show()
     
def cluster_model(x_scaled, clusters):
    """Fit a MiniBatchKMeans model on the scaled data.

    Args:
        x_scaled: scaled feature matrix.
        clusters: number of clusters.

    Returns:
        The fitted MiniBatchKMeans instance.
    """
    kmeans = MiniBatchKMeans(n_clusters=clusters)
    kmeans.fit(x_scaled)
    return kmeans
              
def silhoutte_visual(x_scaled, model):
    """Show a yellowbrick silhouette plot for `model` on the scaled data.

    Args:
        x_scaled: scaled feature matrix.
        model: clustering estimator handed to `SilhouetteVisualizer`.
    """
    plt.figure(figsize=(12,9))

    silhouette_plot = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_plot.fit(x_scaled)
    silhouette_plot.show()

def inter_cluster_dist(x_scaled, model):
    """Show a yellowbrick intercluster-distance map for `model`.

    Args:
        x_scaled: scaled feature matrix.
        model: clustering estimator handed to `InterclusterDistance`.
    """
    plt.figure(figsize=(12,9))
    distance_map = InterclusterDistance(model, min_size=10000)
    distance_map.fit(x_scaled)
    distance_map.show()

def accuracy_clustering(x_scaled, y, model):
    """Fit `model` on the scaled data and print its accuracy against `y`.

    Args:
        x_scaled: scaled feature matrix.
        y: pandas Series of true labels (its `.values` are compared).
        model: clustering estimator with `fit` and `predict`; it is (re)fitted
            on `x_scaled`.
    """
    # Use the estimator and the data that were passed in. The function used
    # to replace `model` by a fresh 3-cluster model and to fit it on a
    # notebook-level variable instead of the `x_scaled` argument.
    # NOTE(review): cluster ids are assigned arbitrarily, so this accuracy is
    # only meaningful when they happen to line up with the labels in `y`.
    model.fit(x_scaled)

    print(" ---- Accuracy Scores ----")

    acc_score = accuracy_score(y.values, model.predict(x_scaled))
    print(f'Accuracy {acc_score*100:.3f}')

def silh_sco(x_scaled, model):
    """Print the euclidean silhouette score of a fitted clustering model.

    Args:
        x_scaled: scaled feature matrix the model was fitted on.
        model: fitted clustering estimator exposing `labels_`.
    """
    # silhouette_score is not imported by name in this notebook; reach it
    # through the already imported `sklearn.metrics` module.
    she = metrics.silhouette_score(x_scaled, model.labels_, metric='euclidean')
    print(f'Silhouette score {she:5f}')

def centroid_meaning(model, x, x_scaled, scaler=None):
    """Return the cluster centroids expressed in the original feature scale.

    Args:
        model: fitted clustering estimator exposing `cluster_centers_`.
        x: original (unscaled) DataFrame; only its column names are used.
        x_scaled: unused, kept so existing calls keep working.
        scaler: fitted scaler providing `inverse_transform`. When omitted,
            the notebook-level variable `x_scaled_fit` is used, which is what
            the function always relied on before this parameter existed.

    Returns:
        DataFrame of centroids mapped back with the scaler, one row per
        cluster, with the columns of `x`.
    """
    if scaler is None:
        # Backward-compatible fallback to the global fitted scaler.
        scaler = x_scaled_fit

    centroids_original = pd.DataFrame(
        scaler.inverse_transform(model.cluster_centers_), columns=x.columns)
    print_bold('Originally scaled centroids')
    return centroids_original
              
def aggl_cluster(x_scaled,clusters,y, label_map=None):
    """Draw a Ward dendrogram, run agglomerative clustering and print accuracy.

    Args:
        x_scaled: scaled feature matrix.
        clusters: number of clusters for `AgglomerativeClustering`.
        y: pandas Series of true labels.
        label_map: dict translating each value of `y` to the cluster id it is
            compared with. Defaults to ``{0: 2, 1: 0, 2: 1}``, the mapping
            that used to be hard-coded.
    """
    if label_map is None:
        label_map = {0: 2, 1: 0, 2: 1}

    plt.figure(figsize=(17,9))

    # Dendrogram of the Ward linkage.
    sch.dendrogram(sch.linkage(x_scaled, method='ward'), no_labels=True)
    plt.show()

    # Ward linkage only supports euclidean distances, which is the default,
    # so the distance argument (renamed across scikit-learn versions) is
    # omitted. Fit on the `x_scaled` argument, not a notebook-level variable.
    hc = AgglomerativeClustering(n_clusters=clusters, linkage='ward')
    hc.fit(x_scaled)

    expected = [label_map[label] for label in y.values]
    acc_score = accuracy_score(expected, hc.labels_)
    print(f'Accuracy {acc_score*100:.3f}')


In [None]:
#### Data Import ####

In [None]:
### Calculations ### 