In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
import sklearn

In [3]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, roc_auc_score, roc_curve
from sklearn.cross_validation import cross_val_score, cross_val_predict,train_test_split
from sklearn.decomposition import FactorAnalysis, PCA
from sklearn.preprocessing import imputation, StandardScaler, minmax_scale, LabelEncoder, Normalizer, Binarizer

  from numpy.core.umath_tests import inner1d


In [5]:
# Plot a HEATMAP of a correlation matrix, annotated with the coefficient values
def fun_heatmap(corr):
    """Draw an annotated, square-celled heatmap of a correlation matrix.

    Parameters
    ----------
    corr : object exposing ``.columns.values``
        The correlation matrix to plot; its column labels are used as the
        tick labels on both axes.
        # presumably a pandas DataFrame produced by ``df.corr()`` - confirm against callers

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    plt.figure(figsize=(14, 10))

    # Same labels on both axes: the matrix is indexed by its own columns.
    axis_labels = corr.columns.values
    heatmap_options = dict(
        vmax=1,
        square=True,
        annot=True,
        xticklabels=axis_labels,
        yticklabels=axis_labels,
        cmap='cubehelix',
    )
    sns.heatmap(corr, **heatmap_options)

    plt.show()

In [6]:
# Function to open and read a CSV file and convert its fields to integers, returned as a list of rows
def read_csv(csv_file):
    """Read a headered CSV file of integers into a list of integer rows.

    Parameters
    ----------
    csv_file : str or path-like
        Path of the CSV file to read. The first line is treated as a header
        and is skipped.

    Returns
    -------
    list of list of int
        One inner list per non-blank data line, holding that line's
        comma-separated fields converted with ``int()``.

    Raises
    ------
    ValueError
        If a field on a non-blank data line cannot be converted by ``int()``.
    """
    # The context manager closes the handle even if reading fails.
    with open(csv_file, "r") as handle:
        lines = handle.read().split("\n")

    # lines[1:] drops the header row. Blank lines are skipped because a file
    # ending in a newline produces a final '' entry, and int('') raises.
    return [
        [int(field) for field in line.split(",")]
        for line in lines[1:]
        if line.strip()
    ]

In [7]:
# For a given index/column of each row, sum the row's last value per distinct key and return the totals in a dictionary
def calc_count(input_list, index):
    """Total the last element of each row, grouped by the element at ``index``.

    Parameters
    ----------
    input_list : iterable of sequences
        Rows to aggregate; each row must support ``row[index]`` and ``row[-1]``.
    index : int
        Position within each row whose value is used as the grouping key.

    Returns
    -------
    dict
        Maps each distinct key to the sum (via ``+``) of the last elements of
        the rows carrying that key.
    """
    totals = {}

    for row in input_list:
        amount, key = row[-1], row[index]
        # First occurrence stores the value as-is; later ones accumulate.
        totals[key] = totals[key] + amount if key in totals else amount

    return totals

In [12]:
# Function to Plot Confusion Matrix
def fun_plot_confusion_matrix(conf, labels=None):
    """Plot a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    conf : array-like
        Confusion matrix of integer counts (cells are formatted with ``"d"``).
    labels : sequence of str, optional
        Tick labels for both axes, one per class. When omitted, the labels
        ``'0', '1', ...`` are generated from ``len(conf)``, which yields the
        previous hard-coded ``['0', '1']`` for a 2x2 matrix and also covers
        matrices with more classes without editing this function.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.

    Notes
    -----
    Sets ``plt.rcParams['figure.figsize']`` to ``4, 3``; the setting persists
    after the call.
    """
    if labels is None:
        labels = [str(class_index) for class_index in range(len(conf))]

    plt.rcParams['figure.figsize'] = 4, 3
    cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
    sns.heatmap(conf, cmap=cmap, xticklabels=labels, yticklabels=labels, annot=True, fmt="d")
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

In [13]:
# Function to calculate & print Accuracy / TPR / FPR / Specificity / Precision from a 2x2 confusion matrix
def fun_conf_mat_calc(conf):
    """Print accuracy, TPR, FPR, specificity and precision for a 2x2 confusion matrix.

    Parameters
    ----------
    conf : numpy.ndarray
        Confusion matrix whose flattened elements 0..3 are read as
        TN, FP, FN, TP (the layout implied by the printed formulas).

    Returns
    -------
    None
        All metrics are printed as percentages rounded to 2 decimals.
        A metric whose denominator is 0 (e.g. precision when nothing was
        predicted positive) is printed as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    true_neg, false_pos, false_neg, true_pos = (conf.item(i) for i in range(4))

    def _percent(numerator, denominator):
        # An empty denominator means the ratio is undefined, not an error.
        if denominator == 0:
            return np.nan
        return np.round((numerator / denominator) * 100, 2)

    print('Overall Accuracy:', _percent(true_neg + true_pos, conf.sum()))

    print('TPRate or Recall or Sensitivity i.e. (TP / Actual YES):',
          _percent(true_pos, false_neg + true_pos))

    print('FPRate i.e. (FP / Actual NO):',
          _percent(false_pos, true_neg + false_pos))

    print('Specificity i.e. (TN / Actual NO):',
          _percent(true_neg, true_neg + false_pos))

    print('Precision i.e. (TP / Predicted YES):',
          _percent(true_pos, false_pos + true_pos))

In [14]:
# Function to calculate TPR / Recall from a confusion matrix
def fun_TPR_calc(conf):
    """Return the true-positive rate (recall / sensitivity) as a percentage.

    Parameters
    ----------
    conf : numpy.ndarray
        Confusion matrix; flattened element 2 is read as FN and element 3 as TP.

    Returns
    -------
    float
        ``TP / (FN + TP) * 100`` rounded to 2 decimals, or ``nan`` when there
        are no actual positives (previously a ``ZeroDivisionError``).
    """
    false_neg, true_pos = conf.item(2), conf.item(3)
    actual_pos = false_neg + true_pos
    if actual_pos == 0:
        # Rate is undefined without any actual positives.
        return np.nan
    return np.round((true_pos / actual_pos) * 100, 2)

# Function to calculate FPR
def fun_FPR_calc(conf):
    """Return the false-positive rate as a percentage.

    Parameters
    ----------
    conf : numpy.ndarray
        Confusion matrix; flattened element 0 is read as TN and element 1 as FP.

    Returns
    -------
    float
        ``FP / (TN + FP) * 100`` rounded to 2 decimals, or ``nan`` when there
        are no actual negatives (previously a ``ZeroDivisionError``).
    """
    true_neg, false_pos = conf.item(0), conf.item(1)
    actual_neg = true_neg + false_pos
    if actual_neg == 0:
        # Rate is undefined without any actual negatives.
        return np.nan
    return np.round((false_pos / actual_neg) * 100, 2)

# Function to calculate Specificity 
def fun_Spec_calc(conf):
    """Return the specificity (true-negative rate) as a percentage.

    Parameters
    ----------
    conf : numpy.ndarray
        Confusion matrix; flattened element 0 is read as TN and element 1 as FP.

    Returns
    -------
    float
        ``TN / (TN + FP) * 100`` rounded to 2 decimals, or ``nan`` when there
        are no actual negatives (previously a ``ZeroDivisionError``).
    """
    true_neg, false_pos = conf.item(0), conf.item(1)
    actual_neg = true_neg + false_pos
    if actual_neg == 0:
        # Rate is undefined without any actual negatives.
        return np.nan
    return np.round((true_neg / actual_neg) * 100, 2)

# Function to calculate Precision
def fun_Prec_calc(conf):
    """Return the precision as a percentage.

    Parameters
    ----------
    conf : numpy.ndarray
        Confusion matrix; flattened element 1 is read as FP and element 3 as TP.

    Returns
    -------
    float
        ``TP / (FP + TP) * 100`` rounded to 2 decimals, or ``nan`` when nothing
        was predicted positive (previously a ``ZeroDivisionError``).
    """
    false_pos, true_pos = conf.item(1), conf.item(3)
    predicted_pos = false_pos + true_pos
    if predicted_pos == 0:
        # Precision is undefined when the model predicted no positives.
        return np.nan
    return np.round((true_pos / predicted_pos) * 100, 2)

In [15]:
# Creating Function for plotting Variance Ratio
def fun_plot_variance_ratio_cumsum(variance_ratio):
    """Print and plot the cumulative explained variance, in percent.

    Parameters
    ----------
    variance_ratio : array-like of float
        Per-component variance ratios; each is rounded to 4 decimals and
        scaled by 100 before being accumulated.
        # presumably ``explained_variance_ratio_`` of a fitted PCA - confirm against callers

    Returns
    -------
    None
        The cumulative values are printed and drawn on the current figure.

    Notes
    -----
    Sets ``plt.rcParams['figure.figsize']`` to ``6, 4``; the setting persists
    after the call.
    """
    variance_ratio_cumsum = np.cumsum(np.round(variance_ratio, decimals=4) * 100)
    print(variance_ratio_cumsum)

    # Plot against 1..n so the x axis really is the number of components;
    # plotting y alone would use the 0-based index and read one component low.
    component_counts = np.arange(1, len(variance_ratio_cumsum) + 1)

    plt.rcParams['figure.figsize'] = 6, 4
    plt.plot(component_counts, variance_ratio_cumsum)
    plt.title("Cumulative Variance")
    plt.xlabel("No. of Components")
    plt.ylabel("Variance Explained")

In [16]:
# Function to evaluate k for KNN over k = 1 to 24 (range(1, 25)) using 10-fold cross-validated accuracy
def fun_evaluate_k_in_knn(X, Y):
    """Print the mean 10-fold cross-validated accuracy of KNN for k = 1..24.

    Parameters
    ----------
    X : array-like
        Feature data passed straight to ``cross_val_score``.
    Y : array-like
        Target labels passed straight to ``cross_val_score``.

    Returns
    -------
    None
        The list of mean accuracies (one per k, in increasing k) is printed.
    """
    mean_accuracies = [
        cross_val_score(
            KNeighborsClassifier(n_neighbors=neighbours),
            X,
            Y,
            cv=10,
            scoring='accuracy',
        ).mean()
        for neighbours in range(1, 25)
    ]
    print(mean_accuracies)

In [17]:
# Function to get all numeric (non-object dtype) variables from DF
def fun_get_numeric_variables(df):
    """Return the columns of ``df`` whose dtype is not ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select columns from.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to its non-object-dtype columns, in original order.
    """
    not_object = df.dtypes != 'object'
    kept_columns = [column for column, keep in not_object.items() if keep]
    return df[kept_columns]

# Function to get all non-numeric (object dtype) variables from DF
def fun_get_non_numeric_variables(df):
    """Return the columns of ``df`` whose dtype is ``object``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select columns from.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to its object-dtype columns, in original order.
    """
    is_object = df.dtypes == 'object'
    kept_columns = [column for column, keep in is_object.items() if keep]
    return df[kept_columns]