In [5]:
#### Libraries ####
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import ipywidgets as widgets
import qgrid
import pandas_profiling
import time
import sklearn

#Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier




  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [None]:
#### Settings ####
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', 400) 
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_colwidth', -1)
%matplotlib inline
sns.set_style("whitegrid")
sns.set_context("notebook")


In [4]:
#### Function Library ####

def print_bold(string):
    """Print ``string`` in bold using ANSI escape sequences.

    The bold-on sequence is prepended to ``string`` on the first line; the
    reset sequence is then written on a line of its own.
    """
    bold_on, reset = '\033[1m', '\033[0m'
    print(bold_on + string, reset, sep='\n')

### Initial steps ###
def examine_df(df):
    """Print a quick textual overview of ``df`` and build a profiling report.

    Writes, in order, the ``info()`` summary, the shape, the first rows and
    the ``describe()`` statistics to stdout.

    Parameters:
        df: the pandas DataFrame to inspect.

    Returns:
        The ``pandas_profiling.ProfileReport`` built from ``df``.
    """
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() only appended a stray "None" line.
    df.info()
    print(df.shape)
    print(df.head())
    print(df.describe())

    return pandas_profiling.ProfileReport(df)

def write_result_to_csv(df, path='result.csv', sep=','):
    """Write ``df`` to a delimited text file.

    Parameters:
        df: the pandas DataFrame to export.
        path: destination file; defaults to ``'result.csv'`` in the current
            working directory, which is what this function always wrote to
            before the parameter existed.
        sep: field delimiter, ``','`` by default.
    """
    df.to_csv(path, sep=sep)
    
def distribution_plot(df, column_name):
    """Plot the distribution of one column and return the figure holding it.

    Parameters:
        df: the pandas DataFrame that contains the data.
        column_name: label of the column to plot.

    Returns:
        The matplotlib Figure on which the distribution was drawn.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # kept because the seaborn version used by this notebook is not visible.
    axes = sns.distplot(df[column_name])
    # plt.figure() would create (and return) a brand-new empty figure; the
    # plot lives on the figure that owns the Axes distplot drew on.
    return axes.figure
    
def joint_plot(df,x,y):
    """Draw a seaborn joint plot of ``x`` against ``y`` with a regression fit.

    Parameters:
        df: the data source handed to seaborn as ``data``.
        x: name of the variable on the horizontal axis.
        y: name of the variable on the vertical axis.

    Returns:
        Whatever ``sns.jointplot`` returns for these arguments.
    """
    plot_options = {'x': x, 'y': y, 'data': df, 'kind': 'reg'}
    return sns.jointplot(**plot_options)

def pair_plot(df):
    """Draw pairwise regression plots for ``df``.

    Parameters:
        df: the data source handed to ``sns.pairplot``.

    Returns:
        Whatever ``sns.pairplot`` returns for ``df`` with ``kind='reg'``.
    """
    grid = sns.pairplot(df, kind='reg')
    return grid

### Pre-Processing ###

def seperate_components(df, column_of_y):
    """Split ``df`` into a feature matrix and a target vector.

    Parameters:
        df: the pandas DataFrame holding features and target.
        column_of_y: positional index of the target column. It may be any
            column (negative indices count from the end), not only the last.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is ``df.values`` with the target column
        removed and ``Y`` is the target column as a 1-D array.

    Raises:
        IndexError: if ``column_of_y`` is outside the frame's columns.
    """
    print('Note: let x,y = function to define globally')
    array = df.values
    # np.delete keeps the columns on BOTH sides of the target. The former
    # slice array[:, 0:column_of_y] dropped everything after it, which is why
    # the function used to warn when the target was not the last column.
    X = np.delete(array, column_of_y, axis=1)
    Y = array[:, column_of_y]
    return X, Y

def rescale(x):
    """Rescale every feature of ``x`` to the range [0, 1].

    Parameters:
        x: the feature matrix handed to ``MinMaxScaler.fit_transform``.

    Returns:
        The rescaled data produced by the scaler.
    """
    print('Note: let rescaledX = rescale(X) to define globally')
    # The former unconditional 2-second sleep only delayed the caller and
    # has been dropped.
    scaler = MinMaxScaler(feature_range=(0, 1))
    rescaledX = scaler.fit_transform(x)
    return rescaledX

def standardize(x):
    """Standardize ``x`` with a ``StandardScaler`` fitted on ``x`` itself.

    Parameters:
        x: the feature matrix to fit the scaler on and to transform.

    Returns:
        The transformed data produced by the fitted scaler.
    """
    print('Note: let standardizedX = standardize(X) to define globally')
    scaler = StandardScaler().fit(x)
    # The result used to be stored as ``rescaledX`` while the function
    # returned the undefined name ``standardizedX`` (NameError on every call).
    standardizedX = scaler.transform(x)
    return standardizedX

def normalize(x):
    """Normalize ``x`` with a ``Normalizer`` using its default settings.

    Parameters:
        x: the feature matrix handed to ``Normalizer.fit_transform``.

    Returns:
        The normalized data produced by the normalizer.
    """
    print('Note: let normalizedX = normalize(X) to define globally')
    # The former unconditional 2-second sleep only delayed the caller and
    # has been dropped.
    normalizedX = Normalizer().fit_transform(x)
    return normalizedX

def binarize(x,threshold):
    """Binarize ``x`` with a ``Binarizer`` configured with ``threshold``.

    Parameters:
        x: the feature matrix handed to ``Binarizer.fit_transform``.
        threshold: cut-off value passed to the ``Binarizer``.

    Returns:
        The binarized data produced by the binarizer.
    """
    print('Note: let binarizedX = binarize(X) to define globally and set threshold to value required')
    # The former unconditional 2-second sleep only delayed the caller and
    # has been dropped.
    binaryX = Binarizer(threshold=threshold).fit_transform(x)
    return binaryX

def encode(df,name_of_column,new_name):
    """Label-encode one column of ``df`` and store the codes in another column.

    Parameters:
        df: the pandas DataFrame to work on. It is modified in place.
        name_of_column: label of the existing column to encode.
        new_name: label of the column that receives the integer codes. Pass
            the same label as ``name_of_column`` to overwrite the source.

    Returns:
        The same ``df`` object, now containing the encoded column.

    Raises:
        KeyError: if ``name_of_column`` is not a column of ``df``.
    """
    print('Note: let df = encode(X,name_of_column,new_name) to define globally')
    # Source and destination were swapped before: the encoder read the
    # not-yet-existing ``new_name`` column and wrote into ``name_of_column``.
    df[new_name] = LabelEncoder().fit_transform(df[name_of_column])
    return df
    
def get_dummies(df, column_name):
    """Replace a categorical column with dummy (one-hot) columns.

    The first category is dropped (``drop_first=True``); the new columns are
    named ``<column_name>_<category>`` and appended after the existing ones.
    The unique values of the column are printed first.

    Parameters:
        df: the pandas DataFrame holding the column. It is not modified.
        column_name: label of the column to expand.

    Returns:
        A new DataFrame without ``column_name`` and with the dummy columns.
    """
    print('Note: let df = get_dummies(df,name_of_column) to define globally')
    # The former unconditional 2-second sleep only delayed the caller and
    # has been dropped.
    print(df[column_name].unique())
    gen_features = pd.get_dummies(df[column_name], prefix=column_name, prefix_sep='_', drop_first=True)
    expanded = pd.concat([df, gen_features], axis=1)
    return expanded.drop(columns=[column_name])

### Feature Selection ###

def univariate_chi(x,y,df,target_var, k=4):
    """Print the ``k`` features with the highest chi-squared scores.

    Parameters:
        x: feature matrix passed to ``SelectKBest.fit``.
        y: target vector passed to ``SelectKBest.fit``.
        df: DataFrame whose columns, minus ``target_var``, name the features
            of ``x`` in the same order.
        target_var: label of the target column in ``df``.
        k: number of top features to report; ``'all'`` reports every feature.

    Raises:
        ValueError: if ``target_var`` is not a column of ``df`` or the number
            of scores differs from the number of remaining columns.
    """
    selector = SelectKBest(score_func=chi2, k=k)
    fitted = selector.fit(x, y)
    print_bold('Univariate Scores')

    feature_names = list(df.columns.values)
    feature_names.remove(target_var)

    # A float Series ranked with nlargest replaces the former one-row,
    # object-dtype DataFrame that was searched with idxmax and shrunk column
    # by column on every iteration. Ties keep the original column order.
    scores = pd.Series(fitted.scores_, index=feature_names, dtype='float64')
    n_top = len(scores) if isinstance(k, str) else k
    best = scores.nlargest(n_top)

    print(f'The {k} attributes with highest scores are: ')
    for position, name in enumerate(best.index, start=1):
        print(f'{position}: {name}')
    print('------------')

def recursive_elimination(x,y,df,target_var, k=3):
    """Print the ``k`` best-ranked features found by recursive elimination.

    A liblinear ``LogisticRegression`` is wrapped in ``RFE``; the features
    with the lowest ``ranking_`` values are reported.

    Parameters:
        x: feature matrix passed to ``RFE.fit``.
        y: target vector passed to ``RFE.fit``.
        df: DataFrame whose columns, minus ``target_var``, name the features
            of ``x`` in the same order.
        target_var: label of the target column in ``df``.
        k: number of features to select and report.

    Raises:
        ValueError: if ``target_var`` is not a column of ``df`` or the number
            of rankings differs from the number of remaining columns.
    """
    model = LogisticRegression(solver='liblinear')

    # n_features_to_select must be passed by keyword: newer scikit-learn
    # releases reject it as a positional argument.
    rfe = RFE(model, n_features_to_select=k)
    fitted = rfe.fit(x, y)
    print_bold('Recursive Scores')

    feature_names = list(df.columns.values)
    feature_names.remove(target_var)

    # Lowest rank is best. nsmallest keeps the original column order among
    # equally ranked features, matching the former repeated idxmin search.
    ranking = pd.Series(fitted.ranking_, index=feature_names, dtype='int64')
    best = ranking.nsmallest(k)

    print(f'The {k} attributes with highest scores are: ')
    for position, name in enumerate(best.index, start=1):
        print(f'{position}: {name}')
    print('------------')
    
def pca(x,k=3):
    """Fit a ``k``-component PCA on ``x`` and print what it found.

    Prints the explained-variance ratios, a reminder that principal
    components do not map onto the source attributes, and the fitted
    component matrix, each separated by a blank line.

    Parameters:
        x: the feature matrix to decompose.
        k: number of principal components to keep.
    """
    fitted = PCA(n_components=k).fit(x)
    print(f"Explained variance: {fitted.explained_variance_ratio_}", end="\n\n")
    print("Principal Components have little resemblance to the source data attributes", end="\n\n")
    print(fitted.components_)

def extra_trees(x,y,df,target_var,estimators=100, random_state=None):
    """Print feature importances from an ``ExtraTreesClassifier``.

    Parameters:
        x: feature matrix passed to ``model.fit``.
        y: target vector passed to ``model.fit``.
        df: DataFrame whose columns, minus ``target_var``, name the features
            of ``x`` in the same order.
        target_var: label of the target column in ``df``.
        estimators: number of trees in the ensemble.
        random_state: seed forwarded to the classifier; pass an int to get
            the same importances on every run. ``None`` keeps the classifier's
            default, unseeded behaviour.

    Raises:
        ValueError: if ``target_var`` is not a column of ``df`` or the number
            of importances differs from the number of remaining columns.
    """
    model = ExtraTreesClassifier(n_estimators=estimators, random_state=random_state)
    model.fit(x, y)
    print_bold('Feature Importance Scores')

    feature_names = list(df.columns.values)
    feature_names.remove(target_var)
    importances = pd.Series(model.feature_importances_, index=feature_names, dtype='float64')

    # The former np.set_printoptions call is gone on purpose: it permanently
    # changed NumPy's global print formatting for the rest of the session
    # and did not influence how the pandas Series below is printed.
    print('The importance of attributes in descending order: ')
    print()
    print(importances.sort_values(ascending=False).round(3))
    print('------------')

In [None]:
#### Data Import ####

In [None]:
### Calculations ### 