# The first part of the assignment, IDS 2020-2021


<font color="red"><b>Student Names and IDs:</b></font>
    
    Mridul Mani Tripathi - 403587

In [None]:
# to convert numbers to words (for Question 2)
!pip install inflect

In [None]:
# basic stack as defined in software
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import tree, metrics
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, normalize
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
import numpy as np
from scipy import stats
import seaborn as sns
from p_decision_tree.DecisionTree import DecisionTree
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

# internals
import inspect
from typing import NewType
import itertools
import functools as func
import operator

# inflect needed to convert numbers to words (for Question 2)
import inflect

# seaborn for distribution
import seaborn as sns

# hide warnings
import warnings
warnings.filterwarnings('ignore') 

# random seed (used for sampling, classifiers and train/test splits)
rand_seed = 403596

# default size (inches) and resolution of every matplotlib figure
plt.rcParams['figure.figsize'] = [16, 9]
plt.rcParams['figure.dpi'] = 500
# None disables truncation of long cell contents; the former sentinel -1 has been
# deprecated since pandas 1.0 and is no longer accepted by pandas 2.x
pd.set_option('display.max_colwidth', None)

In [None]:
# definitions: semantic column kinds and str-based column handles

IdsType = NewType('IdsType', str)


class IdsTypes:
    """Namespace listing the semantic kinds a column can have."""
    time = IdsType('Time')
    num = IdsType('Numerical')
    cat = IdsType('Categorical')
    bl = IdsType('Boolean')


class Column(str):
    """Column name that also carries its semantic kind in ``idstype``.

    Because it is a ``str`` subclass, an instance can be used directly as a
    key for accessing the column in a DataFrame.
    """

    def __new__(cls, name: str, idstype: IdsType):
        column = super().__new__(cls, name)
        column.idstype = idstype
        return column


class CatCol(Column):
    """Column handle of kind ``IdsTypes.cat``."""

    def __new__(cls, name: str):
        return super().__new__(cls, name, IdsTypes.cat)

    def convert(self, ds_col):
        """Return the column values as a ``pd.Categorical``."""
        return pd.Categorical(ds_col)


class NumCol(Column):
    """Column handle of kind ``IdsTypes.num``."""

    def __new__(cls, name: str):
        return super().__new__(cls, name, IdsTypes.num)

    def convert(self, ds_col):
        """Return the column values converted with ``pd.to_numeric``."""
        return pd.to_numeric(ds_col)


class BoolCol(Column):
    """Column handle of kind ``IdsTypes.bl``."""

    def __new__(cls, name: str):
        return super().__new__(cls, name, IdsTypes.bl)

    def convert(self, col):
        """Return the column values cast to ``bool``."""
        return col.astype('bool')


In [None]:
# definitions
# for reading and loading data sets

class Dataset:
    """Access point for the assignment's CSV files and their column definitions.

    The class is used as a namespace and is never instantiated, so all methods
    are static. Loaded frames are cached on the class; every load call returns
    a copy, so callers cannot modify the cached frame.
    """
    file_name = 'dataset.csv'
    sampled_file_name = 'sampled_data.csv'
    __original_ds__ = None                       # cache filled by load_original
    __sampled_ds__ = None                        # cache filled by load_sampled

    @staticmethod
    def load_original(force_reload = False):
        """Return a copy of the original data set.

        The CSV file is read on first use, or again when ``force_reload`` is true.
        """
        if force_reload or Dataset.__original_ds__ is None:
            Dataset.__original_ds__ = Dataset.apply_categories(pd.read_csv(Dataset.file_name))
        return Dataset.__original_ds__.copy()

    @staticmethod
    def load_sampled(force_reload = False):
        """Return a copy of the sampled data set.

        The CSV file is read on first use, or again when ``force_reload`` is true.
        """
        if force_reload or Dataset.__sampled_ds__ is None:
            Dataset.__sampled_ds__ = Dataset.apply_categories(pd.read_csv(Dataset.sampled_file_name))
        return Dataset.__sampled_ds__.copy()

    @staticmethod
    def apply_categories(df):
        """Convert every known column with its ``convert`` method and index the frame by ID.

        The columns of ``df`` are overwritten in place; the returned frame is
        the result of ``set_index`` on the ID column.
        """
        for w in Dataset.Cols.as_set():
            df[w] = w.convert(df[w])
        return df.set_index(Dataset.Cols.id)

    class Cols:
        """Typed handles for all columns of the data set."""
        id = NumCol('ID')
        surfaceR = NumCol('SurfaceR')
        numberR = NumCol('NumberR')
        typeR = CatCol('TypeR')
        vegetationR = NumCol('VegetationR')
        surroundings1 = CatCol('Surroundings1')
        surroundings2 = CatCol('Surroundings2')
        surroundings3 = CatCol('Surroundings3')
        useR = NumCol('UseR')
        fishingR = NumCol('FishingR')
        acessR = NumCol('AcessR')
        roadDistanceR = NumCol('RoadDistanceR')
        buildingR = NumCol('BuildingR')
        pollutionR = NumCol('PollutionR')
        shoreR = CatCol('ShoreR')
        green_frogs = BoolCol('Green frogs')
        brown_frogs = BoolCol('Brown frogs')
        common_toad = BoolCol('Common toad')
        fire_toad = BoolCol('Fire-bellied toad')
        tree_frogs = BoolCol('Tree frog')
        common_newt = BoolCol('Common newt')
        great_newt = BoolCol('Great crested newt')
        __intset__ = None                        # cache for _intset_
        __transdict__ = None                     # cache for _transdict_

        @staticmethod
        def _intset_():
            """Return the cached frozenset of all Column members of this class."""
            if Dataset.Cols.__intset__ is None:
                Dataset.Cols.__intset__ = frozenset(
                    val for _, val in inspect.getmembers(Dataset.Cols, lambda attr: isinstance(attr, Column)))
            return Dataset.Cols.__intset__

        @staticmethod
        def _transdict_():
            """Return the cached dict that maps every column onto itself."""
            if Dataset.Cols.__transdict__ is None:
                Dataset.Cols.__transdict__ = {v: v for v in Dataset.Cols.as_set()}
            return Dataset.Cols.__transdict__

        @staticmethod
        def as_set():
            """Return a new mutable set with all columns."""
            return set(Dataset.Cols._intset_())

        @staticmethod
        def as_list():
            """Return all columns as a list in declaration order.

            Iterating the frozenset would yield a different order per kernel
            session (string hashing is randomised), so the class namespace,
            which preserves definition order, is used instead.
            """
            return [v for v in vars(Dataset.Cols).values() if isinstance(v, Column)]
    
    

In [None]:
# definitions

class TTSGroup:
    """Groups the four results of a train/test split into the pairs ``X`` and ``y``."""

    class TTS:
        """A training part together with its matching test part."""

        def __init__(self,train,test):
            self.train, self.test = train, test

    def __init__(self, X_train, X_test, y_train, y_test):
        pair = TTSGroup.TTS
        self.X = pair(X_train, X_test)           # descriptive features
        self.y = pair(y_train, y_test)           # target feature


In [None]:
# We are interested in the accuracy for each individual frog type rather than in the
# accuracy of the overall combination of frogs. This class extracts the individual
# accuracy scores automatically and implements the comparison operators using the sum
# of all individual scores.

class MultiClassAccs:
    """Per-target accuracy scores of a multi-output prediction.

    Attributes:
        individual_accs: dict mapping each column of ``true_vals`` to the
            accuracy of the matching position in every predicted row.
        total_combination_acc: accuracy of the complete rows.

    Instances compare by ``total()``. The other operand may be another object
    offering ``total()`` or a plain number.
    """

    def __init__(self, true_vals, predicted_vals):
        # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0
        self.individual_accs = {
            feature: metrics.accuracy_score(vals, [row[i] for row in predicted_vals])
            for i, (feature, vals) in enumerate(true_vals.items())
        }
        self.total_combination_acc = metrics.accuracy_score(true_vals, predicted_vals)

    def total(self):
        """Return the sum of all individual accuracy scores."""
        return sum(self.individual_accs.values())

    @staticmethod
    def _as_total(other):
        """Return ``other.total()`` if available, otherwise ``other`` itself (a number)."""
        total = getattr(other, 'total', None)
        return total() if callable(total) else other

    def __lt__(self, other):
        return self.total() < MultiClassAccs._as_total(other)

    def __le__(self,other):
        return self.total() <= MultiClassAccs._as_total(other)

    def __eq__(self, other):
        return self.total() == MultiClassAccs._as_total(other)

    def __ne__(self,other):
        return self.total() != MultiClassAccs._as_total(other)

    def __gt__(self,other):
        return self.total() > MultiClassAccs._as_total(other)

    def __ge__(self,other):
        return self.total() >= MultiClassAccs._as_total(other)

    def __str__(self):
        return "Combined: {}, {}".format(self.total_combination_acc, str(self.individual_accs))

In [None]:
#_ = Dataset.load_original(True)
#_ = Dataset.load_sampled(True)

## Preprocessing of the Dataset (5 points)
 Carry out the following preprocessing steps before starting the analysis:
 - Select 90% of dataset provided for this assignment by random sampling.
     - Use one of the group member's student numbers as a seed.
     - Rename the new generated dataset (which contains 90% of the data) to "sampled_data".
 - <font color='red'>Important!</font>  Export your *sampled_data* dataset and submit it with your assignment solution.
 - If it is not otherwise mentioned, you should always use your below created *sampled_data* as input for the questions.

In [None]:
# Draw the 90% sample from the original data, seeded with rand_seed for reproducibility,
# and export it under the file name that Dataset.load_sampled() reads.
orig_ds = Dataset.load_original()                                  # loading original dataset 
sampled_ds = orig_ds.sample(frac=0.9, random_state=rand_seed)      # random sample containing 90% of the rows
sampled_ds.to_csv(Dataset.sampled_file_name, index=True)           # index=True writes the index alongside the data columns

## Question 1 - Insights into the Data (15 points):

   (a)  Generate a dataset by removing those rows of the sampled_data dataset for which the value of "SurfaceR" is equal or bigger than 50000. Let's call this data set "new_sampled_data".

In [None]:
class Q1_a:
    """Q1 (a): keep only the rows whose SurfaceR lies below 50000."""

    def run(self, base):
        """Store the filtered frame in ``self.new_sampled_data`` and return ``self``."""
        surface = base[Dataset.Cols.surfaceR]
        # rows with a SurfaceR equal to or greater than 50000 are removed
        self.new_sampled_data = base.loc[surface < 50000]
        return self

q1_a = Q1_a()
q1_a.run(Dataset.load_sampled())
q1_a.new_sampled_data.describe()

# export data in case it is required by automated testing
new_sampled_data = q1_a.new_sampled_data
new_sampled_data.to_csv("new_sampled_data.csv")

   (b)  Use a boxplot to find and remove the outliers from "SurfaceR". Note that based on the boxplot the values greater than the upper-whisker and lower than the lower-whisker are considered as outliers. Let's call the dataset after removing the outliers "cleaned_data". Now you should  have three datasets (sampled_data, new_sampled_data, and cleaned_data). 

In [None]:
class Q1_b:
    """Q1 (b): remove the SurfaceR outliers, i.e. values beyond the box plot whiskers."""

    def run(self, df, df_name):
        """Filter the outliers of ``df`` and plot the data before and after.

        Args:
            df: frame with a 'SurfaceR' column.
            df_name: name of ``df``, used to label the plots.

        Returns:
            self; the filtered frame is stored in ``self.cleaned_data``.
        """
        surface = df['SurfaceR']
        lower_quartile = surface.quantile(0.25)
        upper_quartile = surface.quantile(0.75)
        iqr = upper_quartile - lower_quartile                               # interquartile range

        # keep only the values inside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
        within_whiskers = (surface >= lower_quartile - 1.5 * iqr) & (surface <= upper_quartile + 1.5 * iqr)
        self.cleaned_data = df.loc[within_whiskers]
        cd = 'cleaned_data'                                                  # data set without outliers
        sax = self.cleaned_data.boxplot('SurfaceR', return_type='axes')      # box plot for cleaned_data
        sax.set_xlabel(cd)

        p, (ax1, ax2) = plt.subplots(1,2,sharey=True)
        # the title names the actual input frame instead of a hard-coded data set name
        p.suptitle("Side-by-side comparison of {} and {}".format(df_name, cd))
        df.boxplot('SurfaceR', ax=ax1)                                       # box plot for comparison
        ax1.set_xlabel(df_name)
        self.cleaned_data.boxplot('SurfaceR',ax=ax2)
        ax2.set_xlabel(cd)
        p.tight_layout()
        return self

q1_b = Q1_b().run(new_sampled_data, "new_sampled_data")

# export data in case it is required by automated testing
cleaned_data = q1_b.cleaned_data
cleaned_data.to_csv("cleaned_data.csv")

   (c) Compare basic statistical features of "SurfaceR" (median, mean, and mode, standard deviation, variance) in the new_sampled_data and cleaned_data datasets.    Interpret the differences for these statistical values between the cleaned_data and new_sampled_data datasets. Explain why the statistics of these two datasets are different.

In [None]:
# Statistical features of "SurfaceR" in cleaned_data and new_sampled_data

class Q1_c:
    """Q1 (c): compare basic statistics of SurfaceR between several data sets."""

    target_col = Dataset.Cols.surfaceR

    def run(self,**dataframes):
        """Build ``self.compared`` with one column of statistics per keyword argument.

        Args:
            **dataframes: data set name -> frame containing the target column.

        Returns:
            self
        """
        self.compared = pd.concat({name:self.get_stat_features(df) for name, df in dataframes.items()}, axis=1)
        return self

    def get_stat_features(self, df):
        """Return median, mean, mode (with its count), standard deviation and variance as a Series."""
        target = df[Q1_c.target_col]
        md = stats.mode(target)
        # Depending on the SciPy version the result fields are length-1 arrays or
        # plain scalars; np.atleast_1d makes the indexing work for both.
        mode_value = np.atleast_1d(md.mode)[0]
        mode_count = np.atleast_1d(md.count)[0]
        add = pd.Series([np.median(target),np.mean(target),
                         "{} (#{})".format(mode_value, mode_count),
                         np.std(target), np.var(target) ],
                        index=['median', 'mean', 'mode', 'standard deviation','variance'  ])

        return add

# use exported data in case it is required by automated testing
q1_c = Q1_c().run(new_sampled_data = new_sampled_data, cleaned_data = cleaned_data)
q1_c.compared

Explanation: The outliers have a huge influence on the statistical features. The box plots of both data sets show that some values of SurfaceR in new_sampled_data reach as far as 40000, which is well beyond the interquartile range (IQR) and outside the upper whisker. These data samples would skew our decision boundary, even though the actual cluster of the data does not lie in that direction. 
This results in a very high variance and standard deviation, because outliers increase the value of $|x - \bar{x}|^2$. The outliers also increase the mean and the median, whereas the mode is unaltered since it lies well within the IQR.

### Basic Visualization (10 points)
(d) Visualize mean and median of "SurfaceR" in the cleaned dataset. Specify the "Surroundings3" values for which the mean and median of "SurfaceR" is maximal and for which it is minimal.

In [None]:
# visualisation of cleaned_data

class Q1_d:
    """Q1 (d): visualise mean and median of SurfaceR, overall and per Surroundings3 value."""

    target_col = Dataset.Cols.surfaceR
    values_col = Dataset.Cols.surroundings3


    def run(self, cleaned_df):
        """Draw all plots for ``cleaned_df``, fill ``self.result_df`` and return ``self``."""
        self.scatter(cleaned_df)
        self.SurroundingsMetric(cleaned_df)
        self.bars(cleaned_df)
        return self

    def scatter(self, df):
        """Plot the target values as points, with horizontal lines at their mean and median."""
        plt.plot(df[Q1_d.target_col], 'o', label=Q1_d.target_col, color="#F5B08B")
        plt.title("Mean and median of {}".format(Q1_d.target_col))
        plt.axhline(df[Q1_d.target_col].mean(), label="mean", color="red")
        plt.axhline(df[Q1_d.target_col].median(), label="median", color="blue")
        plt.legend()

    def bars(self, df):
        """Plot mean (left) and median (right) of the target per value of ``values_col``."""
        fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
        # suptitle is a method: it has to be called, not overwritten with a string
        fig.suptitle("Analysis of {} values vs. {}".format(Q1_d.values_col, Q1_d.target_col))
        ax1.title.set_text("Mean")                                                         # visualising mean
        # plot the frame passed in (df), not the global cleaned_data
        sns.barplot(x=Q1_d.values_col, y =Q1_d.target_col, data = df, estimator = np.mean, ax=ax1)
        ax2.title.set_text("Median")                                                       # visualising median
        sns.barplot(x=Q1_d.values_col, y =Q1_d.target_col, data = df, estimator = np.median, ax=ax2)

    def SurroundingsMetric(self, cleaned_df):
        """Store in ``self.result_df`` the ``values_col`` values with minimal and maximal mean and median."""
        df = cleaned_df.loc[:,[Q1_d.target_col, Q1_d.values_col]]
        mean = df.groupby(Q1_d.values_col).mean()
        med = df.groupby(Q1_d.values_col).median()
        # idxmin/idxmax return a Series labelled by column name; take its only entry by position
        first = lambda extreme: extreme.iloc[0]
        metric_col = 'metric'
        mn = 'mean'
        md = 'median'
        ex_col = 'extreme'
        mini = 'min'
        maxi = 'max'
        extremes = (mean.idxmin(), mean.idxmax(), med.idxmin(), med.idxmax())
        self.result_df = pd.DataFrame({Q1_d.values_col:[first(ex) for ex in extremes],
                                       metric_col:[mn, mn, md, md],
                                       ex_col:[mini, maxi, mini, maxi]})

# use exported data in case it is required by automated testing
q1_d= Q1_d().run(cleaned_data)
q1_d.result_df

Explanation: The mean and the median of `SurfaceR` are both minimal for the `Surroundings3` value 'd' and both maximal for the `Surroundings3` value 'i'.

   (e) Plot the distribution of "SurfaceR" in the new_sampled_data and cleaned_data datasets.

In [None]:
class Q1_e:
    """Q1 (e): distribution of SurfaceR in new_sampled_data and cleaned_data."""

    target_col = Dataset.Cols.surfaceR

    def run(self, new_sampled_df, cleaned_df):
        """Draw one histogram with KDE curve per data set on stacked, shared axes; return ``self``."""
        fig, axes = plt.subplots(2,1, sharex=True, sharey=True)
        panels = ((new_sampled_df, "blue", "New sampled data"),
                  (cleaned_df, "red", "Cleaned data"))
        for ax, (frame, colour, title) in zip(axes, panels):
            sns.histplot(frame[Q1_e.target_col], color=colour, kde=True, ax=ax)
            ax.title.set_text(title)
        fig.tight_layout()
        plt.show()
        return self

q1_e = Q1_e()
q1_e.run(new_sampled_data, cleaned_data)

   (f) Explore the distribution of "SurfaceR" and "AcessR" together in the new_sampled_data and cleaned_data datasets. Specify the ranges of "SurfaceR" and "AcessR" for which the frequency of the data is the highest.

In [None]:
# distribution of "SurfaceR" and "AcessR" together

class Q1_f:
    """Q1 (f): joint distribution of SurfaceR and AcessR in several data sets."""

    coi = [Dataset.Cols.surfaceR, Dataset.Cols.acessR]                 # columns of interest

    def run(self, **dataframes):
        """Draw the count plots and the density plots for every named frame; return ``self``.

        Args:
            **dataframes: data set name -> frame containing both columns of interest.
        """
        self.bars(dataframes)
        self.dists(dataframes)
        return self

    def dists(self, dataframes):
        """Draw one two-dimensional KDE plot of both columns per data set."""
        # squeeze=False keeps a 2-D axes array for any number of data sets
        p, axs = plt.subplots(1, len(dataframes), sharey=True, squeeze=False)
        for i, (name, df) in enumerate(dataframes.items()):
            sns.kdeplot(x=Q1_f.coi[0], y = Q1_f.coi[1], data = df, ax = axs[0][i])
            axs[0][i].set_title(name)
        p.tight_layout()                                               # the call was missing its parentheses


    def bars(self, dataframes):
        """For each ordering of the two columns, draw one figure with a panel per data set."""
        for x_val, y_val in itertools.permutations(Q1_f.coi,2):
            fig, axes = plt.subplots(1,len(dataframes), figsize=(16,9), sharex=True, sharey=True, squeeze=False)
            fig.suptitle(y_val + " count by " + x_val + " value")
            for i, (name, df) in enumerate(dataframes.items()):
                Q1_f.process_df(df, axes[0][i], x_val,y_val)
                axes[0][i].set_title(name)
            fig.tight_layout()

    @staticmethod
    def process_df(dfr, ax, x_val, y_val):
        """Count the ``y_val`` entries per ``x_val`` value and draw a histogram of the counts on ``ax``."""
        df = dfr.loc[:, [x_val, y_val]]
        data = df.groupby(x_val).agg('count')
        ax.hist(data)
        ax.set_xlabel(x_val)
        ax.set_ylabel(y_val + " count")

q1_f = Q1_f().run(new_sampled_data = new_sampled_data, cleaned_data = cleaned_data)

Explanation:

For both datasets, the highest frequency happens when AcessR = 100 and SurfaceR = 0.

## Question 2 - Decision Trees (15 points):

   (a) Add a categorical column "number_frogs" to the new_sampled_data which indicate the number of different frogs in each region (row). For example, if in a row we have:
       - "Green frogs" = 1, "Brown frogs" = 1, "Common toad" = 0, "Fire-bellied toad" = 0, "Tree frog" = 0, "Common newt" = 0, and "Great crested newt" = 0, then "number_frogs" = 'two'.
       - "Green frogs" = 1, "Brown frogs" = 1, "Common toad" = 0, "Fire-bellied toad" = 0, "Tree frog" = 1, "Common newt" = 1, and "Great crested newt" = 0, then "number_frogs" = 'four'.

In [None]:
class Q2:
    """Names shared by the Question 2 cells."""
    
    class Cols:
        """Columns added to the data set for Question 2."""
        frogc = CatCol('number_frogs')                        # new categorical column for the number of frogs

In [None]:
class Q2_a:
    """Q2 (a): add the categorical column 'number_frogs' with the frog count as a word."""

    rows = frozenset({Dataset.Cols.green_frogs,               # columns for frog types
                      Dataset.Cols.brown_frogs,
                      Dataset.Cols.common_toad,
                      Dataset.Cols.fire_toad,
                      Dataset.Cols.tree_frogs,
                      Dataset.Cols.common_newt,
                      Dataset.Cols.great_newt})

    def run(self, df):
        """Add the column to ``df`` in place, store the frame in ``self.categorized_data``, return ``self``.

        Args:
            df: frame containing all frog columns; pass a copy to keep the original untouched.
        """
        p = inflect.engine()                                  # converts numbers to words
        # pandas does not accept a set as an indexer, hence the list
        frog_counts = df[list(Q2_a.rows)].sum(axis=1)
        # int(): hand a plain Python integer to inflect rather than a numpy integer
        frog_words = frog_counts.map(lambda count: p.number_to_words(int(count)))
        df[Q2.Cols.frogc] = Q2.Cols.frogc.convert(frog_words)
        self.categorized_data = df
        return self

q2_a = Q2_a().run(new_sampled_data.copy(deep=True))
q2_a.categorized_data.head(2)

   (b) In the new dataset (created in Section 'a'), consider "TypeR", "VegetationR", "Surroundings1", "Surroundings2", "Surroundings3" as    the descriptive features and "number_frogs" as the target feature. Generate two decision trees. Let's call them "tree1" and "tree2". In tree1 set the minimum number of samples for splitting to 15 and in tree2 set the minimum number of samples for splitting to 1. Create both decision trees based on entropy.

In [None]:
class Q2_b:
    """Q2 (b): build two entropy-based decision trees that differ in the minimum split size."""

    descriptive_features= [Dataset.Cols.typeR,                            # descriptive features
                           Dataset.Cols.vegetationR,
                           Dataset.Cols.surroundings1,
                           Dataset.Cols.surroundings2,
                           Dataset.Cols.surroundings3]
    target_feature = Q2.Cols.frogc                                        # target feature

    def run(self, df):
        """Build ``self.tree1`` (minimum of 15 samples per split) and ``self.tree2`` (minimum of 1)."""
        build = Q2_b.descisionTree
        self.tree1= build(df, 15)
        self.tree2= build(df, 1)
        return self

    def descisionTree(df, min_split):
        """Return a DecisionTree built with ``id3(0, min_split)`` on the configured features."""
        inputs = df[Q2_b.descriptive_features]
        # every descriptive value is handed over as a string
        descriptive_data = [[str(value) for value in row] for row in inputs.values.tolist()]
        feature_names = inputs.columns.tolist()
        target_data = df[Q2_b.target_feature].values.tolist()
        model = DecisionTree(descriptive_data, feature_names, target_data, "entropy")
        model.id3(0,min_split)

        return model

q2_b = Q2_b()
q2_b.run(q2_a.categorized_data)
tree1 = q2_b.tree1
tree2 = q2_b.tree2

In [None]:
_ = tree1.print_visualTree(render=True)                                   # visualise tree1 (built with a minimum of 15 samples per split)

In [None]:
_ = tree2.print_visualTree(render=True)                                   # visualise tree2 (built with a minimum of 1 sample per split)

In [None]:
# printing the entropy attribute of both decision trees

print("Entropy tree1: {}".format(tree1.entropy))
print("Entropy tree2: {}".format(tree2.entropy))

   (c) Consider tree1. What is the best attribute (based on entropy) for splitting the tree in the second round of ID3 regarding the value of the attribute chosen in the first round of ID3?    

Explanation: In the first round of ID3 the maximum information gain (least entropy) is found for the feature "TypeR", so it is used for splitting. In the second round "Surroundings3" has the least entropy (maximum information gain), so the "Surroundings3" feature is used for splitting the tree.

   (d) Compare tree1 and tree2 in terms of the possibility of overfitting and the complexity of the decision trees.

Explanation: Since the minimum number of samples for splitting is set to 1 in tree2, tree2 overfits: the samples are split into their classes individually (even per sample), so generality is lost and the decision tree becomes complex. In tree1 we observe a good fit, because at least 15 samples are needed for the next split, which maintains generality.

## Question 3 - Regression (14 points):

For this question (Q3), create and use a restricted dataset by removing the columns "ID", "NumberR", "Surrounding1", "Surrounding2", "Surrounding3", "Common toad", "Fire-bellied toad", "Tree frog", "Common newt", "Great crested newt" from the sampled_data.

In this question, we consider "Green frogs" and "Brown frogs" to be potential target features, while all other features are potential descriptive features.

In [None]:
class Q3:
    """Restricted data set for Question 3 together with its target and descriptive features."""

    # target features
    target_features=[Dataset.Cols.green_frogs, Dataset.Cols.brown_frogs]

    # columns removed to obtain the restricted data set
    excluded_columns=[Dataset.Cols.numberR, Dataset.Cols.surroundings1,
                      Dataset.Cols.surroundings2, Dataset.Cols.surroundings3,
                      Dataset.Cols.common_toad, Dataset.Cols.fire_toad,
                      Dataset.Cols.tree_frogs, Dataset.Cols.common_newt, Dataset.Cols.great_newt]

    def __init__(self, base_df):
        """Derive the descriptive features and drop the excluded columns from ``base_df``."""
        candidates = []
        for c in Dataset.Cols.as_list():
            if c in Q3.excluded_columns or c in Q3.target_features:
                continue
            if c is Dataset.Cols.id:
                continue
            candidates.append(c)
        self.descriptive_features = candidates
        # creating restricted data set
        self.df = base_df.drop(columns= self.excluded_columns)


q3 = Q3(Dataset.load_sampled())
print("Descriptive features: {}".format(q3.descriptive_features))
print("Target features: {}".format(q3.target_features))
q3.df.to_csv("restricted_data.csv")
q3.df.head(2)

   (a) Which features are suitable as input for logistic regression? Which would need to be modified first? Explain your answers.

Explanation: Logistic regression can handle continuous features directly and categorical features after modification. Suitable inputs for logistic regression are continuous variables. Since the descriptive features "TypeR" and "ShoreR" contain categorical data, we have to modify them first, using one-hot encoding (dummy variables).

           Features suitable for input: SurfaceR, VegetationR, UseR, FishingR, AcessR, RoadDistanceR, BuildingR, PollutionR

           Features that need modification: TypeR, ShoreR

   (b) Implement and briefly motivate an adequate modification. Print the resulting data set limited to the first two data rows. 

In [None]:
# one hot encoding (creating dummy columns) for categorical descriptive features

class Q3_b:
    """Q3 (b): one-hot encode the restricted data set."""

    def run(self, base_df):
        """Store the encoded frame and the names of its non-target columns; return ``self``."""
        encoded = pd.get_dummies(base_df)
        self.encoded_df = encoded
        self.encoded_inputs = [name for name in encoded.columns.values
                               if name not in Q3.target_features]
        return self

q3_b = Q3_b()
q3_b.run(q3.df)
q3_b.encoded_df.head(2)

Explanation: As explained in the previous cell, "TypeR" is a categorical feature. Using pandas.get_dummies(...), we obtain a one-hot encoding with 0/1 values as feature values, which we can now use for logistic regression.

(c) We want to predict the presence of green frogs and brown frogs in the habitat, using a distinct logistic regression classifier for each frog type. 

Consider the set of features available in this question's unmodified data set (that is before Q3b). To get an overview of the data, choose and present some basic visualization as discussed in the lectures (e.g.  scatter matrix, scatter plots, charts, etc.). Based on this visualization, for each frog type choose the 4 most promising descriptive features to predict the presence of that frog type in the habitat. 

Explain your strategy and choices.

In [None]:
# Various plots for visualisation that help in choosing promising descriptive features

class Q3_c:
    """Q3 (c): overview plots of the descriptive features against each target feature."""

    def __init__(self, df, descriptive_features=None):
        """
        Args:
            df: restricted data set (descriptive and target columns).
            descriptive_features: columns to plot; defaults to every column of
                ``df`` that is not a target feature, so the class no longer
                depends on the global ``q3`` object.
        """
        self.df = df
        if descriptive_features is None:
            descriptive_features = [c for c in df.columns if c not in Q3.target_features]
        self.descriptive_features = list(descriptive_features)

    def run_pairplots(self):
        """Per target, draw a histogram pair plot of all features and a KDE pair plot of the numeric ones."""
        numeric_features = [c for c in self.descriptive_features
                            if pd.api.types.is_numeric_dtype(self.df[c].dtypes)]
        for target in Q3.target_features:
            # the first word of the target name ('Green'/'Brown') is used as the colour for presence
            defaults = {"hue":target, "palette":{1:target.split(' ')[0], 0:'grey'}, "corner":True}
            sns.pairplot(self.df,vars=self.descriptive_features, kind='hist', **defaults)
            sns.pairplot(self.df,vars=numeric_features, kind='kde', **defaults)


    def run_smalplots(self):
        """Draw a grid of bar plots: one row per target, one column per selected feature."""
        feats = [Dataset.Cols.typeR, Dataset.Cols.fishingR, Dataset.Cols.surfaceR, Dataset.Cols.pollutionR]
        p, ax = plt.subplots(len(Q3.target_features), len(feats), figsize=(16,9))
        for i, target in enumerate(Q3.target_features):
            for n, feature in enumerate(feats):
                sns.barplot(x=feature, y=target, data=self.df, ax=ax[i][n])
        p.tight_layout()

q3_c = Q3_c(q3.df)

In [None]:
q3_c.run_pairplots()

In [None]:
q3_c.run_smalplots()

Explanation: Based on the visualization plots shown above, we should consider the features that have a high correlation with the target features. In the pair plots we can observe that both green and brown frogs have a high correlation with:
1. Pollution 
2. TypeR 
3. FishingR
4. AcessR

   (d) For both frog types, train a logistic regression classifier to predict the presence of that frog type in the habitat. Use the descriptive features as chosen in Q3c. Apply the modification from Q3b if needed.

In [None]:
# training a logistic regression classifier to predict the presence of green frog and brown frog in the habitat

class Q3_d:
    """Q3 (d): one logistic regression classifier per target feature.

    After ``run()``, ``target_names``, ``classifiers`` and ``predictions`` are
    parallel lists holding one entry per target, in the order of ``targets``.
    """

    def __init__(self, targets, descriptive, df):
        """
        Args:
            targets: target column names.
            descriptive: descriptive column names used as classifier input.
            df: encoded data set containing all of these columns.
        """
        self.targets = targets
        self.descriptive = descriptive
        self.df = df
        self.target_names = list()
        self.classifiers = list()
        self.predictions = list()


    def run(self):
        """Fit one classifier per target on the whole frame and predict on the same rows; return ``self``."""
        df = self.df
        # start from empty result lists so that a repeated run() does not duplicate entries
        self.target_names = list()
        self.classifiers = list()
        self.predictions = list()
        for target in self.targets:
            # multi_class is omitted: every target is binary, and the parameter is
            # deprecated in recent scikit-learn releases
            classifier = LogisticRegression(solver = 'liblinear', random_state=rand_seed)
            classifier.fit(df[self.descriptive], df[target])
            prediction = classifier.predict(df[self.descriptive])
            self.target_names.append(target)
            self.classifiers.append(classifier)
            self.predictions.append(prediction)
        return self

# descriptive features selected in Q3c; for the categorical feature TypeR the dummy columns are used

q3_d = Q3_d(Q3.target_features, [f for f in q3_b.encoded_inputs if f in {Dataset.Cols.acessR, Dataset.Cols.fishingR, Dataset.Cols.pollutionR} or f.startswith(Dataset.Cols.typeR)], q3_b.encoded_df).run()

In [None]:
# coefficients, intercept and accuracy on the training rows, one column per target
pd.DataFrame(
    {
        target: [clf.coef_, clf.intercept_, clf.score(q3_d.df[q3_d.descriptive], q3_d.df[target])]
        for target, clf in zip(q3_d.targets, q3_d.classifiers)
    },
    index=['Coefficient', 'Intercept', 'Score'],
)

   (e) For each of the two trained classifiers compute and print the confusion matrix.

In [None]:
# one confusion matrix per target: true values against the stored predictions

for i, target in enumerate(q3_d.targets):
    print('Confusion Matrix for {}:'.format(target))
    print(metrics.confusion_matrix(q3_d.df[target], q3_d.predictions[i]))

(f) Based on the information computed in Q3 so far, interpret and evaluate the two models and compare them. Why are they similar/different? Would you recommend the models and why (not)? How do you think the applied methods could be improved to get better results?

In [None]:
# comparing models: accuracy and classification report per target

for i, target in enumerate(q3_d.targets):
    df = q3_d.df
    accuracy = q3_d.classifiers[i].score(df[q3_d.descriptive], df[target])
    report = metrics.classification_report(df[target], q3_d.predictions[i])
    print('##################', target, sep='\n')
    print('Accuracy: ', accuracy)
    print('Classification report:')
    print(report)


Explanation: For the first case (Green frogs) we obtain a score of 74.11%. The confusion matrices show one similarity: the number of true positives is high in both cases. In contrast, the number of true negatives is high only in the first case; in the second case (Brown frogs), although the score is 77.64%, there are many false positives and far fewer true negatives. Hence these attributes predict the presence of "Green frogs" in the habitat better than that of "Brown frogs", and we recommend them for Green frogs. To predict Brown frogs better we need to add other features, and more training data could be useful as well. From the classification report we can see that the presence of both Green and Brown frogs in the habitat is well represented, but the absence of Brown frogs in the habitat is poorly represented.

## Question 4 - Support Vector Machines (8 points):

For this question (Q4), restrict your data set to the same features as in Q3. Similar to Q3, we want to train two distinct classifiers predicting the presence of green frogs and brown frogs in the habitat. 

In this question, we will use SVMs instead of logistic regression. In the following, consider *Green frogs* and *Brown frogs* to be potential target features, while all other features are potential descriptive features.

In [None]:
# selecting descriptive features and target features

class Q4:
    """Restricted data set for Question 4; same targets and excluded columns as Question 3."""

    target_features= Q3.target_features
    excluded_columns=Q3.excluded_columns

    def __init__(self, base_df):
        """Derive the descriptive features and drop the excluded columns from ``base_df``."""
        candidates = []
        for c in Dataset.Cols.as_list():
            if c in Q4.excluded_columns or c in Q4.target_features:
                continue
            if c is Dataset.Cols.id:
                continue
            candidates.append(c)
        self.descriptive_features = candidates
        self.df = base_df.drop(columns= self.excluded_columns)

q4 = Q4(Dataset.load_sampled())
q4.df.head(2)

   (a) Which of the potential descriptive features are suitable as an input for SVMs and which need to be modified first? Modify the data as needed and provide a brief explanation. Print the first two data rows of the modified data set.

In [None]:
# one hot encoding (creating dummy columns) for categorical descriptive features

class Q4_a:
    """One-hot encodes the data frame so it can be fed to the SVMs."""

    def run(self, base_df):
        """Encode `base_df` with pd.get_dummies and remember the input columns.

        Sets `encoded_df` (the encoded frame) and `encoded_inputs` (every encoded
        column name that is not one of the Q4 target features). Returns self.
        """
        encoded = pd.get_dummies(base_df)
        self.encoded_df = encoded
        input_names = []
        for column_name in encoded.columns.values:
            if column_name not in Q4.target_features:
                input_names.append(column_name)
        self.encoded_inputs = input_names
        return self


q4_a = Q4_a().run(q4.df)
q4_a.encoded_df.head(2)


Explanation: The input for an SVM should also consist of continuous descriptive features, just like in Regression. So we use the same kind of data frame as in Regression, where categorical features like "ShoreR" and "TypeR" are converted into numerical data by means of one-hot encoding.

   (b) For each frog type, consider the same set of 4 descriptive features as chosen in Q3 c). Generate for both target features a training and test set based on all data rows (for example, consider the sampling strategies as explained in the lecture) of the restricted data set. Briefly explain and motivate the choice of the sampling strategy as well as the size of the training and test set.

In [None]:
# splitting the data into training and test set,

class Q4_b:
    """Creates one train/test split per frog target."""

    # Settings passed to every train_test_split call: a third of the rows go
    # to the test part, rows are shuffled using the notebook-wide seed.
    defaults = dict(test_size=0.33, shuffle=True, random_state=rand_seed)

    def run(self, df, descriptives, green_target, brown_target):
        """Split `df` once for the green and once for the brown target.

        df:           encoded data frame holding inputs and targets
        descriptives: names of the input columns
        green_target / brown_target: names of the two target columns
        Stores the splits as `green_set` and `brown_set` and returns self.
        """
        inputs = df[descriptives]
        for attribute, target in (("green_set", green_target), ("brown_set", brown_target)):
            parts = train_test_split(inputs, df[target], **Q4_b.defaults)
            setattr(self, attribute, TTSGroup(*parts))
        return self


q4_b = Q4_b().run(q4_a.encoded_df, q4_a.encoded_inputs, Dataset.Cols.green_frogs, Dataset.Cols.brown_frogs)

Explanation: Here we have split the data into a training and a test set: 1/3 of the shuffled data is separated and used for testing, while the remaining 2/3 is used for training. In the lecture it was advised to shuffle the data, since the order in which the data is stored in the data frame could be biased. Shuffling removes any effect that results from the way the data is stored.

   (c) Use the training set to train 4 different SVMs (2 per frog type) with different parameter combinations. Use at least two distinct values for the parameters *kernel* and *C*.

*Hint: depending on the size of the training data and chosen parameters, training the SVMs may take some time.*

In [None]:
class Q4_c:
    """Builds and trains the four SVMs of Q4 (two per frog type)."""

    # Keyword arguments shared by every SVC created in make_svms.
    defaults = {"gamma":'auto', "random_state":rand_seed}

    def __init__(self):
        self.greens = list()            # SVMs predicting the green-frog target
        self.browns = list()            # SVMs predicting the brown-frog target

    def run(self, green_set, brown_set):
        """Create the SVMs and fit them on the training part of each split.

        green_set / brown_set: train/test groups; only `.X.train` and
        `.y.train` are read here. Returns self so the call can be chained.
        """
        self.make_svms()
        Q4_c.train_svms_for_target(self.greens, green_set)                 # for green frog
        Q4_c.train_svms_for_target(self.browns, brown_set)                 # for brown frog
        return self

    @staticmethod
    def train_svms_for_target(svms, tts):
        """Fit every SVM in `svms` on the training data of `tts`.

        Declared static because it takes no instance; it can be called on the
        class (as run() does) as well as on an instance.
        """
        for svm in svms:
            svm.fit(tts.X.train, tts.y.train)                              # training SVM

    def make_svms(self):
        """(Re)create the four untrained SVMs (2 per frog type).

        The lists are replaced instead of appended to, so calling run() a
        second time does not pile up duplicate models.
        """
        self.greens = [
            SVC(C=1,  kernel='sigmoid', **Q4_c.defaults),
            SVC(C=10, kernel='linear',  **Q4_c.defaults),
        ]
        self.browns = [
            SVC(C=5,  kernel='linear',  **Q4_c.defaults),
            SVC(C=10, kernel='rbf',     **Q4_c.defaults),
        ]


q4_c = Q4_c().run(q4_b.green_set, q4_b.brown_set)

   (d) Compute and print the mean accuracy and the classification report of the trained SVMs with respect to the test set (see instruction for examples).

In [None]:
# printing mean accuracy and classification report of the trained SVMs

class Q4_d:
    """Prints accuracy and classification report of the trained SVMs on the test data."""

    def __init__(self, green_set, brown_set):
        self.green_set = green_set      # train/test group of the green-frog target
        self.brown_set = brown_set      # train/test group of the brown-frog target

    def run(self, greens, browns):
        """Report on the green-frog SVMs, then on the brown-frog SVMs. Returns self."""
        Q4_d.run_single("Green Frogs", self.green_set, greens)
        Q4_d.run_single("Brown Frogs", self.brown_set, browns)
        return self

    @staticmethod
    def run_single(name, data, svms):
        """Print the test-set metrics of every SVM in `svms`.

        name: heading printed above the reports
        data: train/test group; only `.X.test` and `.y.test` are read
        svms: already fitted classifiers
        """
        print('#####################')
        print(name)
        for i, svm in enumerate(svms):
            print('------------')
            print('----Model {}:'.format(i+1))
            # predict once and derive both the accuracy and the report from it
            pred = svm.predict(data.X.test)
            print('\tAcc: {}'.format(metrics.accuracy_score(data.y.test, pred)))
            print('Classification report:')
            print(metrics.classification_report(data.y.test, pred))


q4_d = Q4_d(q4_b.green_set, q4_b.brown_set).run(q4_c.greens, q4_c.browns)

   (e) Based on the information computed in Q4 so far, interpret and evaluate the 4 SVMs and compare them. Why are they similar/different? Would you recommend using these SVMs and why (not)?

Explanation: From the results presented above (in the output), we can observe that the accuracy is around 80% for Green Frogs (also, linear and rbf kernels record better metrics than a poly fit), but it is much lower for Brown frogs. Further, the other metrics such as recall, accuracy, f1 score etc. are better for Green frogs and especially poor for the case that Brown frogs are absent from the habitat, while the presence of both green and brown frogs in the habitat is predicted well.
But if we compare the classification_report for Regression and SVM, we see that the presence of frogs in both habitats is predicted well by both SVM and Regression, whereas the absence of Brown frogs, although still poorly predicted, is better represented in Regression.


So for this particular case, we prefer Regression


## Question 5 - Neural Networks (15 points)
In this question consider the sampled_data, which is the dataset that you have created in the *Preprocessing of Dataset* section. The target features are the *different frogs*.

   (a) What are the possible inputs of your network?
   
     - Give the number of possible values of the different categorical inputs.
     - Give the number of possible input patterns for the categorical data..

In [None]:
class Q5_a:
    """Lists the categorical inputs, their possible values and the number of input patterns."""

    def run(self, df:pd.DataFrame):
        """Display the value table (A) and the number of value combinations (B).

        df: data frame containing all categorical columns declared in Dataset.Cols.
        Returns self.
        """
        # Sort the names so the table has the same row order on every run
        # (as_set() presumably returns an unordered set).
        categorical_columns = sorted(c for c in Dataset.Cols.as_set() if type(c) is CatCol)

        # column names which are used multiple times, kept as variables to avoid typos
        feat_name_col, val_count_col = 'feature', 'unique value count'

        rows = []
        for col in categorical_columns:
            values = df[col].unique()
            # str() keeps the join working when the category values are not strings
            rows.append([col, len(values), ', '.join(str(v) for v in sorted(values))])

        features = pd.DataFrame(rows, columns=[feat_name_col, val_count_col, 'possible values']).set_index(feat_name_col)

        display('(A) Possible feature values:', features)                                  # print results
        # start value 1 keeps the product defined even without any categorical column
        combinations = func.reduce(operator.mul, features[val_count_col], 1)
        display('(B) Number of possible combinations between all categorical data: {}'.format(combinations))
        return self


q5_a = Q5_a().run(Dataset.load_sampled())


Explanation:

**Table (A)** shows all possible values of the 5 categorical values.

**Calculation (B)** shows the possible combinations between all categorical features.



   (b) Choose one categorical feature and two non-categorical features as input features. Create a data set with those features and the target columns (different frogs). Name this data set *NN_data*.

In [None]:
class Q5_b:
    """Builds NN_data: the chosen input features plus the frog target columns."""

    # chosen input features
    input_features = [
        Dataset.Cols.acessR,
        Dataset.Cols.typeR,
        Dataset.Cols.fishingR,
    ]
    # chosen target features (the different frogs)
    target_features = [
        Dataset.Cols.green_frogs,
        Dataset.Cols.tree_frogs,
        Dataset.Cols.brown_frogs,
    ]

    def run(self, df:pd.DataFrame):
        """Select inputs followed by targets from `df` into `self.NN_data`; returns self."""
        wanted = [*Q5_b.input_features, *Q5_b.target_features]
        self.NN_data = df[wanted]
        return self


q5_b = Q5_b().run(Dataset.load_sampled())
NN_data = q5_b.NN_data                                   # keep the result under the name required by the task

   (c) Convert the features that need to be converted using One-Hot-Encoding. Explain why you need (not) to convert these features. Name the data set *NN_data_encoded*.

Explanation:
* Converting all categorical features with one hot encoding.
* All numeric features will be scaled for better learning.


In [None]:
class Q5_c:
    """One-hot encodes NN_data and standardises its numeric input features."""

    def run(self, df:pd.DataFrame):
        """Encode `df` and scale the numeric inputs.

        Sets `NN_data_encoded` (the encoded frame) and `encoded_inputs` (all
        encoded column names except the Q5_b target features). Returns self.
        """
        self.NN_data_encoded = pd.get_dummies(df)                        # to perform one-hot encoding
        # .items() replaces .iteritems(), which no longer exists in pandas 2.x;
        # the pairs are materialised first because columns are reassigned in the loop.
        for (name, vals) in list(self.NN_data_encoded.items()):
            if name in Q5_b.input_features and pd.api.types.is_numeric_dtype(vals.dtype):
                scaled = StandardScaler().fit_transform(vals.values.reshape(-1,1))
                # the scaler returns an (n, 1) array; flatten it to a plain column
                self.NN_data_encoded[name] = scaled.ravel()
        self.encoded_inputs = [v for v in self.NN_data_encoded.columns.values if v not in Q5_b.target_features]
        return self


q5_c = Q5_c().run(NN_data)
NN_data_encoded = q5_c.NN_data_encoded
display(NN_data_encoded)

   (d) Create a training and test set with 90% of the rows of your *NN_data_encoded* data set for training and 10% as test data set. Name them *train_NN* and *test_NN*

In [None]:
class Q5_d:
    """Splits the encoded data into 90% training rows and 10% test rows."""

    def run(self, df):
        """Sample the training rows and keep the remaining rows as test data.

        Sets `train_NN` and `test_NN` and returns self.
        """
        # draw 90% of the rows, reproducibly via the notebook seed
        training_rows = df.sample(frac=0.9, random_state=rand_seed)
        # every row whose index label was not drawn forms the test part
        remaining_rows = df.drop(index=training_rows.index)

        self.train_NN = training_rows
        self.test_NN = remaining_rows
        return self


q5_d = Q5_d().run(NN_data_encoded)
train_NN, test_NN = q5_d.train_NN, q5_d.test_NN

   (e) Train two different Neural Networks, one with a linear activation function and one with a non-linear activation function. All other settings stay default. Give the accuracy of each Neural Network for the training and test set (*train_NN* and *test_NN*. Which activation function seems to be better?

In [None]:
class Q5_e:
    """Trains one neural network per activation function and compares their accuracies."""

    # (activation name understood by MLPClassifier, function type shown in the result table)
    activation_functions = [("identity", "linear"),
                            ("logistic", "non-linear"),
                            ("tanh", "non-linear"),
                            ("relu", "non-linear")]

    def run(self, train, test, inp, target):
        """Train all networks and collect their scores.

        train / test: data frames holding inputs and targets
        inp:          names of the input columns
        target:       names of the target columns
        Sets `models`, `predicts` (test-set predictions per model) and `scores`
        (one table row per activation function). Returns self.
        """
        activations = Q5_e.activation_functions

        # training one model per activation function and storing all models
        self.models = [Q5_e.train_model(train, inp, target, activ) for activ, _ in activations]

        # prediction of each model for the test set
        self.predicts = [model.predict(test[inp]) for model in self.models]

        # scores of each model on test and training data
        acc_scores_test = [MultiClassAccs(test[target], prediction) for prediction in self.predicts]
        acc_scores_train = [MultiClassAccs(train[target], model.predict(train[inp])) for model in self.models]
        self.scores = pd.DataFrame({'activation function':[a[0] for a in activations],
                                    'function type':[a[1] for a in activations],
                                    'score on test data':acc_scores_test,
                                    'score on training data':acc_scores_train}).set_index('activation function')
        return self

    @staticmethod
    def train_model(train, inp, target, activ, **params):
        """Fit and return an MLPClassifier.

        activ:  activation function name
        params: further MLPClassifier keyword arguments
        The notebook seed is always used as random_state. Declared static because
        it takes no instance; other cells call it as `Q5_e.train_model(...)`.
        """
        model = MLPClassifier(random_state=rand_seed, activation=activ, **params)
        model.fit(train[inp], train[target])
        return model


q5_e = Q5_e().run(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features)
q5_e.scores

Explanation: RELU and tanh activation functions performed best on the test data, with exactly the same scores. However, tanh performed slightly worse on the training data, which may be an indicator of a reduced tendency to overfit compared to the relu function. 

   (f) Based on your result of (e) train 2 more Neural Networks with different settings (change at least 4 parameters (2 each)). Explain your parameters and the choice of the activation function. Evaluate the different Neural Networks with your test set by giving the accuracy. Try to increase the accuracy and analyse the factors that prohibit better accuracy.

In [None]:
# Note: this class is used to select the best structure of the neural network by running many 
# possibilities (combinations). After obtaining the best neural network structure, we use it in further sections, 
# no need run each time

class BruteForce_HL:
    """Exhaustive search for the hidden-layer structure with the best accuracy.

    Every candidate network is trained on `train` and scored on `test`, i.e.
    the structure is selected using the test data.
    """

    def __init__(self, train, test, inp, target, activation, lri):
        self.train = train              # training data frame
        self.test = test                # data frame used for scoring the candidates
        self.inp = inp                  # names of the input columns
        self.target = target            # names of the target columns
        self.activation = activation    # activation function used for every candidate
        self.lri = lri                  # passed to the network as learning_rate_init

    def run(self, mn, mx, depth, best_acc= 0, top_layers=tuple()):
        """Try every layer width in range(mn, mx) for up to `depth` further layers.

        mn, mx:     bounds of the layer widths tried (mx is exclusive)
        depth:      how many more layers may be appended to `top_layers`
        best_acc:   accuracy a candidate has to beat (used by the recursion)
        top_layers: fixed layer sizes that precede the layer varied here
        Returns (best accuracy, best layer tuple); the tuple is None when no
        candidate of this call was accepted. Each improvement is printed.
        The number of trained networks grows exponentially with `depth`.
        """
        best_layers =  None
        # score of a fitted model on the test data
        acc = lambda x: MultiClassAccs(self.test[self.target], x.predict(self.test[self.inp]))
        for ln in range(mn,mx):
            layers = top_layers + (ln,)
            model = Q5_e.train_model(self.train, self.inp, self.target, self.activation, learning_rate_init=self.lri , hidden_layer_sizes=layers)
            t_acc = acc(model) 
            # accept on a strictly better score; on a tie with a structure found in this
            # call, prefer fewer layers and, for equally many layers, fewer neurons in total
            if t_acc > best_acc or (t_acc == best_acc and best_layers and (len(layers) < len(best_layers) or (len(layers) == len(best_layers) and sum(layers) < sum(best_layers)))):
                best_acc, best_layers = t_acc, layers
                print(best_acc, best_layers)
            if depth > 1:
                # search the structures that extend the current layers by further layers
                o_acc, o_tuple = self.run(mn,mx,depth-1, best_acc, layers)
                # same acceptance rule as above, applied to the best deeper structure
                if o_tuple and (o_acc > best_acc or (o_acc == best_acc and best_layers and (len(o_tuple) < len(best_layers) or (len(o_tuple) == len(best_layers) and sum(o_tuple) < sum(best_layers))))):
                    best_acc, best_layers = o_acc, o_tuple
                    print(best_acc, best_layers)
        return best_acc, best_layers

#BruteForce_HL(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features).run(15,25,3)


In [None]:
#BruteForce_HL(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features, "tanh", 0.001).run(15,25,3)

In [None]:
#BruteForce_HL(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features, "tanh", 0.0005).run(15,25,8)

In [None]:
#BruteForce_HL(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features, "tanh", 0.002).run(15,18,8)

In [None]:
# Note: this class is used to select the best values of the neural network by running many possibilities (combinations). 
# no need run each time

class BruteForce_Numeric:
    """Searches the best value of a single network parameter for a fixed structure."""

    def __init__(self, train, test, inp, target, activation):
        # data frames, column names and activation function used for every candidate
        self.train, self.test = train, test
        self.inp, self.target = inp, target
        self.activation = activation

    def run(self, param_name,rng ):
        """Train one network per value in `rng` and return (best value, best accuracy).

        param_name: keyword argument of the network that is varied
        rng:        iterable of candidate values
        Candidates are scored on the test data; (None, 0) is returned when no
        candidate scores above 0.
        """
        winner, winner_acc = None, 0
        for candidate in rng:
            network = Q5_e.train_model(self.train, self.inp, self.target, self.activation,
                                       hidden_layer_sizes=(20,24,16), **{param_name: candidate})
            candidate_acc = MultiClassAccs(self.test[self.target], network.predict(self.test[self.inp]))
            if not candidate_acc > winner_acc:
                continue
            winner, winner_acc = candidate, candidate_acc

        return winner, winner_acc

In [None]:
# Note:  this class is used to select the best parameters of the neural network by running many possibilities (combinations). 
# no need run each time

import decimal

def float_range(start, stop, step):
    """Yield start, start + step, start + 2*step, ... while the value is below `stop`.

    Each value is computed as `start + i * step` instead of repeatedly adding
    `step`, so rounding errors do not accumulate (repeated addition makes
    float_range(0.0, 1.0, 0.1) yield an extra 0.9999999999999999).

    Raises ValueError (when the generator is first advanced) if `step` is not
    positive, because such a range would never reach `stop`.
    """
    if step <= 0:
        raise ValueError("float_range() step must be positive")
    index = 0
    value = start
    while value < stop:
        yield value
        index += 1
        value = start + index * step

class BruteForce_Params:
    """Searches, per activation function, the best value of several network parameters."""

    # (parameter name, candidate values). The learning rates are stored as a list:
    # a generator would be used up after the first activation function, and every
    # later one would then report "None" with an accuracy of 0.
    params = [('solver', ['lbfgs', 'sgd', 'adam']),
              ('learning_rate', ['constant','invscaling','adaptive']),
              ('learning_rate_init', list(float_range(0.00001, 0.001, 0.00001)))]

    def __init__(self, train, test, inp, target):
        self.train = train      # training data frame
        self.test = test        # data frame used for scoring
        self.inp = inp          # names of the input columns
        self.target = target    # names of the target columns

    def run(self, activ_functs):
        """Print the best value of every parameter in `params` for each activation function.

        activ_functs: iterable of activation function names.
        Each parameter is searched on its own by BruteForce_Numeric.
        """
        for activ in activ_functs:
            print("#####################")
            print(activ)
            runner = BruteForce_Numeric(self.train, self.test, self.inp, self.target, activ)
            for p_name, p_rng in BruteForce_Params.params:
                print("Best value for {} is <<{}>> with an accuracy of {}".format(p_name, *runner.run(p_name, p_rng)))

In [None]:
#BruteForce_Params(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features).run(['relu', 'tanh'])

In [None]:
class Q5_f:
    """Trains the two final tanh networks and prints their test accuracies."""

    def run(self, train, test, inp, target):
        """Fit both models, predict the test set and print the accuracies.

        Sets `model1`, `model2`, `model1_pred`, `model2_pred` and `true_vals`
        (the target columns of `test`). Returns self.
        """
        # final models with the chosen parameters and layer structures
        self.model1 = Q5_e.train_model(train, inp, target, "tanh", learning_rate_init=0.0005, hidden_layer_sizes=(16,15,22))
        self.model2 = Q5_e.train_model(train, inp, target, "tanh", learning_rate_init=0.002, hidden_layer_sizes=(15, 15, 16, 16, 15, 15, 17, 17))

        # predictions of the trained models for the test set
        self.model1_pred = self.model1.predict(test[inp])
        self.model2_pred = self.model2.predict(test[inp])

        # accuracy over all targets at once, then per target column
        self.true_vals = test[target]
        for name, predicted_vals in [("Model 1", self.model1_pred), ("Model 2", self.model2_pred)]:
            print("{} combination acc: {}".format(name, metrics.accuracy_score(self.true_vals, predicted_vals)))
            # .items() replaces .iteritems(), which no longer exists in pandas 2.x
            for i, (col, vals) in enumerate(self.true_vals.items()):
                # column i of the prediction belongs to the i-th target column
                print("\t{} acc: {}".format(col, metrics.accuracy_score(vals, [a[i] for a in predicted_vals])))

        return self


q5_f = Q5_f().run(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features)

In [None]:
# classification report for both models

# one report per final model, both against the same true test values
for pred in [q5_f.model1_pred, q5_f.model2_pred]:
    report = metrics.classification_report(
        q5_f.true_vals,
        pred,
        target_names=['Green', 'Tree', 'Brown'],
    )
    print(report)


In [None]:
# For 4 different structures of neural networks and activation functions,   ('relu',(20,24,16)),
#                                                                           ('relu',(100,)), 
#                                                                           ('tanh',(15,18,22)),  
#                                                                           ('tanh',(100,))
# acccuracy of the models (y axis) are plotted against the learning rate (x axis)

class Q5_visualize_score_steps:
    """Plots the test accuracy of a network against its initial learning rate."""

    def __init__(self, train, test, inp, target):
        self.train = train      # training data frame
        self.test = test        # data frame used for scoring
        self.inp = inp          # names of the input columns
        self.target = target    # names of the target columns

    def run(self, lfs):
        """Draw one plot per (activation function, hidden layer sizes) pair in `lfs`."""
        for activ, layers in lfs:
            self.run_model(activ,activ,layers)

    def run_model(self, name, activ_func, layers):
        """Train one network per learning rate and plot the resulting accuracies.

        name:       label used in the plot title
        activ_func: activation function of the networks
        layers:     hidden layer sizes of the networks
        The learning rate runs from 0.001 up to (excluding) 0.1 in steps of 0.001.
        """
        records = list()
        for v in float_range(0.001, 0.1, 0.001):
            m = Q5_e.train_model(self.train, self.inp, self.target, activ_func, learning_rate_init=v, hidden_layer_sizes=layers)
            a = MultiClassAccs(self.test[self.target], m.predict(self.test[self.inp]))
            records.append({'learning_rate_init': v,
                            'combined': a.total_combination_acc,
                            'green frogs': a.individual_accs[Dataset.Cols.green_frogs],
                            'tree frogs': a.individual_accs[Dataset.Cols.tree_frogs],
                            'brown frogs': a.individual_accs[Dataset.Cols.brown_frogs]})

        curves = ['combined', 'green frogs', 'tree frogs', 'brown frogs']
        df = pd.DataFrame(records, columns=['learning_rate_init'] + curves)

        # the title tells the otherwise identical-looking figures apart
        df.plot(kind='line', x='learning_rate_init', y=curves, title='{} {}'.format(name, layers))


_ = Q5_visualize_score_steps(train_NN, test_NN, q5_c.encoded_inputs, Q5_b.target_features).run([('relu',(20,24,16)),
                                                                                                ('relu',(100,)),
                                                                                                ('tanh',(15,18,22)),
                                                                                                ('tanh',(100,))])

Explanation: We tried to optimize various parameters to get a high accuracy with the tanh function, as it looked like it would perform similarly to the relu function, but with a reduced tendency to overfit the data. We set various parameters and then determined the best number of hidden layers for accuracy. Halving the initial learning rate provided an additional 6% accuracy on green frogs, while not impacting any of the other classes. We could probably have scored an even higher accuracy by a) trying more hidden layers and layer sizes and b) searching through parameter values programmatically in combination with the hidden layers. However, this would cause the search space to explode. It is also notable that the performance of the model was mostly impacted by the number of hidden layers, rather than the size of those.

## Question 6 - Evaluation (10 Points)

   (a) Consider two models of question 5 of your choice with the respective datasets (training and test data).

- Create training and test data for 3-fold cross validation.

In [None]:
# creating training and test data for both models

class Q6:
    """Data and models shared by the evaluation cells of question 6."""

    # temporary shorthands, removed again at the end of the class body
    _inputs = q5_c.encoded_inputs
    _targets = Q5_b.target_features

    # complete encoded data set, used for the 3-fold cross validation
    X = NN_data_encoded[_inputs]
    y = NN_data_encoded[_targets]

    # the 90/10 split of question 5, in the order: X train, X test, y train, y test
    tts = TTSGroup(
        train_NN[_inputs],
        test_NN[_inputs],
        train_NN[_targets],
        test_NN[_targets],
    )

    # the two final networks of question 5 (f) with their display names
    models = [
        ("Model 1", q5_f.model1),
        ("Model 2", q5_f.model2),
    ]

    del _inputs, _targets


- Print confusion matrices on the training data and the cell-by-cell summation of the results.

In [None]:
# printing confusion matrices

# For every model: one confusion matrix per target column on the training data,
# followed by the cell-by-cell sum of these matrices.
for name, model in Q6.models:
    print('###############')
    print('{}:'.format(name))
    pred = model.predict(Q6.tts.X.train)
    all_conf = list()
    # .items() replaces .iteritems(), which no longer exists in pandas 2.x
    for i, (target, vals) in enumerate(Q6.tts.y.train.items()):
        # column i of the prediction belongs to the i-th target column
        matrix = metrics.confusion_matrix(vals, [x[i] for x in pred])
        print('Confusion Matrix for {}:\n{}'.format(target, matrix))
        all_conf.append(matrix)
    print('Total sum of all targets:\n{}'.format(sum(all_conf)))


- Print the cell-by-cell summation of the confusion matrices on the test data.

In [None]:
# cell-by-cell summation of confusion matrices

# For every model: cell-by-cell sum of the per-target confusion matrices on the test data.
for name, model in Q6.models:
    print('###############')
    print('{}:'.format(name))
    pred = model.predict(Q6.tts.X.test)
    all_conf = list()
    # .items() replaces .iteritems(), which no longer exists in pandas 2.x
    for i, (target, vals) in enumerate(Q6.tts.y.test.items()):
        # column i of the prediction belongs to the i-th target column
        matrix = metrics.confusion_matrix(vals, [x[i] for x in pred])
        all_conf.append(matrix)
    print('Total sum of all targets:\n{}'.format(sum(all_conf)))

- Print the precision, recall and f1-scores on the test data for each fold and model. Give the unaggregated results.

In [None]:
# printing precision, recall, and f1-scores on the test data for each fold and model

class Q6_kfold:
    """Prints precision, recall and f1-score for every fold of a 3-fold cross validation."""

    def run(self, models, X, y):
        """Cross-validate every model and print one classification report per fold.

        models: iterable of (name, model) pairs; each model is refitted in place on every fold
        X, y:   inputs and targets of the complete data set
        Returns self.
        """
        for mod_name, model in models:
            print("###################################\nModel {}".format(mod_name))
            # Three consecutive, unshuffled folds. No random_state is passed: it has no
            # effect without shuffle=True and newer scikit-learn versions reject the combination.
            kf = KFold(n_splits=3)
            for i, (train_index, test_index ) in enumerate(kf.split(X)):
                X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
                y_train , y_test = y.iloc[train_index] , y.iloc[test_index]
                model.fit(X_train,y_train)
                pred_values = model.predict(X_test)
                print("----\nFold {}".format(i+1))                              # printing classification report foldwise
                print(metrics.classification_report(y_test, pred_values, target_names=['Green', 'Tree', 'Brown']))

        return self


q6_kfold = Q6_kfold().run(Q6.models, Q6.X, Q6.y)

- Compute accuracy scores on training and test data (give explicitly the result for each fold).

In [None]:
# computing accuracy scores on training data and test data for both models

class Q6_accs:
    """Prints training and test accuracy for every fold of a 3-fold cross validation."""

    def run(self, models, X, y):
        """Cross-validate every model and print its accuracies fold by fold.

        models: iterable of (name, model) pairs; each model is refitted in place on every fold
        X, y:   inputs and targets of the complete data set
        Returns self.
        """
        for mod_name, model in models:
            print("###################################\nModel: {}".format(mod_name))
            # Three consecutive, unshuffled folds. No random_state is passed: it has no
            # effect without shuffle=True and newer scikit-learn versions reject the combination.
            kf = KFold(n_splits=3)
            train_accs = list()
            test_accs = list()
            for i, (train_index, test_index ) in enumerate(kf.split(X)):        # for giving result for each fold explicitly
                X_train , X_test = X.iloc[train_index,:], X.iloc[test_index,:]
                y_train , y_test = y.iloc[train_index] , y.iloc[test_index]
                model.fit(X_train,y_train)
                train_accs.append(MultiClassAccs(y_train, model.predict(X_train)))
                test_accs.append(MultiClassAccs(y_test, model.predict(X_test)))

            Q6_accs.print_fold_accs("training", train_accs)
            Q6_accs.print_fold_accs("testing", test_accs)
        return self

    @staticmethod
    def print_fold_accs(name, fold_accs):
        """Print the accuracies in `fold_accs`, one line per fold, under a heading naming the data.

        Declared static because it takes no instance.
        """
        print("---------")
        print("Accuracy on {} data:".format(name))
        for i, mc_acc in enumerate(fold_accs):
            print("\t Fold {}: {}".format(i+1, mc_acc))


q6_accs = Q6_accs().run(Q6.models, Q6.X, Q6.y)

In order to turn numbers into insights, please comment on your findings. Motivate the answers to the following questions using the metrics and the findings in the **questions 2 through 5** of the assignment.

   (b) What is, in your opinion, the best model? Motivate your answer with the findings above.

Explanation: The neural network, while having a good average score, seems to be too unreliable for this case. Optimising one parameter to get a good score on one target feature impacts at least one of the other target scores negatively. This is why we would choose multiple SVMs (one per target), as they can be optimised truly independently for the rather small provided dataset.

   (c) Does any model suffer from underfitting or overfitting? Motivate your answer with the findings above.

Explanation: As stated in question 2, tree2 suffers from overfitting. The second neural network is also more prone to overfitting, as we see in the second fold, where differences between training and test data accuracies are between 15% and 20%.

## Question 7 - Clustering (8 points): 

(a) For this question, use the extracted data set you created in the preprocessing step (sampled_data). Drop all the columns except "VegetationR", "UseR", "FishingR", "RoadDistanceR", "BuildingR", and "PollutionR". Use a dendrogram to get an overview of the clusters that you can extract for the remaining columns.

In [None]:
# keeping only the desired columns of sampled_data (all other columns are dropped)

class Q7:
    """Restricts the sampled data to the columns used for clustering."""

    # the only columns that are kept
    columns = [
        Dataset.Cols.vegetationR,
        Dataset.Cols.useR,
        Dataset.Cols.fishingR,
        Dataset.Cols.roadDistanceR,
        Dataset.Cols.buildingR,
        Dataset.Cols.pollutionR,
    ]

    def run(self, base_df):
        """Store the selected columns of `base_df` in `self.df`; returns self."""
        kept = base_df[Q7.columns]
        self.df = kept
        return self


q7 = Q7().run(Dataset.load_sampled())

In [None]:
class Q7_a:
    """Normalises the clustering data."""

    def run(self, df):
        """Normalise `df` and store the result, with the same column names, in `self.scaled`.

        `normalize` is called with its default settings, i.e. it rescales each
        row (sample) to unit norm rather than each column. The result gets a
        fresh default index. Returns self.
        """
        self.scaled = pd.DataFrame(normalize(df), columns=df.columns)
        return self


q7_a = Q7_a().run(q7.df)

In [None]:
# dendogram to find the overview of the clusters 

# Ward-linkage dendrogram of the normalised data, drawn on an explicit axes object
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Dendrogram")
ax.set_xlabel('instances / objects (row indices)')
ax.set_ylabel('dissimilarity')
_ = shc.dendrogram(shc.linkage(q7_a.scaled, method='ward'), ax=ax)

(b) What does the vertical and horizontal axis show in the dendogram? Why the distance between the clusters in the dendogram generally decreases, when we go from top to down in the dendogram?

 Explanation: The vertical axis of this dendrogram represents the distance or dissimilarity between individual data points or clusters. The horizontal axis represents instances/objects (row indices).
 
When we go from top to bottom in the dendrogram, the distance between the clusters generally decreases, since the dissimilarity between the clusters decreases, i.e. similar clusters tend to be closer to each other than dissimilar ones.

(c) Split the diagram at 3 and find the number the clusters at this point.

In [None]:
# splitting dendogram at 3

# Same dendrogram as before, with a dashed horizontal cut line at dissimilarity 3
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title("Dendrogram")
ax.set_xlabel('instances / objects (row indices)')
ax.set_ylabel('dissimilarity')
ax.axhline(y=3, color='b', linestyle='--')
_ = shc.dendrogram(shc.linkage(q7_a.scaled, method='ward'), ax=ax)

Explanation: In the figure above, the dendrogram is split at 3 and three vertical lines can be observed at the split. 
Therefore, there are THREE clusters at this point.

(d) Using agglomerative clustering with the number of clusters found in the previous section and a scatter diagram, show the discovered cluster for "VegetationR" and "UseR" in different colors. 

In [None]:
# agglomerative clustering with 3 clusters

class Q7_d:
    """Agglomerative clustering into three clusters, shown as a scatter plot of two columns."""

    def run(self, df, xax, yax):
        """Cluster the rows of `df` and plot column `xax` against column `yax`.

        df:       data frame to cluster (all of its columns are used for the clustering)
        xax, yax: names of the columns drawn on the horizontal / vertical axis
        Returns self.
        """
        # Ward linkage always works on euclidean distances, so the distance measure is
        # left at its default; the explicit `affinity` keyword is gone in newer scikit-learn.
        cluster = AgglomerativeClustering(n_clusters = 3, linkage = 'ward')
        labels = cluster.fit_predict(df)

        plt.figure(figsize = (15, 7))
        colours = ListedColormap(['r','b','g'])         # colours for clusters
        clusters = ['1', '2', '3']                      # for legend (shows cluster number)
        # plot the requested columns instead of hard-coded column names
        scatter = plt.scatter(df[xax], df[yax], c = labels, cmap = colours)    # scatter plot
        plt.xlabel(str(xax))
        plt.ylabel(str(yax))
        plt.legend(handles = scatter.legend_elements()[0], labels = clusters)
        plt.show()
        return self


q7_d = Q7_d().run(q7_a.scaled, Dataset.Cols.vegetationR, Dataset.Cols.useR)

Explanation: In the figure above, three clusters can be seen (red, blue, and green) for "VegetationR" and "UseR". 