# Anomaly Detection Challenge 1
## Miguel Sandim and Paula Fortuna

# 1 - Data Reading

### 1.1 - Solve format problems

In a text editor (e. g. sublime) use regex and surround text with "
To match the first one use this (dont forget to remove the one that appears also in the begining of the sentence, and the one in the header):

In [None]:
^[^;]*;[^;]*;[^;]*;

Use this to find the last:

In [None]:
;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*$ 

In [19]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Random libraries and seeds:
import random
random.seed(2)
np.random.seed(2)

# read from csv


train_df = pd.read_csv("data/yelp_data_train.dat", sep = ';', encoding = 'utf-8')
#test_df = pd.read_csv("data/yelp_data_test.dat")
#reviewers_df = pd.read_csv("data/yelp_data_reviewer.dat")
#hotels_df = pd.read_csv("data/yelp_data_hotel.dat")

In [21]:
train_df.head

<bound method NDFrame.head of                       datereviewID              reviewerID  \
9/16/2010                       Ol  nf3q2h-kSQoZK2jBY92FOg   
2/5/2010    i4HIAcNTjabdpG1K4F5Q2g  Sb3DJGdZ4Rq__CqxPbae-g   
8/9/2010            veKKNAaSKWj8os  nR7zLyFOlzAYqmzgJ3DtXg   
8/11/2012   6c-ZiQkHXtp1n6VfiKDQ3g  747lP4p8dUD6RTkcsIaSGg   
7/9/2012    POWQ6FuUf3oe2ZkhmHvciA  Ij5t6VdwtasSkrpp9uAbKg   
6/19/2012   QBynYcLgIgtAd-YfnrrAtA  hSERzClUe57bCw3nCp4plA   
9/14/2012                   ELY3TK  OMm2VcGks3QL0p0n3_kPFw   
3/20/2012   uWKWYb5vDpeDGEAZUc192g  yevHGEUQQmnVlBXIrJ885A   
3/3/2012    hkt7Dnr7kRnLLd9pm-fxDw  Lql1_3zeGlny_Tgq4MI6Fg   
6/18/2012   ZlexD7XvkqH8yve4zCAR7g  RtyDimVdIBwjGdQr0dti1w   
3/13/2012   Rw1JmyRyyjoACCUFvmS9kQ  hCIJT7tIhPX_YZBCPhYhMg   
5/24/2010                O9chyjQi5  nKgjmPhPPiJ8BL97dO76XA   
10/25/2010  x4FHvju16JpVa3ihzIwQvw  IhKctrZ3BtJkfpf0qO-8mQ   
2/13/2011                      NaN  yoB_PYQHjnPjh78ATA0Jgw   
8/9/2011         9CPWfP7Ibj-2TthBN  b8B2

In [None]:
train_df.iloc[:,36].value_counts()

In [None]:
479/(3956 + 479)

## About 10% of our dataset are anomalous cases

# Global Variables 

In [None]:
Y_train = train_df[len(train_df.columns)-1]
X_train = train_df.drop(len(train_df.columns)-1,axis=1)
num_rows_X_train = X_train[0].count()
num_columns_X_train = len(X_train.columns)

# Utils Functions

In [None]:
# From: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

######################################
# Function Save Data To CSV
######################################

def saveDataToCSV(Y_pred):
    id_list = range(1, len(Y_pred)+1)
    submission = pd.DataFrame({
        "Id": id_list,
        "Expected": Y_pred
    })
    submission = submission[['Id', 'Expected']]
    submission.to_csv('submission.csv', index=False)
    return


# 2 - Data Pre-Processing

## 2.1 - Missing Values

In [None]:
import scipy.stats as stats
import numpy.ma as ma
import math

######################################
# Strategies based on constants
######################################

#replace by zero - black colour in RGB
def replaceByZero(df):
    return np.nan_to_num(df)

#replace by 255 - white colour in RGB
def replaceBy255(df):
    return df.fillna(255)

#####################################
# Strategies based on columns values
#####################################

#column minimum
def replaceByColumnMinimum(df):
    return df.fillna(df.min())

#column maximum
def replaceByColumnMaximum(df):
    return df.fillna(df.max())

#column mean
def replaceByColumnMean(df):
    return df.fillna(df.mean())

#column median
def replaceByColumnMedian(df):
    return df.fillna(df.median())

#####################################
# Strategies based on rows values
#####################################

# in the analysis of the rows we have to take into account that each four consecutive rows describe a pixel.
# Each of these four rows stands for: Red, Green, IR, IR. |R|G|IR1|IR2|
# These values refer to different things, and therefore are analysed independently.
# this will allow to consider more the specificities of the problem

# divide the data into four datasets, corresponding to each type of values 

#general function to gather columns for each type |R|G|IR1|IR2| using the mod operator
def separatePixelColumns(position, df):
    indexes = range(0, num_columns_X_train-1)
    indexes = [x for x in indexes if x % 4 == position]
    df_p_attribute = df.iloc[:,indexes]
    return df_p_attribute

#general function to fill missing values based on the rows
#Note that does not make sense to consider the four values |R|G|IR1|IR2|, because they refer to different properties
def fillMissingValuesByRow(df, function):
    for index, row in df.iterrows():
        value_without_nan = function(row)
        nan_positions = row.isnull()
        row[nan_positions] = value_without_nan
    return df

#row spectral mean
def replaceByRowMean(df):
    return fillMissingValuesByRow(df, np.nanmean)

#row spectral median
def replaceByRowMedian(df):
    return fillMissingValuesByRow(df, np.nanmedian)

#row spectral minimum
def replaceByRowMinimum(df):
    return fillMissingValuesByRow(df, np.nanmin)

#row spectral maximum
def replaceByRowMaximum(df):
    return fillMissingValuesByRow(df, np.nanmax)

########################################
# Strategies based on data distribution
########################################

#consider the distribution of the spectral values of each type. 
#Get random value from the spectral values of same type

    
def getRandomNumberFromDataframe(df):
    while True:
        row = df.sample(1, random_state = 2)
        values = row.values[0]
        value = random.choice(values)
        if not math.isnan(value):
            break
    return value
        
def fillMissingValuesWithDistribution(df):
    for index, row in df.iterrows():
        nan_positions = row.isnull()
        for i in range(len(nan_positions)): 
            if nan_positions.iloc[i] == True:
                value = getRandomNumberFromDataframe(df)
                row.iloc[i] = value
    return df

#########################################%%%
# fill Missing Values Considering Spectral
#########################################%%%

def fillMissingValuesBySpectral(df, function):

    #generate four new datasets with the columns of each type
    df_p_attribute_R = separatePixelColumns(0, df)
    df_p_attribute_G = separatePixelColumns(1, df)
    df_p_attribute_IR1 = separatePixelColumns(2, df)
    df_p_attribute_IR2 = separatePixelColumns(3, df)

    #apply function to each of the 4 datasets
    df_p_attribute_R = function(df_p_attribute_R)
    df_p_attribute_G = function(df_p_attribute_G)
    df_p_attribute_IR1 = function(df_p_attribute_IR1)
    df_p_attribute_IR2 = function(df_p_attribute_IR2)

    df = pd.concat(
                        [df_p_attribute_R, 
                         df_p_attribute_G,
                         df_p_attribute_IR1,
                         df_p_attribute_IR2], 
                        axis=1
                        )
    return df

In [None]:
#######################
#   CHAMADAS
#######################

#X_train = replaceByZero(X_train)
#test_df = replaceByZero(test_df)

#X_train = replaceBy255(X_train)
#test_df = replaceBy255(test_df)

#X_train = replaceByColumnMinimum(X_train)
#test_df = replaceByColumnMinimum(test_df)

#X_train = replaceByColumnMaximum(X_train)
#test_df = replaceByColumnMaximum(test_df)

#X_train = replaceByColumnMean(X_train)
#test_df = replaceByColumnMean(test_df)

#X_train = replaceByColumnMedian(X_train)
#test_df = replaceByColumnMedian(test_df)

#X_train = fillMissingValuesBySpectral(X_train, replaceByRowMean)
#test_df = fillMissingValuesBySpectral(test_df, replaceByRowMean)

#X_train = fillMissingValuesBySpectral(X_train, replaceByRowMedian)
#test_df = fillMissingValuesBySpectral(test_df, replaceByRowMedian)

#X_train = fillMissingValuesBySpectral(X_train, replaceByRowMinimum)
#test_df = fillMissingValuesBySpectral(test_df, replaceByRowMinimum)

#X_train = fillMissingValuesBySpectral(X_train, replaceByRowMaximum)
#test_df = fillMissingValuesBySpectral(test_df, replaceByRowMaximum)

#X_train = fillMissingValuesBySpectral(X_train, fillMissingValuesWithDistribution)
#test_df = fillMissingValuesBySpectral(test_df, fillMissingValuesWithDistribution)

datasets = {f_name: {"train": f(X_train.copy()), "test": f(test_df.copy())} for f_name, f in [
        ("01-zero", replaceByZero),
        ("02-255", replaceBy255),
        ("03-col-min", replaceByColumnMinimum),
        ("04-col-max", replaceByColumnMaximum),
        ("05-col-mean", replaceByColumnMean),
        ("06-col-median", replaceByColumnMedian),
        ("07-spec-mean", lambda data: fillMissingValuesBySpectral(data, replaceByRowMean)),
        ("08-spec-median", lambda data: fillMissingValuesBySpectral(data, replaceByRowMedian)),
        ("09-spec-min", lambda data: fillMissingValuesBySpectral(data, replaceByRowMinimum)),
        ("10-spec-max", lambda data: fillMissingValuesBySpectral(data, replaceByRowMaximum)),
        ("11-spec-dis", lambda data: fillMissingValuesBySpectral(data, fillMissingValuesWithDistribution))
    ]}



## 2.2 Normalization

There is no need for normalization in this dataset, since all features are between 0 and 255.

# 3 - Model Selection

- First goal: discover which type of analyses works better
- Second Goal: tune the model

## 3.1 - Model selection based on which models do best in CV using default settings:

In [None]:
#inspired in http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

import sklearn.model_selection as mds
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb

# prepare data

#Y_train = train_df[:,-1]
#X_train = train_df[:,:-1]

#Y_train = train_df[len(train_df.columns)-1]
#X_train = train_df.drop(len(train_df.columns)-1,axis=1)

# prepare configuration for cross validation test harness
num_folds = 10
num_instances = len(X_train)

# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
#models.append(('SVM-Linear', SVC(kernel="linear")))
models.append(('SVM-Poly', SVC(kernel="poly")))
models.append(('SVM-RBF', SVC(kernel="rbf")))
models.append(('NN', MLPClassifier(alpha=1))) 
models.append(('RF', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)))
models.append(('AB', AdaBoostClassifier()))
models.append(('XGB', xgb.XGBClassifier()))

# evaluate each model in turn
results = {}
scoring = 'roc_auc' # try with 'roc_auc', f1'

kfold = mds.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2)

for NA_strategy in sorted(datasets.keys()):
    
    results_by_strategy = []
    
    for model_name, model in models:
        cv_results = mds.cross_val_score(model, datasets[NA_strategy]["train"], Y_train, cv=kfold, scoring=scoring)
        results_by_strategy.append({"name": model_name, "cv_results": cv_results, "mean": cv_results.mean(), "std": cv_results.std()})
        #print("%s: %f (%f)" % (model_name, cv_results.mean(), cv_results.std()))
        
    # boxplot algorithm comparison
    fig = plt.figure(figsize=(13, 5), dpi=500)
    fig.suptitle('Algorithm Comparison using \"%s\"' % NA_strategy)
    ax = fig.add_subplot(111)
    plt.boxplot([x["cv_results"] for x in results_by_strategy])
    ax.set_xticklabels([x["name"] for x in results_by_strategy])
    plt.show()
    
    # order the models by the mean auc
    results_by_strategy.sort(key=lambda x: x["mean"], reverse=True)
    print([(x["name"], x["mean"]) for x in results_by_strategy])
    
    results[NA_strategy] = results_by_strategy

### Results sorted by AUC

In [None]:
results_sorted = [(na_method, algorithm["name"], algorithm["mean"]) for na_method in results for algorithm in results[na_method]]
results_sorted.sort(key=lambda x: x[2], reverse=True)
results_sorted

In [None]:
def plotSupervisedAlgorithmsDefault(inf, sup):
    plt.figure(figsize=(13, 7), dpi=500)
    
    # x axis
    labels = [na_method for na_method in results]
    labels.sort()
    plt.xticks(np.arange(len(labels)), labels, rotation='vertical')
    plt.ylim(inf, sup)
    
    # legend:
    algorithm_names = [x["name"] for x in results["01-zero"]] 
    
    [plt.plot([[x["mean"] for x in results[na_method] if x["name"] == alg_name] for na_method in sorted(results)],
              label = alg_name) for alg_name in algorithm_names]
    
    plt.ylabel('AUC')
    plt.xlabel('NA-filling method')
    
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
    
plotSupervisedAlgorithmsDefault(0.69, 1)
plotSupervisedAlgorithmsDefault(0.9, 1)
plotSupervisedAlgorithmsDefault(0.99, 1)

### Based on this plot, we decided to tune XGB and LDA and use 07-spec-mean and 09-spec-min

In [None]:
def plotConfusionMatrixBestXGB():
    kfold = mds.StratifiedKFold(n_splits=2, shuffle=True, random_state=2)
    train, test = kfold.split(datasets["07-spec-mean"]["train"], Y_train)

    lol = xgb.XGBClassifier().fit(datasets["07-spec-mean"]["train"].iloc[train[0]], Y_train[train[0]])

    train1_pred = lol.predict(datasets["07-spec-mean"]["train"].iloc[train[1]])

    plot_confusion_matrix(confusion_matrix(Y_train[train[1]], train1_pred, labels = [0, 1]), classes = [0, 1])
    
plotConfusionMatrixBestXGB()

In [None]:
def plotConfusionMatrixBestLDA():
    kfold = mds.StratifiedKFold(n_splits=2, shuffle=True, random_state=2)
    train, test = kfold.split(datasets["09-spec-min"]["train"], Y_train)

    lol = xgb.XGBClassifier().fit(datasets["09-spec-min"]["train"].iloc[train[0]], Y_train[train[0]])

    train1_pred = lol.predict(datasets["09-spec-min"]["train"].iloc[train[1]])

    plot_confusion_matrix(confusion_matrix(Y_train[train[1]], train1_pred, labels = [0, 1]), classes = [0, 1])
    
plotConfusionMatrixBestLDA()

### 3.1.1 - Tuning of the best models
#### Based on this plot, we decided to tune LDA and XGB
### Tuning XGB

In [None]:
# Adapted from https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
def modelfit(alg, train_predictors, train_target, useTrainCV=True, cv_folds=10, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train_predictors.values, label=train_target.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(train_predictors, train_target, eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(train_predictors)
    dtrain_predprob = alg.predict_proba(train_predictors)[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(train_target.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(train_target, dtrain_predprob))
                    
    #feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')

In [None]:
from sklearn import metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search

def tuneXGB1():
    xgb1 = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5, # This should be between 3-10
        min_child_weight=1, # A smaller value is chosen because it is a highly imbalanced class problem
        gamma=0,
        subsample=0.8, # Typical values range between 0.5-0.9.
        colsample_bytree=0.8, # Typical values range between 0.5-0.9.
        objective= 'binary:logistic',
        #nthread=4,
        scale_pos_weight=1, # Because of high class imbalance
        seed=2)
    
    modelfit(xgb1, datasets["07-spec-mean"]["train"], Y_train)
    
tuneXGB1()

#### This last result seems too good to be truth?!

In [None]:
def tuneXGB2():
    param_test1 = {
        'max_depth': np.arange(3,10,2),
        'min_child_weight': np.arange(1,6,2)
    }
    
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=156, max_depth=5,
                                                      min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                      objective= 'binary:logistic', scale_pos_weight=1, seed=2), 
                            param_grid = param_test1, scoring='roc_auc',iid=False, cv=10)
    
    gsearch1.fit(datasets["07-spec-mean"]["train"], Y_train)
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
tuneXGB2()

#### {'max_depth': 7, 'min_child_weight': 1}

In [None]:
def tuneXGB3():
    param_test1 = {
        'max_depth': [6,7,8],
        'min_child_weight': [1,2,3]
    }
    
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=156, max_depth=5,
                                                      min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                      objective= 'binary:logistic', scale_pos_weight=1, seed=2), 
                            param_grid = param_test1, scoring='roc_auc',iid=False, cv=10)
    
    gsearch1.fit(datasets["07-spec-mean"]["train"], Y_train)
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
tuneXGB3()

In [None]:
def tuneXGB4():
    param_test1 = {
        'gamma':[i/10.0 for i in np.arange(0,5)]
    }
    
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=156, max_depth=7,
                                                      min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                      objective= 'binary:logistic', scale_pos_weight=1, seed=2), 
                            param_grid = param_test1, scoring='roc_auc',iid=False, cv=10)
    
    gsearch1.fit(datasets["07-spec-mean"]["train"], Y_train)
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
tuneXGB4()

#### gamma = 0.0

In [None]:
def tuneXGB5():
    xgb1 = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=7, # This should be between 3-10
        min_child_weight=1, # A smaller value is chosen because it is a highly imbalanced class problem
        gamma=0,
        subsample=0.8, # Typical values range between 0.5-0.9.
        colsample_bytree=0.8, # Typical values range between 0.5-0.9.
        objective= 'binary:logistic',
        #nthread=4,
        scale_pos_weight=1, # Because of high class imbalance
        seed=2)
    
    modelfit(xgb1, datasets["07-spec-mean"]["train"], Y_train)
    
tuneXGB5()

In [None]:
def tuneXGB6():
    param_test1 = {
     'subsample': np.arange(0.6, 1.0, 0.1),
     'colsample_bytree': np.arange(0.6, 1.0, 0.1)
    }
    
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=156, max_depth=7,
                                                      min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                      objective= 'binary:logistic', scale_pos_weight=1, seed=2), 
                            param_grid = param_test1, scoring='roc_auc',iid=False, cv=10)
    
    gsearch1.fit(datasets["07-spec-mean"]["train"], Y_train)
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
tuneXGB6()

#### subsample = 0.6 and colsample_bytree = 0.9

In [None]:
def tuneXGB7():
    param_test1 = {
     'subsample': np.arange(0.55, 0.7, 0.05),
     'colsample_bytree': np.arange(0.85, 1.0, 0.05)
    }
    
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=156, max_depth=7,
                                                      min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                      objective= 'binary:logistic', scale_pos_weight=1, seed=2), 
                            param_grid = param_test1, scoring='roc_auc',iid=False, cv=10)
    
    gsearch1.fit(datasets["07-spec-mean"]["train"], Y_train)
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
tuneXGB7()

#### better tuned: subsample=0.55 and colsample_bytree=0.85

In [None]:
def tuneXGB8():
    param_test1 = {
     'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    }
    
    gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=156, max_depth=7,
                                                      min_child_weight=1, gamma=0, subsample=0.55, colsample_bytree=0.85,
                                                      objective= 'binary:logistic', scale_pos_weight=1, seed=2), 
                            param_grid = param_test1, scoring='roc_auc',iid=False, cv=10)
    
    gsearch1.fit(datasets["07-spec-mean"]["train"], Y_train)
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
tuneXGB8()

#### Reg alpha = 1e-5

In [None]:
def tuneXGB9():    
    xgb1 = xgb.XGBClassifier( learning_rate=0.01, n_estimators=5000, max_depth=7,
                      min_child_weight=1, gamma=0, subsample=0.55, colsample_bytree=0.85,
                      reg_alpha=1e-5, objective= 'binary:logistic', scale_pos_weight=1, seed=2)
    
    modelfit(xgb1, datasets["07-spec-mean"]["train"], Y_train)
    
tuneXGB9()

### Tuning LDA

In [None]:
def tuneLDA():
    param_test1 = [{"solver": ["svd"], "n_components": np.arange(1,len(X_train.columns) - 1)},
                   {"solver": ["lsqr", "eigen"], "n_components": np.arange(1,len(X_train.columns) - 1), "shrinkage": ["auto"]}]
        
    gsearch1 = GridSearchCV(estimator=LinearDiscriminantAnalysis(), 
                            param_grid = param_test1, scoring='roc_auc', cv=10)
    
    fit = gsearch1.fit(datasets["09-spec-min"]["train"], Y_train)
    return(fit)
    
bestLDAfit = tuneLDA()

In [None]:
bestLDAfit.best_params_

In [None]:
bestLDAfit.best_score_

In [None]:
def evaluatingBestLDA():
    kfold = mds.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2)
    model = LinearDiscriminantAnalysis(n_components = 1, shrinkage = "auto", solver="lsqr")
    
    cv_results = mds.cross_val_score(model, datasets["09-spec-min"]["train"], Y_train, cv=kfold, scoring="roc_auc")
    print(cv_results.mean())
    
evaluatingBestLDA()

## 3.2 - Unsupervised Anomaly Detection Methods
We decided to try LOF and see how it goes. We used our implementation from HW1

In [None]:
import lof_pal as lof

In [None]:
def makePredictonsLOF():
    outliers = []
    
    kfold = mds.StratifiedKFold(n_splits=2, shuffle=True, random_state=2)
    sets,_ = kfold.split(datasets["07-spec-mean"]["train"], Y_train)                                      
    
    # Train with only positive examples:
    l = lof.LOF(datasets["07-spec-mean"]["train"].iloc[sets[0]][Y_train[sets[0]] != 1], 3)
    
    Y_pred = [1 if x > 1.2 else 0 for x in l.predict(datasets["07-spec-mean"]["train"].iloc[sets[1]])]
        
    plot_confusion_matrix(confusion_matrix(Y_train[sets[1]], Y_pred, labels = [0, 1]), classes = [0, 1])
    #return Y_pred
        
    
makePredictonsLOF()

# 4 - Submission to Kaggle

In [None]:
# choose algorithm
def makeSubmissionKaggle(NA_strategy, algorithm):
    print("Submiting using \"%s\"" % (NA_strategy))

    algorithm.fit(datasets[NA_strategy]["train"], Y_train)
    Y_pred = algorithm.predict(datasets[NA_strategy]["test"])
    Y_pred = Y_pred.astype(int)

    # save data to CSV
    saveDataToCSV(Y_pred)
    
#makeSubmissionKaggle("07-spec-mean", xgb.XGBClassifier( learning_rate=0.01, n_estimators=5000, max_depth=7,
#                      min_child_weight=1, gamma=0, subsample=0.55, colsample_bytree=0.85,
#                      reg_alpha=1e-5, objective= 'binary:logistic', scale_pos_weight=1, seed=2))

#makeSubmissionKaggle("09-spec-min", LinearDiscriminantAnalysis(n_components = 1, shrinkage = "auto", solver="lsqr"))
makeSubmissionKaggle("10-spec-max", AdaBoostClassifier())