In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import zipfile
from tqdm import tqdm_notebook
from glob import glob
from tqdm import tqdm 

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score

import scipy
from scipy import interp, stats
from sklearn import preprocessing
from sklearn.externals.joblib import parallel_backend
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, PolynomialFeatures
from sklearn.metrics import roc_curve, auc, average_precision_score, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import calibration_curve
from sklearn.metrics import f1_score
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier
from sklearn.externals.joblib import Memory
from typing import List, Tuple, Text, TypeVar, Dict, Any
import logging

from sklearn.utils import check_random_state

from tsfresh import extract_features
from tsfresh import select_features
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction.feature_calculators import absolute_sum_of_changes, binned_entropy
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.utilities.dataframe_functions import impute



# Directory handed to the joblib cache below.
cachedir = '../cache'
# NOTE(review): `memory` is not referenced again in the visible notebook — confirm it is still needed.
memory = Memory(cachedir=cachedir, verbose=3)
# Root logger set to DEBUG level.
logging.basicConfig(level=logging.DEBUG)

# Read data

In [None]:
# Label tables: training targets and the submission template, each tagged with
# a `train` flag (1 / 0) so they can be told apart once stacked.
train = pd.read_csv('ytrain.csv').assign(train=1)
test = pd.read_csv('SampleSubmission.csv').assign(train=0)

labels = pd.concat([train, test], ignore_index=True)

In [None]:
# Only the last expression of a cell is echoed, so the result of this lookup is not displayed.
labels[labels.Id == 219]
len(labels)

# Baseline features extraction (score: .88)

In [None]:
def calc_features(df):
    """Stack per-column summary statistics of ``df`` into a single Series.

    The result is the column-wise max, min, mean, std and var of ``df``
    concatenated end to end in that order. Every segment keeps ``df``'s column
    labels, so the labels repeat once per statistic.
    """
    stat_names = ('max', 'min', 'mean', 'std', 'var')
    per_stat = [getattr(df, stat)() for stat in stat_names]

    return pd.concat(per_stat, axis=0)

# Baseline features: one statistics vector per CSV found in the two archives,
# keyed by the sample Id taken from the file name.
features = {}
frames = []

for archive in 'xtrain.zip', 'xtest.zip':
    with zipfile.ZipFile(archive) as zf:
        for name in tqdm_notebook(zf.namelist()):
            if not name.endswith('.csv'):
                continue

            # The numeric file stem is the sample Id (e.g. "dir/219.csv" -> 219).
            fname = name.split('/')[-1]
            Id = int(fname.split('.')[0])

            # Close each archive member as soon as it is parsed instead of
            # leaving one open handle per file until garbage collection.
            with zf.open(name) as member:
                df = pd.read_csv(member, header=None)

            frames.append(df)
            features[Id] = calc_features(df)
            
            


In [None]:
# One row per sample: transpose the {Id: statistics} mapping and expose the Id
# as an ordinary column.
features = pd.DataFrame(features).T.rename_axis('Id').reset_index()

features.head()

In [None]:
# prepare dataset

# Raw per-sample frames, each tagged with its Id and its `Attack` label.
frames2 = []

for archive, is_train in (('xtrain.zip', 1), ('xtest.zip', 0)):
    # `labels` was built with ignore_index=True, so its index is positional and
    # must not be addressed with a sample Id. Look the label up through the
    # `Id` column instead, restricted to the split this archive belongs to.
    attack_by_id = labels.loc[labels['train'] == is_train].set_index('Id')['Attack']

    with zipfile.ZipFile(archive) as zf:
        for name in tqdm_notebook(zf.namelist()):
            if not name.endswith('.csv'):
                continue

            # The numeric file stem is the sample Id.
            fname = name.split('/')[-1]
            Id = int(fname.split('.')[0])

            # Close the archive member as soon as it has been parsed.
            with zf.open(name) as member:
                df = pd.read_csv(member, header=None)

            df['Id'] = Id
            df['label'] = int(attack_by_id.loc[Id])
            frames2.append(df)





# Generate features with tsfresh

In [None]:
# settings to try for features generation with tsfresh 


# tsfresh feature-calculator settings: calculator name -> list of parameter
# dicts (None for calculators that take no parameters). Each key is listed
# exactly once; repeated keys in a dict literal silently overwrite each other.
hack_settings_small = {
    'length': None,
    'maximum': None,
    'mean': None,
    'median': None,
    'minimum': None,
    'standard_deviation': None,
    'sum_values': None,
    'variance': None,
    'abs_energy': None,
    'absolute_sum_of_changes': None,
    'agg_autocorrelation': [
        {'f_agg': 'mean'},
        {'f_agg': 'median'},
        {'f_agg': 'var'}],
    'autocorrelation': [
        {'lag': 10},
        {'lag': 50},
        {'lag': 100}],
    'c3': [{'lag': 1}, {'lag': 10}],
    'cid_ce': [{'normalize': True}, {'normalize': False}],
    'count_above_mean': None,
    'count_below_mean': None,
    'fft_aggregated': [
        {'aggtype': 'centroid'},
        {'aggtype': 'variance'},
        {'aggtype': 'skew'},
        {'aggtype': 'kurtosis'}],
    'fft_coefficient': [
        {'attr': 'real', 'coeff': 0},
        {'attr': 'real', 'coeff': 1},
        {'attr': 'real', 'coeff': 2},
        {'attr': 'real', 'coeff': 3},
        {'attr': 'real', 'coeff': 4},
        {'attr': 'real', 'coeff': 5},
        ],
    'index_mass_quantile': [
        {'q': 0.1},
        {'q': 0.2},
        {'q': 0.9}],
    'kurtosis': None,
    'large_standard_deviation': [
        {'r': 0.05},
        {'r': 0.9500000000000001}],
    'last_location_of_maximum': None,
    'last_location_of_minimum': None,
    'linear_trend': [
        {'attr': 'pvalue'},
        {'attr': 'rvalue'},
        {'attr': 'intercept'},
        {'attr': 'slope'},
        {'attr': 'stderr'}],
    'mean_abs_change': None,
    'mean_change': None,
    'mean_second_derivative_central': None,
    'partial_autocorrelation': [
        {'lag': 0},
        {'lag': 1},
        {'lag': 2},
        {'lag': 3},
        {'lag': 4},
        {'lag': 5},
    ],
    'quantile': [
        {'q': 0.2},
        {'q': 0.8}],
    'range_count': [{'max': 1, 'min': -1}],
    'skewness': None,
    'variance_larger_than_standard_deviation': None}

In [None]:
%%time 

# One tsfresh feature row per sample frame.
hack_features = []

for frame in tqdm(frames2):
    ts = frame.copy().drop('label', axis=1)
    # tsfresh orders each series by `column_sort`. The sort key must therefore
    # be the row position; a constant value would leave the order of the
    # observations to the tie-breaking of the sort and distort every
    # order-dependent feature (autocorrelation, trends, FFT, ...).
    ts['time'] = np.arange(len(ts))

    X = extract_features(ts,
                         column_id='Id', column_sort='time',
                         default_fc_parameters=hack_settings_small,
                         impute_function=impute,
                         chunksize=500
                         )

#     X.to_csv('./features/{}.csv'.format(Id))
    hack_features.append(X)
    print(X.shape)
 

In [None]:
# Stack the per-sample feature rows and keep only the first occurrence of any
# repeated column name.
hack_f = pd.concat(hack_features, axis=0).pipe(
    lambda stacked: stacked.loc[:, ~stacked.columns.duplicated()]
)

print(hack_f.shape)
print(hack_f.index)

hack_f.head()


### Clustering sensors (by correlation)

In [None]:
# collect all time-series data in one large DataFrame

# Copy every per-sample frame, add a `time` column holding that frame's Id,
# then stack all copies row-wise.
frames_for_tall = [frame.assign(time=frame['Id'][0]) for frame in tqdm(frames2)]

tall_df = pd.concat(frames_for_tall, axis=0)
y = tall_df[['label']]

print(tall_df.shape)
tall_df.head(3)

In [None]:
# Sensor columns only: strip the bookkeeping columns from the stacked frame.
df = tall_df.drop(columns=['Id', 'label', 'time'])

In [None]:
def correlation_matrix(df):
    """Plot the pairwise correlation of ``df``'s columns and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``df.corr()``.

    Returns
    -------
    pandas.DataFrame
        The correlation matrix that was plotted, so callers can reuse it
        (the clustering cell passes it to ``linkage``).
    """
    corr = df.corr()

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    ax1.set_title('Feature Correlation')

    # Label the axes with the actual column names, one tick per column.
    positions = list(range(len(corr.columns)))
    ax1.set_xticks(positions)
    ax1.set_yticks(positions)
    ax1.set_xticklabels(corr.columns, fontsize=6, rotation=90)
    ax1.set_yticklabels(corr.columns, fontsize=6)

    # Let matplotlib choose colorbar ticks: correlations span [-1, 1].
    fig.colorbar(cax)
    plt.show()

    return corr

cm = correlation_matrix(df)

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Single-linkage hierarchical clustering of the rows of `cm` under the
# correlation distance, drawn as a dendrogram.
plt.figure(figsize=(20, 10))

Z = linkage(cm, method='single', metric='correlation')
dendrogram(Z, color_threshold=0)


### Get all features dataset

In [None]:
# Attach the label table to the tsfresh features (matching `id` to `Id`), then
# drop every column whose values duplicate an earlier column.
extracted_features = (
    hack_f.reset_index()
    .merge(labels, how='left', left_on='id', right_on='Id')
    .T.drop_duplicates().T
)

print(extracted_features.shape)
extracted_features.head()

In [None]:
# Print one column name per line. A plain loop is used because a list
# comprehension would also echo a list of `None` values as the cell output.
for col in extracted_features.columns.tolist():
    print(col)

### Load saved   features

In [None]:
# load 

# Rebinds `extracted_features` to a feature table read from disk.
extracted_features = pd.read_csv('data_features_3251.csv')
extracted_features.shape


### Load features (from Roman)

In [None]:
# Rebinds `extracted_features` again; the cells below work on whichever load cell ran last.
extracted_features = pd.read_csv('./features_Roman.csv')

print(extracted_features.shape)
extracted_features.head()

#### Get trainset 

In [None]:
# preparing datasets 

def get_trainset(data, xcols=None, testout=False, test_size=0.2, random_state=None):
    """
    Get train set from features DF

    Rows with ``data.train == 1`` are kept. Missing feature values are replaced
    by 0 and the target is the ``Attack`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'train' and 'Attack' plus the feature columns.
    xcols : list of column names, optional
        Feature columns to use. Defaults to every column except 'Id', 'Attack'
        and 'train'. Duplicates are removed; first-seen order is kept.
    testout : bool
        When True, hold out a dev split with ``train_test_split``.
    test_size : float
        Share of training rows placed in the dev split (used when ``testout``).
    random_state : int or None
        Seed forwarded to ``train_test_split`` for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, y_train, None, None)`` when ``testout`` is not True, and
        ``(X_train, X_dev, y_train, y_dev)`` when it is. The element order
        differs between the two modes; it is kept as-is for existing callers.
    """
    target = 'Attack'

    if xcols is None:
        xcols = [c for c in data.columns.tolist() if c not in ('Id', 'Attack', 'train')]

    # De-duplicate while keeping column order. A set would reorder the columns
    # differently from one interpreter session to the next, which breaks seeded
    # models and any positional pairing of columns with importances.
    xcols = list(dict.fromkeys(xcols))

    is_train = data.train == 1
    X_train = data.loc[is_train, xcols].fillna(0)
    y_train = data.loc[is_train, target]

    if testout is True:
        X_train, X_dev, y_train, y_dev = train_test_split(
            X_train, y_train, test_size=test_size, random_state=random_state)

        return X_train, X_dev, y_train, y_dev

    return X_train, y_train, None, None

def get_testset(data, xcols=None):
    """
    Get test set from features DF

    Rows with ``data.train == 0`` are kept. Missing feature values are replaced
    by 0 and the target is the ``Attack`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'train' and 'Attack' plus the feature columns.
    xcols : list of column names, optional
        Feature columns to use. Defaults to every column except 'Id', 'Attack'
        and 'train'. Duplicates are removed; first-seen order is kept.

    Returns
    -------
    tuple
        ``(X_test, y_test)``.
    """
    target = 'Attack'

    if xcols is None:
        xcols = [c for c in data.columns.tolist() if c not in ('Id', 'Attack', 'train')]

    # De-duplicate while keeping column order (a set would reorder the columns
    # differently from one interpreter session to the next).
    xcols = list(dict.fromkeys(xcols))

    is_test = data.train == 0
    X_test = data.loc[is_test, xcols].fillna(0)
    y_test = data.loc[is_test, target]

    return X_test, y_test



In [None]:
# target = 'Attack'

xtrain, ytrain, _, _ = get_trainset(extracted_features)
# The feature-selection cells below transform `xtest`, so build it here next to
# `xtrain`; on a fresh kernel no other live cell defines it.
xtest, ytest = get_testset(extracted_features)
len(ytrain)

### RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for a random-forest grid search.
RF_params_grid = dict(
    n_estimators=[60, 100, 150],
    max_depth=[15, 20],
    min_samples_leaf=[1, 2, 4],
    random_state=[42],
    class_weight=['balanced'],
)

In [None]:
def _grid_search_and_report(estimator, X_train, y_train, grid_params, metric):
    """Grid-search ``estimator`` on ``metric`` and print a cross-validated summary of it."""
    # Select hyper-parameters on the same metric that is reported below;
    # without `scoring` the search would rank candidates by the estimator's
    # default score instead.
    clf = GridSearchCV(estimator=estimator,
                       param_grid=grid_params,
                       scoring=metric,
                       cv=5, verbose=3, n_jobs=-1).fit(X_train, y_train)

    # The search object itself is cross-validated, i.e. the grid search is
    # repeated inside every outer fold (nested cross-validation).
    scores = cross_val_score(clf, X_train, y_train, scoring=metric)
    print('{}: mean - {}, std - {}'.format(metric, scores.mean(), scores.std()))

    return clf


def train_ExtraTreesClassifier(X_train, y_train, grid_params=None, n_important_f=20, metric='roc_auc'):
    """Grid-search an ExtraTreesClassifier and print its cross-validated ``metric``.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    grid_params : dict, optional
        Parameter grid for GridSearchCV; a small default grid is used when None.
    n_important_f : int
        Not used by this function; kept so existing calls keep working.
    metric : str
        Scoring name used both for model selection and for the printed score.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    if grid_params is None:
        grid_params = {
            "n_estimators": [50, 60, 100],
            "max_depth": [10, 15],
            "min_samples_leaf": [1, 3],
            'random_state': [42]
        }

    return _grid_search_and_report(ExtraTreesClassifier(), X_train, y_train, grid_params, metric)


def train_RandomForest(X_train, y_train, grid_params=None, n_important_f=20, metric='roc_auc'):
    """Grid-search a RandomForestClassifier and print its cross-validated ``metric``.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    grid_params : dict, optional
        Parameter grid for GridSearchCV; a small default grid is used when None.
    n_important_f : int
        Not used by this function; kept so existing calls keep working.
    metric : str
        Scoring name used both for model selection and for the printed score.

    Returns
    -------
    The fitted GridSearchCV object.
    """
    if grid_params is None:
        grid_params = {
            "n_estimators": [50, 60, 100, 150],
            "max_depth": [10, 15],
            "min_samples_leaf": [1, 3],
            'random_state': [42]
        }

    return _grid_search_and_report(RandomForestClassifier(), X_train, y_train, grid_params, metric)

In [None]:
# train RandomForest 

In [None]:
# Random-forest grid search on standardized training features (default grid, since grid_params=None).
RF_clf = train_RandomForest(StandardScaler().fit_transform(xtrain), 
                            ytrain, grid_params=None, n_important_f=100)

RF_clf.best_params_

In [None]:
importances = RF_clf.best_estimator_.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

imp_f = []

# List up to 200 features by decreasing importance. The name is looked up with
# the same sorted position as the importance value so that the two correspond,
# and the range is capped so fewer than 200 features do not raise IndexError.
for f in range(min(200, len(indices))):
    feature_name = xtrain.columns[indices[f]]
    print('{}) {} - {}'.format(f, feature_name, importances[indices[f]]))
    imp_f.append(feature_name)

## Features Selection

#### SelectFromModel

In [None]:
from sklearn.feature_selection import VarianceThreshold, SelectFromModel

# Feature selector built on the already-fitted best random forest (prefit=True, so no fit call is made here).
sm_model = SelectFromModel(RF_clf.best_estimator_, prefit=True)
X_small = sm_model.transform(xtrain)
print(X_small.shape)

# Apply the same selection to the test features.
X_test_small = sm_model.transform(xtest)
print(X_test_small.shape)


In [None]:
# add columns name to selected ndarray of features (i.e. functions to train classifiers works with column names)

#  select xtrain set
# The selector's boolean support mask picks the names of the surviving columns.
X_selected_df = pd.DataFrame(X_small, columns=xtrain.columns[sm_model.get_support()])


print(X_selected_df.shape)
X_selected_df.head()

In [None]:
#  select xtest set

# The selector's boolean support mask picks the names of the surviving columns.
X_test_selected_df = pd.DataFrame(X_test_small, columns=xtest.columns[sm_model.get_support()])

print(X_test_selected_df.shape)
X_test_selected_df.head()

In [None]:
# run RF train again with selected features 

# Random-forest grid search on the standardized selected features.
RF_clf_small = train_RandomForest(StandardScaler().fit_transform(X_small), 
                                  ytrain, grid_params=None, n_important_f=100)

In [None]:
# Extra-trees grid search on the standardized selected features.
ExT_clf_small = train_ExtraTreesClassifier(StandardScaler().fit_transform(X_small), 
                                     ytrain, grid_params=None, n_important_f=100)

#### PCA 

In [None]:
from sklearn.decomposition import PCA

# Project the (unscaled) training features onto 55 principal components.
pca = PCA(n_components=55)
X_pca = pca.fit_transform(xtrain)

In [None]:
# Random-forest grid search on the standardized PCA components.
RF_clf_pca = train_RandomForest(StandardScaler().fit_transform(X_pca), 
                                ytrain, grid_params=None, n_important_f=100)

#### FastICA 

In [None]:
from sklearn.decomposition import FastICA

# Decompose the training features into 100 independent components.
# NOTE(review): no random_state is passed to FastICA, so results may vary between runs — confirm that is acceptable.
ica = FastICA(n_components=100)
X_ica = ica.fit_transform(xtrain)  # Reconstruct signals

In [None]:
# Random-forest grid search on the ICA components (no standardization applied here).
RF_clf_ica = train_RandomForest(X_ica, ytrain, grid_params=None, n_important_f=100)

### GradientBoostingClassifier 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameter grid for the gradient-boosting grid searches below.
gb_grid_params = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [4, 16],
    'min_samples_leaf': [1, 3],
    "n_estimators" : [30, 60, 100],
    #'max_features': [1.0, 0.3, 0.1] 
    }

# Unfitted base estimator with default settings.
clf_GB = GradientBoostingClassifier()

In [None]:
%%time

# trained with initial features set
# Always search over a fresh GradientBoostingClassifier. `clf_GB` is rebound to
# the fitted search on this very line, so passing it as the estimator would
# wrap a GridSearchCV inside another one whenever the cell is run again.
clf_GB = GridSearchCV(estimator=GradientBoostingClassifier(),
                      param_grid=gb_grid_params,
                      cv=5, verbose=3, n_jobs=-1).fit(xtrain, ytrain)

scores = cross_val_score(clf_GB, xtrain, ytrain, scoring='roc_auc')
scores.mean(), scores.std()

In [None]:
%%time

# selected features only 
# Search over a fresh GradientBoostingClassifier. By this point `clf_GB` holds
# the fitted GridSearchCV from the previous cell, and `gb_grid_params` names
# parameters of the boosting model, not of a GridSearchCV.
clf_GB = GridSearchCV(estimator=GradientBoostingClassifier(),
                      param_grid=gb_grid_params,
                      cv=5, verbose=3, n_jobs=-1).fit(X_small, ytrain)

scores = cross_val_score(clf_GB, X_small, ytrain, scoring='roc_auc')
scores.mean(), scores.std()

###  VotingClassifier with GridSearch (this gave score = .95)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import VotingClassifier

# Three base models combined by soft voting.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
estimators = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]

eclf = VotingClassifier(estimators=estimators, voting='soft')

# Grid over the logistic-regression ('lr__') and random-forest ('rf__') members.
params = {'lr__C': [0.01, 0.1, 1.0, 10.0], 
          'lr__penalty': ['l1'],
          'rf__n_estimators': [60, 100, 150],
          "rf__max_depth" : [6, 10, 15],
          "rf__min_samples_leaf" : [1, 3],
          'rf__random_state': [42]
         }

# NOTE(review): the search is fitted on the raw X_small, while the score below
# is computed on a standardized copy of X_small — confirm which is intended.
vote_clf = GridSearchCV(estimator=eclf, 
                    param_grid=params, 
                    cv=5, verbose=3, n_jobs=-1).fit(X_small, ytrain)
                   
scores = cross_val_score(vote_clf, 
                         StandardScaler().fit_transform(X_small), 
                         ytrain, scoring='roc_auc')

print('VOTING RESULTS: ', scores.mean(), scores.std())

In [None]:
# Show the voting ensemble refitted with the best parameters found by the search.
vote_clf.best_estimator_

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


# Grid over the wrapped base estimator ('base_estimator__' keys) and the bagging wrapper itself.
bc_params = {"base_estimator__max_depth": [15],
          "base_estimator__max_features": [None, "auto"],
          "base_estimator__min_samples_leaf": [1, 3],
          "base_estimator__min_samples_split": [2, 5],
          'bootstrap_features': [False, True],
          'max_features': [0.2, 0.3],
          'max_samples': [0.5, 1.0],
          'n_estimators': [60, 150],
}

# NOTE(review): the search is fitted on all features (xtrain), while the score
# below is computed on the standardized selected features (X_small) — confirm
# which feature set is intended.
bag_clf = GridSearchCV(estimator=BaggingClassifier(RandomForestClassifier()), 
                       param_grid=bc_params, 
                       cv=5, verbose=3, n_jobs=-1).fit(xtrain, ytrain)

scores = cross_val_score(bag_clf, 
                         StandardScaler().fit_transform(X_small), 
                         ytrain, scoring='roc_auc')
print('BAGGING RESULTS: ', scores.mean(), scores.std())


### Ensembles of Classifiers that Operate on Different Feature Subsets

https://rasbt.github.io/mlxtend/user_guide/classifier/EnsembleVoteClassifier/#example-6-ensembles-of-classifiers-that-operate-on-different-feature-subsets

In [None]:
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

columns = xtrain.columns.tolist()

# NOTE(review): col1 and col2 are identical and are not used by the pipelines
# below, which select columns by position instead — confirm whether they are needed.
col1= columns[0:1000]
col2= columns[0:1000]
          


# Two logistic regressions, each fed a different block of 100 columns.
pipe1 = make_pipeline(ColumnSelector(cols=range(0, 100)),
                      LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=range(100, 200)),
                      LogisticRegression())

evclf = EnsembleVoteClassifier(clfs=[pipe1, pipe2])

scores = cross_val_score(evclf, xtrain, ytrain, scoring='roc_auc')
print('RESULTS: ', scores.mean(), scores.std())

# Custom crazy ensemble  (gave score = .92 somehow)

In [None]:
# try a custom ensemble (this gave a score of 0.92)

def get_features_for_sensor(df, sensors):
    """
    Select the feature columns that belong to a cluster of sensors.

    A column belongs to a sensor when its name, converted to ``str``, ends with
    ``'_<sensor>'``. Columns are returned grouped by sensor in the order given,
    and within a sensor in the column order of ``df``. The result is taken from
    a copy of ``df``.
    """
    all_columns = df.columns.tolist()
    suffixes = ['_{}'.format(str(sensor)) for sensor in sensors]

    picked = [
        col
        for suffix in suffixes
        for col in all_columns
        if str(col).endswith(suffix)
    ]

    return df.copy()[picked]


def train_ExT_cluster(X, sensors, n, metric):
    """
    Train classifier for a cluster of sensors

    X: features DataFrame; the columns of `sensors` are picked with
       get_features_for_sensor
    sensors: sensor ids that form the cluster
    n: number of features to select for a model (PCA components)
    metric: scoring name forwarded to train_ExtraTreesClassifier

    Returns (clf, reducer). `reducer` is the fitted PCA -> StandardScaler
    pipeline, so `reducer.transform(...)` applies exactly the preprocessing the
    classifier was trained on; returning the bare PCA would drop the scaling
    step at prediction time.

    Targets are read from the notebook-level `ytrain`.
    """
    df = get_features_for_sensor(X, sensors)
    print(df.shape)

    reducer = Pipeline([
        ('pca', PCA(n_components=n)),
        ('scale', StandardScaler()),
    ])
    X_in = reducer.fit_transform(df)
    print(X_in.shape)

    clf = train_ExtraTreesClassifier(X_in, ytrain, grid_params=None, n_important_f=20, metric=metric)

    return clf, reducer

In [None]:
# clusters of sensors (based on features clustering)

# Sensor ids per cluster. Some ids appear in more than one list (e.g. 5 is in
# both c2 and c4, 3 is in both c2 and c5).
c1 = [6, 10, 23, 29, 38, 47]
c2 = [3, 5, 22, 27, 28, 33, 34, 35,36, 41, 43]
c3 = [1, 4, 17, 44, 45, 49, 54]
c4 = [5, 20, 25, 31, 50]
c5 = [2, 3, 13, 24, 26, 31, 32, 33, 34, 40, 42, 46]
c6 = [0, 7, 9, 11, 12, 15, 17, 21, 35, 53]

In [None]:
# train classifier for each cluster 

# One model per sensor cluster, trained in cluster order with 28 components and
# the 'precision' metric.
(
    (c1_ExT_clf, c1_pca),
    (c2_ExT_clf, c2_pca),
    (c3_ExT_clf, c3_pca),
    (c4_ExT_clf, c4_pca),
    (c5_ExT_clf, c5_pca),
    (c6_ExT_clf, c6_pca),
) = [
    train_ExT_cluster(X_selected_df, sensors, 28, 'precision')
    for sensors in (c1, c2, c3, c4, c5, c6)
]

In [None]:
# merge cluster classifiers into an Ensemble (as a dict) 

# Ensemble description: cluster name -> its sensors, classifier and reducer.
c_ensemble_clf = {
    name: {'sensors': sensors, 'clf': clf, 'pca': pca}
    for name, sensors, clf, pca in (
        ('c1', c1, c1_ExT_clf, c1_pca),
        ('c2', c2, c2_ExT_clf, c2_pca),
        ('c3', c3, c3_ExT_clf, c3_pca),
        ('c4', c4, c4_ExT_clf, c4_pca),
        ('c5', c5, c5_ExT_clf, c5_pca),
        ('c6', c6, c6_ExT_clf, c6_pca),
    )
}


In [None]:
def custom_ensemble_clf(data, c_ensemble_clf):
    """
    Predict with every cluster model and keep the element-wise maximum.

    For each member of ``c_ensemble_clf`` the member's sensor columns are taken
    from ``data``, passed through the member's ``'pca'`` transform and scored
    with ``predict_proba``. The per-member outputs are stacked and reduced with
    a maximum over the member axis.
    """
    per_member = []
    for name, member in c_ensemble_clf.items():
        print(name)

        model = member['clf']
        cluster_sensors = member['sensors']
        reducer = member['pca']

        reduced = reducer.transform(get_features_for_sensor(data, cluster_sensors))
        print(reduced.shape)
        per_member.append(model.predict_proba(reduced))

    return np.array(per_member).max(axis=0)


In [None]:
# Element-wise maximum of the cluster models' predict_proba outputs on the selected test features.
result = custom_ensemble_clf(X_test_selected_df, c_ensemble_clf)

result

# Predict for test

In [None]:
# get prediction for models, train with all features 

# clf = vote_clf

# data = extracted_features
# xcols =  [c for c in data.columns.tolist() if not c in ('Id', 'Attack', 'train')]

# xtest,  ytest = get_testset(extracted_features)
# print(xtest.shape, ytest.shape)

# clf.fit(xtrain, ytrain)
# prediction = clf.predict_proba(xtest)[:,1]

# sample_submission = pd.read_csv('SampleSubmission.csv')
# sample_submission['Attack'] = prediction
# sample_submission.to_csv('baseline.csv', index=False)

In [None]:
# get predictions for the custom ensemble

# Column 1 of the ensemble output is written out as the `Attack` score.
prediction = custom_ensemble_clf(X_test_selected_df, c_ensemble_clf)[:,1]

# NOTE(review): assumes the rows of X_test_selected_df are in the same order as SampleSubmission.csv — confirm.
sample_submission = pd.read_csv('SampleSubmission.csv')
sample_submission['Attack'] = prediction
sample_submission.to_csv('baseline_v5_CrazyCustom.csv', index=False)

prediction