In [41]:
import matplotlib.pyplot as plt
from Utils.univariate_featureSelection import featureSelectionResults, get_selectedFeatures
from Utils.setupDataset import get_dataset, load
from Utils.feature_vector import get_feature_names
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import f1_score, make_scorer
from sklearn import metrics as skmetrics
from Utils.selection_metrics import SelectionMetrics
import pandas as pd
from Utils.lasso_selections import get_LassoSelectionResults
from Utils.univariate_featureSelection import get_metrics, get_selectedFeatures
from sklearn import preprocessing
from Utils.feature_ranking import getTopFeatures, getRFE_ranking
import time
import os.path
from ast import literal_eval


f1 = make_scorer(f1_score)

In [58]:
#still the same
def featureSelectionResults_SVM(trainSet, validationSet, univariate, RFE, lasso, combination):
    # 1. get the feature rankings
    # 1.1 univariate ranks from file
    if not os.path.exists(combination):
        selections_df = combine_selections(trainSet, univariate, RFE, lasso)
        # save results in cvs
        selections_df.to_csv(combination, sep='\t')
    
    selections_df = pd.read_csv(combination, sep='\t', index_col=0)

    X_train = trainSet['data']
    y_train = trainSet['target']
    X_test = validationSet['data']
    y_test = validationSet['target']
       
    selections_df = apply_feature_selection_SVM(selections_df, X_train, y_train, X_test, y_test)
    selections_df = selections_df.sort_values('f1', ascending=False)
    
    selections_df = selections_df.reset_index(drop=True)
    selections_df.to_csv(combination, sep='\t')
    return selections_df




def apply_feature_selection_SVM(selections_df, X_train, y_train, X_test, y_test):
    
    for i in range(0, len(selections_df)):
        # convert the selection list from string to int list
        sel = literal_eval(selections_df['selection'][i])
        
        X_train_selection = get_selectedFeatures(X_train, sel) #apply feature selection according to current ranking
        X_test_selection = get_selectedFeatures(X_test, sel) #apply feature selection
        
        metrics = get_metrics_SVM(X_train_selection, y_train, X_test_selection, 
                                  y_test, selections_df['gamma'][i], selections_df['C'][i])
        
        selections_df.iloc[i, :len(metrics)] = metrics
       
    return selections_df



def get_metrics_SVM(X_train, y_train, X_test, y_test, gamma=0.1, C=1.0):
    
    # Scale X:
    scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    # 
    model = SVC(kernel='rbf', C=C, gamma=gamma)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)   


    confusion = skmetrics.confusion_matrix(y_test, y_pred)
    TP = confusion[1,1]
    TN = confusion[0,0]
    FP = confusion[0,1]
    FN = confusion[1,0]
    accuracy = skmetrics.accuracy_score(y_test, y_pred)
    recall = skmetrics.recall_score(y_test,y_pred)

    precision = skmetrics.precision_score(y_test, y_pred)
    f1 = 2*precision*recall/(precision+recall)
    metrics = np.array([ f1, precision, recall, accuracy, TN, FP, FN, TP])
    return metrics


def combine_selections(trainSet, univariate, RFE, lasso):
    # scale Trainset:
    trainSet["data"] = preprocessing.minmax_scale(trainSet["data"])
    
    col_labels = ["f1", "precision", "recall", "accuracy", "TN", "FP", "FN", "TP", "n", "selection", "source", "gamma", "C"]
    
    uni_selections = getTopFeatures(trainSet, univariate)
    RFE_selections = getRFE_ranking(trainSet, RFE)
       
    selections = np.concatenate((uni_selections, RFE_selections), axis=1)
    
    print "starting with no select at ", time.ctime()
    
    gamma, C = compute_SVM_params(trainSet, range(0,68))
    combination = [[0.0, 0.0, 0.0, 0.0, 0,0,0,0, 68, range(0,68), 'no selection', gamma, C]]
    source = ["f-test", "ranksum", "chi2", "mi", "combined", "RFE"]
    
    print "univariate and RFE at", time.ctime()
    for i, sel in enumerate(selections.T):
                
        for n in range(1, 68):
            gamma, C = compute_SVM_params(trainSet, sel[:n])
            combination.append([0.0, 0.0, 0.0, 0.0, 0,0,0,0, n, sel[:n], source[i], gamma, C])
    
    lasso_selections = load(lasso)
    
    print "now the lasso at", time.ctime()
    for sel in lasso_selections[:-1]:  # exclude the noSelections
        gamma, C = compute_SVM_params(trainSet, sel)
        combination.append([0.0, 0.0, 0.0, 0.0, 0,0,0,0, len(sel), sel, "Lasso", gamma, C])
    

    print "done. saving to file... ", time.ctime()
    df = pd.DataFrame(combination, columns = col_labels)
    #make sure all the selections are lists (instead of numpy arrays!!)
    df['selection'] = df['selection'].apply(lambda x: x.tolist() if type(x).__module__ == np.__name__ else x)

    df.to_csv("SVM_params.csv", sep='\t')
    
    return df
    
    
def compute_SVM_params(trainSet, selection): 
    gamma_range = np.logspace(-7,2,10)
    C_range = np.logspace(-2,3,6)

    #C_range = np.logspace(-3,10,14, base=2) #baaaad results!
    
    X_train = get_selectedFeatures(trainSet["data"], selection)
    y = trainSet["target"]
    
    tuning_params = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C':  C_range}]
    f1 = make_scorer(f1_score)
    
    clf = GridSearchCV(SVC(C=1), tuning_params, cv=5, scoring=f1, n_jobs=4)
    clf.fit(X_train, y)
    
    # parameters for best training result
    C = clf.best_params_['C']
    gamma =  clf.best_params_['gamma']

    return gamma, C

# TODO!
- ich brauche eine SVM RANKS DATAFRAME, wo ich einfach ALLE SELECTIONS zusammenschmeiss
- fürs plotten, da muss ich halt a neue funktion schreiben aber ich kann mir alle datensaetze von zB ranksum holen, dann sortier ich nach n und das sind meine Punkte fuer den Plot, viel intelligenter, ois beinaund

# next step
- Dataframe für SVM erstellen wo alle Selections zusammen sind, mit zusätzliche Spalten: C, gamma
- vielleicht sollte ich gleich auch Spalten machen für die Metrics und daweil halt NaN hineinschreiben oder 0.0 bzw. -1.0

In [15]:

m_dev = get_dataset("Datasets/M_DEV_dataset.pickle")
w_dev = get_dataset("Datasets/W_DEV_dataset.pickle")
m_val = get_dataset("Datasets/M_VAL_dataset.pickle")
w_val = get_dataset("Datasets/W_VAL_dataset.pickle")

m_uni = 'Datasets/M_SELECTIONS_univariate.csv'
w_uni = 'Datasets/W_SELECTIONS_univariate.csv'


m_rfe = 'Datasets/M_SELECTIONS_RFE.csv'
w_rfe = 'Datasets/W_SELECTIONS_RFE.csv'



m_sel_lasso = "Datasets/M_SELECTIONS_lasso.pickle"
w_sel_lasso = "Datasets/W_SELECTIONS_lasso.pickle"


In [5]:
import warnings
warnings.filterwarnings('ignore')
df = combine_selections(m_dev, m_uni, m_rfe, m_sel_lasso)

starting with no select at  Sun May 21 19:45:52 2017
univariate and RFE at Sun May 21 19:45:57 2017
now the lasso at Sun May 21 20:10:19 2017
done. saving to file...  Sun May 21 20:11:33 2017


In [None]:
new_df = featureSelectionResults_SVM(m_dev, m_val, m_uni, m_rfe, m_sel_lasso, "Datasets/M_SVM_params.csv")
new_df

In [36]:
df.to_csv("M_SVM_params.csv", sep='\t')

In [55]:
new_df.loc[new_df['C'] == 0.1]

Unnamed: 0,f1,precision,recall,accuracy,TN,FP,FN,TP,n,selection,source,gamma,C
50,0.79562,0.746575,0.851562,0.782946,93.0,37.0,19.0,109.0,4,"[40, 1, 59, 24]",RFE,100.0,0.1
352,0.760148,0.72028,0.804688,0.748062,90.0,40.0,25.0,103.0,12,"[40, 1, 59, 24, 46, 34, 54, 42, 61, 56, 58, 11]",RFE,10.0,0.1
368,0.753846,0.742424,0.765625,0.751938,96.0,34.0,30.0,98.0,15,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59, 60...",combined,10.0,0.1
384,0.742188,0.742188,0.742188,0.744186,97.0,33.0,33.0,95.0,13,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59, 60...",combined,10.0,0.1
385,0.738095,0.75,0.726562,0.744186,99.0,31.0,35.0,93.0,11,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59]",combined,10.0,0.1
388,0.736434,0.730769,0.742188,0.736434,95.0,35.0,33.0,95.0,9,"[1, 43, 10, 11, 40, 26, 59, 35, 56]",f-test,10.0,0.1
390,0.735178,0.744,0.726562,0.74031,98.0,32.0,35.0,93.0,12,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59, 60]",combined,10.0,0.1
391,0.733591,0.725191,0.742188,0.732558,94.0,36.0,33.0,95.0,8,"[1, 43, 10, 11, 40, 26, 59, 35]",f-test,10.0,0.1
392,0.729242,0.677852,0.789062,0.709302,82.0,48.0,27.0,101.0,7,"[43, 1, 10, 62, 60, 40, 35]",mi,10.0,0.1
396,0.722045,0.610811,0.882812,0.662791,58.0,72.0,15.0,113.0,1,[1],f-test,1.0,0.1


In [8]:
selections_df = pd.read_csv("SVM_params.csv", sep='\t', index_col=0)

In [9]:
selections_df

Unnamed: 0,f1,precision,recall,accuracy,TN,FP,FN,TP,n,selection,source,gamma,C
0,0.0,0.0,0.0,0.0,0,0,0,0,68,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",no selection,0.10000,10.0
1,0.0,0.0,0.0,0.0,0,0,0,0,1,[1],f-test,1.00000,0.1
2,0.0,0.0,0.0,0.0,0,0,0,0,2,[ 1 43],f-test,1.00000,0.1
3,0.0,0.0,0.0,0.0,0,0,0,0,3,[ 1 43 10],f-test,0.01000,1.0
4,0.0,0.0,0.0,0.0,0,0,0,0,4,[ 1 43 10 11],f-test,0.10000,0.1
5,0.0,0.0,0.0,0.0,0,0,0,0,5,[ 1 43 10 11 40],f-test,0.00001,1000.0
6,0.0,0.0,0.0,0.0,0,0,0,0,6,[ 1 43 10 11 40 26],f-test,0.10000,1000.0
7,0.0,0.0,0.0,0.0,0,0,0,0,7,[ 1 43 10 11 40 26 59],f-test,10.00000,0.1
8,0.0,0.0,0.0,0.0,0,0,0,0,8,[ 1 43 10 11 40 26 59 35],f-test,10.00000,0.1
9,0.0,0.0,0.0,0.0,0,0,0,0,9,[ 1 43 10 11 40 26 59 35 56],f-test,10.00000,0.1


In [None]:
w_df = featureSelectionResults_SVM(w_dev, w_val, w_uni, w_rfe, w_sel_lasso, "Datasets/W_SVM_params.csv")
w_df

starting with no select at  Sun May 21 23:22:54 2017
univariate and RFE at Sun May 21 23:27:32 2017
