In [9]:
import matplotlib.pyplot as plt
from Utils.univariate_featureSelection import featureSelectionResults, get_selectedFeatures
from Utils.setupDataset import get_dataset, load
from Utils.feature_vector import get_feature_names
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score, make_scorer
from sklearn import metrics as skmetrics
from Utils.selection_metrics import SelectionMetrics
import pandas as pd
from Utils.lasso_selections import get_LassoSelectionResults
from Utils.univariate_featureSelection import get_metrics, get_selectedFeatures
from sklearn import preprocessing
from Utils.feature_ranking import getTopFeatures, getRFE_ranking
import time
import os.path
from ast import literal_eval



f1 = make_scorer(f1_score)

In [14]:
#still the same
def featureSelectionResults_SVM(trainSet, validationSet, univariate, RFE, lasso, combination, rfb_kernel=True):
    # 1. get the feature rankings
    # 1.1 univariate ranks from file
    if not os.path.exists(combination):
        selections_df = combine_selections(trainSet, univariate, RFE, lasso, rfb_kernel)
        # save results in cvs
        selections_df.to_csv(combination, sep='\t')
    
    selections_df = pd.read_csv(combination, sep='\t', index_col=0)

    X_train = trainSet['data']
    y_train = trainSet['target']
    X_test = validationSet['data']
    y_test = validationSet['target']
       
    selections_df = apply_feature_selection_SVM(selections_df, X_train, y_train, X_test, y_test, rfb_kernel)
    selections_df = selections_df.sort_values('f1', ascending=False)
    
    selections_df = selections_df.reset_index(drop=True)
    selections_df.to_csv(combination, sep='\t')
    return selections_df




def apply_feature_selection_SVM(selections_df, X_train, y_train, X_test, y_test, rfb_kernel):
    
    for i in range(0, len(selections_df)):
        # convert the selection list from string to int list
        sel = literal_eval(selections_df['selection'][i])
        
        X_train_selection = get_selectedFeatures(X_train, sel) #apply feature selection according to current ranking
        X_test_selection = get_selectedFeatures(X_test, sel) #apply feature selection
        
        if rfb_kernel:
            metrics = get_metrics_SVM(X_train_selection, y_train, X_test_selection, 
                                      y_test, selections_df['gamma'][i], selections_df['C'][i])
        else:
            metrics = get_metrics_linearSVM(X_train_selection, y_train, X_test_selection, 
                                      y_test, selections_df['C'][i])
        
        selections_df.iloc[i, :len(metrics)] = metrics
       
    return selections_df



def get_metrics_SVM(X_train, y_train, X_test, y_test, gamma=0.1, C=1.0):
    
    # Scale X:
    scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    # 
    model = SVC(kernel='rbf', C=C, gamma=gamma)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)   


    confusion = skmetrics.confusion_matrix(y_test, y_pred)
    TP = confusion[1,1]
    TN = confusion[0,0]
    FP = confusion[0,1]
    FN = confusion[1,0]
    accuracy = skmetrics.accuracy_score(y_test, y_pred)
    recall = skmetrics.recall_score(y_test,y_pred)

    precision = skmetrics.precision_score(y_test, y_pred)
    f1 = 2*precision*recall/(precision+recall)
    metrics = np.array([ f1, precision, recall, accuracy, TN, FP, FN, TP])
    return metrics


def combine_selections(trainSet, univariate, RFE, lasso, rfb_kernel=True):
    # scale Trainset:
    trainSet["data"] = preprocessing.minmax_scale(trainSet["data"])
    
    col_labels = ["f1", "precision", "recall", "accuracy", "TN", "FP", "FN", "TP", "n", "selection", "source", "gamma", "C"]
    
    uni_selections = getTopFeatures(trainSet, univariate)
    RFE_selections = getRFE_ranking(trainSet, RFE)
       
    selections = np.concatenate((uni_selections, RFE_selections), axis=1)
    
    print "starting with no select at ", time.ctime()
    
    if rfb_kernel:
        gamma, C = compute_SVM_params(trainSet, range(0,68))
        combination = [[0.0, 0.0, 0.0, 0.0, 0,0,0,0, 68, range(0,68), 'no selection', gamma, C]]
    else:
        C = compute_linearSVM_params(trainSet, range(0,68))
        combination = [[0.0, 0.0, 0.0, 0.0, 0,0,0,0, 68, range(0,68), 'no selection', 0.0, C]]
    
    source = ["f-test", "ranksum", "chi2", "mi", "combined", "RFE"]
    
    print "univariate and RFE at", time.ctime()
    for i, sel in enumerate(selections.T):
        print "starting selection ", source[i], " at ", time.ctime()        
        for n in range(1, 68):
            if rfb_kernel:
                gamma, C = compute_SVM_params(trainSet, sel[:n])
                combination.append([0.0, 0.0, 0.0, 0.0, 0,0,0,0, n, sel[:n], source[i], gamma, C])
            else:
                C = compute_linearSVM_params(trainSet, sel[:n])
                combination.append([0.0, 0.0, 0.0, 0.0, 0,0,0,0, n, sel[:n], source[i], 0.0, C])
                
    lasso_selections = load(lasso)
    
    print "now the lasso at", time.ctime()
    for sel in lasso_selections[:-1]:  # exclude the noSelections
        if rfb_kernel:
            gamma, C = compute_SVM_params(trainSet, sel)
            combination.append([0.0, 0.0, 0.0, 0.0, 0,0,0,0, len(sel), sel, "Lasso", gamma, C])
        else: 
            C = compute_linearSVM_params(trainSet, range(0,68))
            combination.append([0.0, 0.0, 0.0, 0.0, 0,0,0,0, 68, range(0,68), 'Lasso', 0.0, C])

    print "done. saving to file... ", time.ctime()
    df = pd.DataFrame(combination, columns = col_labels)
    #make sure all the selections are lists (instead of numpy arrays!!)
    df['selection'] = df['selection'].apply(lambda x: x.tolist() if type(x).__module__ == np.__name__ else x)

    df.to_csv("SVM_params.csv", sep='\t')
    
    return df
    
    
def compute_SVM_params(trainSet, selection): 
    #gamma_range = np.logspace(-7,2,10)
    #C_range = np.logspace(-2,3,6)
    gamma_range = np.logspace(-2,3,6)
    C_range = np.logspace(-2,3,6)

    #C_range = np.logspace(-3,10,14, base=2) #baaaad results!
    
    X_train = get_selectedFeatures(trainSet["data"], selection)
    y = trainSet["target"]
    
    tuning_params = [{'kernel': ['rbf'], 'gamma': gamma_range, 'C':  C_range}]
    f1 = make_scorer(f1_score)
    
    clf = GridSearchCV(SVC(C=1), tuning_params, cv=5, scoring=f1, n_jobs=4)
    clf.fit(X_train, y)
    
    # parameters for best training result
    C = clf.best_params_['C']
    gamma =  clf.best_params_['gamma']

    return gamma, C


#__________ LINEAR SVC____________

def compute_linearSVM_params(trainSet, selection): 
    tuning_linear = [ {'loss': ['hinge'], 'C': [1.0, 20, 100, 200, 1000, 2000]},]
    
    X_train = get_selectedFeatures(trainSet["data"], selection)
    y = trainSet["target"]
    
    f1 = make_scorer(f1_score)
    
    clf = GridSearchCV(LinearSVC(), tuning_linear, cv=5, scoring=f1)
    clf.fit(X_train, y)
    
    # parameters for best training result
    C = clf.best_params_['C']

    return C


def get_metrics_linearSVM(X_train, y_train, X_test, y_test, C=1.0):
    
    # Scale X:
    scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    # 
    model = LinearSVC(C=C, loss='hinge')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)   


    confusion = skmetrics.confusion_matrix(y_test, y_pred)
    TP = confusion[1,1]
    TN = confusion[0,0]
    FP = confusion[0,1]
    FN = confusion[1,0]
    accuracy = skmetrics.accuracy_score(y_test, y_pred)
    recall = skmetrics.recall_score(y_test,y_pred)

    precision = skmetrics.precision_score(y_test, y_pred)
    f1 = 2*precision*recall/(precision+recall)
    metrics = np.array([ f1, precision, recall, accuracy, TN, FP, FN, TP])
    return metrics

# TODO!
- ich brauche eine SVM RANKS DATAFRAME, wo ich einfach ALLE SELECTIONS zusammenschmeiss
- fürs plotten, da muss ich halt a neue funktion schreiben aber ich kann mir alle datensaetze von zB ranksum holen, dann sortier ich nach n und das sind meine Punkte fuer den Plot, viel intelligenter, ois beinaund

# next step
- Dataframe für SVM erstellen wo alle Selections zusammen sind, mit zusätzliche Spalten: C, gamma
- vielleicht sollte ich gleich auch Spalten machen für die Metrics und daweil halt NaN hineinschreiben oder 0.0 bzw. -1.0

In [7]:

m_dev = get_dataset("Datasets/M_DEV_dataset.pickle")
w_dev = get_dataset("Datasets/W_DEV_dataset.pickle")
m_val = get_dataset("Datasets/M_VAL_dataset.pickle")
w_val = get_dataset("Datasets/W_VAL_dataset.pickle")

m_uni = 'Datasets/M_SELECTIONS_univariate.csv'
w_uni = 'Datasets/W_SELECTIONS_univariate.csv'


m_rfe = 'Datasets/M_SELECTIONS_RFE.csv'
w_rfe = 'Datasets/W_SELECTIONS_RFE.csv'



m_sel_lasso = "Datasets/M_SELECTIONS_lasso.pickle"
w_sel_lasso = "Datasets/W_SELECTIONS_lasso.pickle"


In [5]:
import warnings
warnings.filterwarnings('ignore')
df = combine_selections(m_dev, m_uni, m_rfe, m_sel_lasso)

starting with no select at  Sun May 21 19:45:52 2017
univariate and RFE at Sun May 21 19:45:57 2017
now the lasso at Sun May 21 20:10:19 2017
done. saving to file...  Sun May 21 20:11:33 2017


In [60]:
new_df = featureSelectionResults_SVM(m_dev, m_val, m_uni, m_rfe, m_sel_lasso, "Datasets/M_SVM_params.csv")
new_df

starting with no select at  Mon May 22 17:00:25 2017
univariate and RFE at Mon May 22 17:00:31 2017


Process PoolWorker-6490:
Process PoolWorker-6491:
Process PoolWorker-6492:
Process PoolWorker-6489:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
    self.run()
    self.run()
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run
    self.run()
  File "/System/Library/Frameworks/Python.frame

KeyboardInterrupt: 

In [55]:
new_df.loc[new_df['C'] == 0.1]

Unnamed: 0,f1,precision,recall,accuracy,TN,FP,FN,TP,n,selection,source,gamma,C
50,0.79562,0.746575,0.851562,0.782946,93.0,37.0,19.0,109.0,4,"[40, 1, 59, 24]",RFE,100.0,0.1
352,0.760148,0.72028,0.804688,0.748062,90.0,40.0,25.0,103.0,12,"[40, 1, 59, 24, 46, 34, 54, 42, 61, 56, 58, 11]",RFE,10.0,0.1
368,0.753846,0.742424,0.765625,0.751938,96.0,34.0,30.0,98.0,15,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59, 60...",combined,10.0,0.1
384,0.742188,0.742188,0.742188,0.744186,97.0,33.0,33.0,95.0,13,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59, 60...",combined,10.0,0.1
385,0.738095,0.75,0.726562,0.744186,99.0,31.0,35.0,93.0,11,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59]",combined,10.0,0.1
388,0.736434,0.730769,0.742188,0.736434,95.0,35.0,33.0,95.0,9,"[1, 43, 10, 11, 40, 26, 59, 35, 56]",f-test,10.0,0.1
390,0.735178,0.744,0.726562,0.74031,98.0,32.0,35.0,93.0,12,"[43, 1, 10, 40, 19, 18, 56, 12, 26, 20, 59, 60]",combined,10.0,0.1
391,0.733591,0.725191,0.742188,0.732558,94.0,36.0,33.0,95.0,8,"[1, 43, 10, 11, 40, 26, 59, 35]",f-test,10.0,0.1
392,0.729242,0.677852,0.789062,0.709302,82.0,48.0,27.0,101.0,7,"[43, 1, 10, 62, 60, 40, 35]",mi,10.0,0.1
396,0.722045,0.610811,0.882812,0.662791,58.0,72.0,15.0,113.0,1,[1],f-test,1.0,0.1


In [8]:
selections_df = pd.read_csv("SVM_params.csv", sep='\t', index_col=0)

In [None]:
selections_df

In [59]:
w_df = featureSelectionResults_SVM(w_dev, w_val, w_uni, w_rfe, w_sel_lasso, "Datasets/W_SVM_params.csv")
w_df

starting with no select at  Sun May 21 23:22:54 2017
univariate and RFE at Sun May 21 23:27:32 2017
now the lasso at Mon May 22 16:21:48 2017
done. saving to file...  Mon May 22 16:50:27 2017


Unnamed: 0,f1,precision,recall,accuracy,TN,FP,FN,TP,n,selection,source,gamma,C
0,0.850985,0.922563,0.789714,0.861397,813.0,58.0,184.0,691.0,6,"[60, 4, 61, 24, 0, 62]",f-test,10.0,100.0
1,0.849969,0.915567,0.793143,0.859679,807.0,64.0,181.0,694.0,4,"[60, 4, 61, 24]",f-test,100.0,1.0
2,0.849601,0.917772,0.790857,0.859679,809.0,62.0,183.0,692.0,5,"[60, 4, 61, 24, 0]",f-test,10.0,100.0
3,0.849231,0.920000,0.788571,0.859679,811.0,60.0,185.0,690.0,7,"[60, 4, 61, 24, 0, 62, 33]",f-test,1.0,100.0
4,0.833230,0.910569,0.768000,0.845934,805.0,66.0,203.0,672.0,2,"[60, 24]",RFE,100.0,1000.0
5,0.829487,0.944526,0.739429,0.847652,833.0,38.0,228.0,647.0,3,"[60, 4, 61]",f-test,100.0,100.0
6,0.806283,0.943338,0.704000,0.830470,834.0,37.0,259.0,616.0,3,"[60, 56, 40]",mi,100.0,100.0
7,0.806283,0.943338,0.704000,0.830470,834.0,37.0,259.0,616.0,3,"[60, 56, 40]",ranksum,100.0,100.0
8,0.803170,0.951487,0.694857,0.829324,840.0,31.0,267.0,608.0,1,[60],f-test,100.0,1000.0
9,0.803170,0.951487,0.694857,0.829324,840.0,31.0,267.0,608.0,1,[60],mi,100.0,1000.0


In [5]:
import warnings
warnings.filterwarnings('ignore')
w_df2 = featureSelectionResults_SVM(w_dev, w_val, w_uni, w_rfe, w_sel_lasso, "Datasets/236W_SVM_params.csv")
w_df2

starting with no select at  Mon May 22 21:34:22 2017
univariate and RFE at Mon May 22 21:37:24 2017
starting selection  f-test  at  Mon May 22 21:37:24 2017
starting selection  ranksum  at  Mon May 22 23:22:56 2017
starting selection  chi2  at  Tue May 23 01:02:07 2017
starting selection  mi  at  Tue May 23 02:39:19 2017
starting selection  combined  at  Tue May 23 04:18:53 2017
starting selection  RFE  at  Tue May 23 05:59:48 2017
now the lasso at Tue May 23 07:42:53 2017
done. saving to file...  Tue May 23 07:59:19 2017


Unnamed: 0,f1,precision,recall,accuracy,TN,FP,FN,TP,n,selection,source,gamma,C
0,0.850985,0.922563,0.789714,0.861397,813.0,58.0,184.0,691.0,6,"[60, 4, 61, 24, 0, 62]",f-test,10.0,100.0
1,0.849601,0.917772,0.790857,0.859679,809.0,62.0,183.0,692.0,5,"[60, 4, 61, 24, 0]",f-test,10.0,100.0
2,0.849231,0.920000,0.788571,0.859679,811.0,60.0,185.0,690.0,7,"[60, 4, 61, 24, 0, 62, 33]",f-test,1.0,100.0
3,0.846154,0.892269,0.804571,0.853379,786.0,85.0,171.0,704.0,4,"[60, 4, 61, 24]",f-test,1000.0,1.0
4,0.837096,0.897906,0.784000,0.847079,793.0,78.0,189.0,686.0,2,"[60, 24]",RFE,1000.0,1000.0
5,0.837068,0.890464,0.789714,0.845934,786.0,85.0,184.0,691.0,4,"[60, 56, 40, 24]",mi,1000.0,1000.0
6,0.829487,0.944526,0.739429,0.847652,833.0,38.0,228.0,647.0,3,"[60, 4, 61]",f-test,100.0,100.0
7,0.806283,0.943338,0.704000,0.830470,834.0,37.0,259.0,616.0,2,"[60, 40]",combined,1000.0,10.0
8,0.806283,0.943338,0.704000,0.830470,834.0,37.0,259.0,616.0,1,[40],chi2,1000.0,10.0
9,0.806283,0.943338,0.704000,0.830470,834.0,37.0,259.0,616.0,3,"[60, 56, 40]",ranksum,1000.0,1000.0


In [15]:
import warnings
warnings.filterwarnings('ignore')
m_df2 = featureSelectionResults_SVM(m_dev, m_val, m_uni, m_rfe, m_sel_lasso, "Datasets/M_linearSVM_params.csv", rfb_kernel=False)
m_df2

starting with no select at  Tue May 23 13:15:08 2017
univariate and RFE at Tue May 23 13:15:09 2017
starting selection  f-test  at  Tue May 23 13:15:09 2017
starting selection  ranksum  at  Tue May 23 13:15:55 2017
starting selection  chi2  at  Tue May 23 13:16:51 2017
starting selection  mi  at  Tue May 23 13:17:35 2017
starting selection  combined  at  Tue May 23 13:18:26 2017
starting selection  RFE  at  Tue May 23 13:19:14 2017
now the lasso at Tue May 23 13:19:56 2017
done. saving to file...  Tue May 23 13:20:21 2017


Unnamed: 0,f1,precision,recall,accuracy,TN,FP,FN,TP,n,selection,source,gamma,C
0,0.779783,0.724832,0.843750,0.763566,89.0,41.0,20.0,108.0,26,"[40, 1, 59, 24, 46, 34, 54, 42, 61, 56, 58, 11...",RFE,0.0,100.0
1,0.774908,0.734266,0.820312,0.763566,92.0,38.0,23.0,105.0,23,"[40, 1, 59, 24, 46, 34, 54, 42, 61, 56, 58, 11...",RFE,0.0,200.0
2,0.772201,0.763359,0.781250,0.771318,99.0,31.0,28.0,100.0,24,"[40, 1, 59, 24, 46, 34, 54, 42, 61, 56, 58, 11...",RFE,0.0,20.0
3,0.762590,0.706667,0.828125,0.744186,86.0,44.0,22.0,106.0,25,"[40, 1, 59, 24, 46, 34, 54, 42, 61, 56, 58, 11...",RFE,0.0,100.0
4,0.758621,0.744361,0.773438,0.755814,96.0,34.0,29.0,99.0,33,"[40, 1, 59, 24, 46, 34, 54, 42, 61, 56, 58, 11...",RFE,0.0,100.0
5,0.753994,0.637838,0.921875,0.701550,63.0,67.0,10.0,118.0,52,"[43, 1, 10, 62, 60, 40, 35, 19, 24, 26, 56, 12...",mi,0.0,20.0
6,0.742857,0.625668,0.914062,0.686047,60.0,70.0,11.0,117.0,57,"[1, 43, 10, 11, 26, 59, 35, 19, 12, 18, 20, 24...",ranksum,0.0,20.0
7,0.742857,0.625668,0.914062,0.686047,60.0,70.0,11.0,117.0,58,"[1, 43, 10, 11, 26, 59, 35, 19, 12, 18, 20, 24...",ranksum,0.0,20.0
8,0.740214,0.679739,0.812500,0.717054,81.0,49.0,24.0,104.0,46,"[43, 1, 10, 62, 60, 40, 35, 19, 24, 26, 56, 12...",mi,0.0,20.0
9,0.739812,0.617801,0.921875,0.678295,57.0,73.0,10.0,118.0,57,"[43, 10, 1, 18, 20, 19, 40, 34, 8, 16, 3, 56, ...",chi2,0.0,20.0


In [16]:
wl_df2 = featureSelectionResults_SVM(w_dev, w_val, w_uni, w_rfe, w_sel_lasso, "Datasets/W_linearSVM_params.csv", rfb_kernel=False)


starting with no select at  Tue May 23 13:22:22 2017
univariate and RFE at Tue May 23 13:22:29 2017
starting selection  f-test  at  Tue May 23 13:22:29 2017
starting selection  ranksum  at  Tue May 23 13:27:48 2017
starting selection  chi2  at  Tue May 23 13:33:37 2017
starting selection  mi  at  Tue May 23 13:38:44 2017
starting selection  combined  at  Tue May 23 13:44:24 2017
starting selection  RFE  at  Tue May 23 13:49:54 2017
now the lasso at Tue May 23 13:55:09 2017
done. saving to file...  Tue May 23 13:57:03 2017


NameError: name 'ml_df2' is not defined

In [17]:
wl_df2

Unnamed: 0,f1,precision,recall,accuracy,TN,FP,FN,TP,n,selection,source,gamma,C
0,0.845528,0.933702,0.772571,0.858534,823.0,48.0,199.0,676.0,7,"[60, 56, 40, 24, 51, 61, 7]",mi,0.0,1000
1,0.844527,0.926330,0.776000,0.856816,817.0,54.0,196.0,679.0,6,"[60, 4, 61, 24, 0, 62]",f-test,0.0,200
2,0.841772,0.943262,0.760000,0.856816,831.0,40.0,210.0,665.0,7,"[60, 4, 61, 24, 0, 62, 33]",f-test,0.0,200
3,0.841640,0.939437,0.762286,0.856243,828.0,43.0,208.0,667.0,5,"[60, 4, 61, 24, 0]",f-test,0.0,100
4,0.836915,0.936351,0.756571,0.852234,826.0,45.0,213.0,662.0,6,"[60, 56, 40, 24, 51, 61]",mi,0.0,1000
5,0.835459,0.945166,0.748571,0.852234,833.0,38.0,220.0,655.0,4,"[60, 4, 61, 24]",f-test,0.0,200
6,0.830069,0.927966,0.750857,0.845934,820.0,51.0,218.0,657.0,4,"[60, 56, 40, 24]",mi,0.0,1000
7,0.823072,0.950599,0.725714,0.843643,838.0,33.0,240.0,635.0,2,"[60, 24]",RFE,0.0,1000
8,0.821685,0.958841,0.718857,0.843643,844.0,27.0,246.0,629.0,3,"[60, 4, 61]",f-test,0.0,100
9,0.808271,0.894591,0.737143,0.824742,795.0,76.0,230.0,645.0,2,"[40, 34]",chi2,0.0,1000
