In [1]:
from Utils.setupDataset import get_dataset, combine_datasets
from Utils.split_dataset import separateByCategory, get_testSet_validationSet, getTestSets
from Utils.feature_ranking import getTopFeatures
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as skmetrics
from Utils.selection_metrics import SelectionMetrics

In [2]:
# Load the two source datasets (pickle files, going by the extension).
# NOTE(review): the paths are relative, so this presumably has to be run from
# the directory that contains "Datasets/" — confirm.
m_dataset = get_dataset("Datasets/dataset.pickle")
w_dataset = get_dataset("Datasets/wiki_dataset.pickle")

# Merge both datasets into the single dataset used by the evaluation cell.
c_dataset = combine_datasets(m_dataset, w_dataset)


In [None]:


def get_selectedFeatures(X, indices=None):
    """
    Select a subset of feature columns from the data array.

    :param X: the data part of the dataset (array of feature vectors); it is
        indexed as ``X[:, indices]`` and therefore has to support
        two-dimensional NumPy-style indexing
    :param indices: sequence of (feature column) indices to keep, in the order
        they should appear in the result. ``None`` (the default) selects no
        columns, which is what the former ``[]`` default produced.
    :return: numpy array containing only the selected columns
    """
    if indices is None:
        # A fresh, empty integer index replaces the shared mutable ``[]``
        # default while keeping the "no columns selected" result.
        indices = np.empty(0, dtype=np.intp)

    # select column indices
    return X[:, indices]

def concat_datasetList(setlist):
    """
    Join the "data" and "target" parts of several dataset chunks.

    :param setlist: non-empty sequence of dicts, each holding a "data" and a
        "target" entry that ``np.concatenate`` can join along the first axis
    :return: tuple ``(data, target)`` of newly allocated arrays with the
        chunks joined in list order (the inputs are never returned as-is,
        not even when ``setlist`` has a single element)
    :raises IndexError: if ``setlist`` is empty
    """
    if len(setlist) == 0:
        # Same exception type the former ``setlist[0]`` lookup raised, but
        # with a message that names the actual problem.
        raise IndexError("concat_datasetList() needs at least one dataset")

    # One concatenate call per field copies every chunk exactly once; growing
    # the result pairwise in a loop re-copied the accumulated array each time.
    data = np.concatenate([chunk["data"] for chunk in setlist])
    target = np.concatenate([chunk["target"] for chunk in setlist])

    return data, target

def get_train_test_set(setlist, index):
    """
    Build one leave-one-chunk-out split.

    The chunk at position ``index`` becomes the test set; all other chunks
    are merged with ``concat_datasetList`` to form the training set.

    NOTE(review): with ``index == -1`` the second slice is ``setlist[0:]``,
    so the held-out chunk would also end up in the training data. The only
    caller visible here passes indices from ``enumerate``.

    :param setlist: list of dataset dicts with "data" and "target" entries
    :param index: position of the chunk to hold out for testing
    :return: tuple ``(X_train, y_train, X_test, y_test)``
    """
    training_chunks = setlist[:index] + setlist[index + 1:]
    X_train, y_train = concat_datasetList(training_chunks)

    held_out = setlist[index]
    return X_train, y_train, held_out["data"], held_out["target"]

def get_metrics(X_train, y_train, X_test, y_test):
    """
    Fit a logistic regression on the training data and score it on the test data.

    The confusion matrix is read as a 2x2 matrix, so the labels are expected
    to be binary.

    :param X_train: training feature vectors
    :param y_train: training labels
    :param X_test: test feature vectors
    :param y_test: true labels of the test vectors
    :return: numpy array ``[f1, precision, recall, accuracy, TN, FP, FN, TP]``;
        ``f1`` is 0.0 when precision and recall are both zero
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # scikit-learn puts the true labels on the rows and the predicted labels
    # on the columns of the confusion matrix.
    confusion = skmetrics.confusion_matrix(y_test, y_pred)
    TN, FP = confusion[0, 0], confusion[0, 1]
    FN, TP = confusion[1, 0], confusion[1, 1]

    accuracy = skmetrics.accuracy_score(y_test, y_pred)
    recall = skmetrics.recall_score(y_test, y_pred)
    precision = skmetrics.precision_score(y_test, y_pred)

    # F1 is the harmonic mean of precision and recall. If both are zero the
    # formula becomes 0/0 (NaN for numpy scalars, ZeroDivisionError for plain
    # floats); a NaN here would poison any sum or average built from these
    # metrics, so report 0.0 for that case instead.
    denominator = precision + recall
    if denominator == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / denominator

    return np.array([f1, precision, recall, accuracy, TN, FP, FN, TP])

def get_metrics2ss(X_train, y_train, X_test, y_test):
    """
    Fit a logistic regression and pass its test predictions to ``compute_metrics``.

    NOTE(review): ``compute_metrics`` is neither defined nor imported in the
    visible part of this notebook, so calling this function presumably fails
    with ``NameError`` unless the name is supplied elsewhere — confirm, or
    remove this function. Nothing visible here calls it.

    :param X_train: training feature vectors
    :param y_train: training labels
    :param X_test: test feature vectors
    :param y_test: true labels of the test vectors
    :return: whatever ``compute_metrics(y_test, y_pred)`` returns
    """
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return compute_metrics(y_test, predictions)

def metrics_feature_selection(ranks, X_train, y_train, X_test, y_test, n):
    """
    Score one train/test chunk without and with each feature selection.

    :param ranks: feature rankings; iterated column by column via ``ranks.T``,
        and the first ``n`` entries of each column are used as feature column
        indices (presumably ordered best feature first — TODO confirm)
    :param X_train: training feature vectors
    :param y_train: training labels
    :param X_test: test feature vectors
    :param y_test: true labels of the test vectors
    :param n: number of leading entries of each ranking to select
    :return: list of ``get_metrics`` results; the first entry is the baseline
        on all features, followed by one entry per ranking column
    """
    # Baseline: no feature selection at all
    results = [get_metrics(X_train, y_train, X_test, y_test)]

    # Transposing makes the loop walk over the columns, i.e. one ranking per step
    for ranking in ranks.T:
        top_n = ranking[:n]
        # The same columns have to be selected for training and for testing
        train_selected = get_selectedFeatures(X_train, top_n)
        test_selected = get_selectedFeatures(X_test, top_n)
        results.append(get_metrics(train_selected, y_train, test_selected, y_test))

    return results




def featureSelectionResults(dataset, file='rank_selections.cvs', n=25):
    """
    Cross-validate every feature ranking and collect the averaged metrics.

    Each chunk returned by ``getTestSets`` serves once as the test set while
    the remaining chunks are used for training. Metrics are summed over all
    chunks; afterwards the first four entries (f1, precision, recall,
    accuracy) are divided by the number of chunks, while the last four
    (TN, FP, FN, TP) are divided by 1 and therefore stay totals.

    NOTE(review): the default extension ``.cvs`` looks like a typo for
    ``.csv`` — confirm against the files on disk before changing it.

    :param dataset: dataset passed to ``getTopFeatures`` and ``separateByCategory``
    :param file: file name handed to ``getTopFeatures``
    :param n: number of leading entries of each ranking to select
    :return: ``SelectionMetrics`` built from the averaged metrics, the ranks and ``n``
    """
    # 1. get the feature rankings
    ranks = getTopFeatures(dataset, file)

    # 2. split into chunks; get_testSet_validationSet is not used because no
    #    separate validation set is needed at the moment (to be confirmed)
    good, bad = separateByCategory(dataset)
    folds = getTestSets(good, bad)  # per the original note: a list of k datasets ({"data": ..., "target": ...})

    n_variants = len(ranks.T) + 1  # +1 for the result without feature selection
    totals = [np.zeros(8) for _ in range(n_variants)]

    for fold_index, _ in enumerate(folds):
        # the chunk at fold_index is the test set, all others are used for training
        X_train, y_train, X_test, y_test = get_train_test_set(folds, fold_index)
        fold_metrics = metrics_feature_selection(ranks, X_train, y_train, X_test, y_test, n)
        # accumulate sums; the averaging happens once after the loop
        totals = [totals[i] + fold_metrics[i] for i in range(n_variants)]

    # Average the four score entries, keep the four confusion counts as sums
    n_folds = len(folds)
    divisors = np.array([n_folds] * 4 + [1] * 4)
    averaged = [total / divisors for total in totals]

    return SelectionMetrics(averaged, ranks, n)


# Evaluate the first 15 features of every ranking on the combined dataset
# and turn the result into a table. `filename` is handed on to getTopFeatures.
filename = 'rankSelections_combined.csv'
sm = featureSelectionResults(c_dataset, filename, 15)
df = sm.data_frame()

In [None]:
# Inspect the 'selection' entry stored under key 1 of the results table.
# NOTE(review): this is chained indexing; if 1 is meant as an index label,
# df.loc[1, 'selection'] would be the explicit form — confirm.
df['selection'][1]

In [None]:
# Show the results ordered by f1, best first (assuming a pandas DataFrame,
# this returns a sorted copy and leaves df itself unchanged).
df.sort_values('f1', ascending=False)

In [None]:
# Display the complete results table in its original order.
df