In [39]:
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
import numpy as np
import pandas as pd
import scipy
from scipy import stats
from setupDataset import get_dataset
from feature_vector import get_feature_names



In [40]:
# Feature names from the project helper. compute_scores uses len(fnames) as
# its loop bound and fnames as the row labels of the CSV it writes.
fnames = get_feature_names()

In [41]:
# Quick check of how many feature names were loaded.
len(fnames)

68

# f-test, chi2, mutual information
Input: X, y  (X: die Feature-Vektoren in einem Array; y: das zu X passende Target)

<hr>
per Feature:
# Wilcoxon Rank-Sum Test
Input: good, bad (sprich pro Array die Samples einer Kategorie)

# Correlation Coefficient
Input: feature, target (ergo: alle Samples einer Feature Spalte, das zugehörige Target)

In [46]:
def compute_scores(dataset, out, random_state=None):
    """Compute per-feature relevance scores and write them to a TSV file.

    For every feature the following are computed against the target:
    the ANOVA F-score (``f_classif``), the Wilcoxon rank-sum statistic
    between the two classes (``stats.ranksums``), the chi-squared statistic
    (``chi2``) and the mutual information (``mutual_info_classif``).
    Absolute values are stored, so larger always means "more relevant".

    Parameters
    ----------
    dataset : mapping
        Must provide ``dataset["data"]`` (the feature matrix, indexed as
        ``X[:, i]``) and ``dataset["target"]`` (the labels; samples with
        label 1 are treated as "good", label 0 as "bad").
    out : str or file-like
        Destination passed to ``DataFrame.to_csv``; the table is written
        tab-separated with one row per feature name.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``mutual_info_classif``.  The default ``None`` keeps the
        previous (unseeded) behaviour; pass an int for reproducible MI scores.

    Returns
    -------
    numpy.ndarray
        One row per entry of the module-level ``fnames``, with the columns
        ``[index, f, ranksum, chi, mi]``.

    Notes
    -----
    The number of rows and the row labels come from the module-level
    ``fnames``, not from the width of the feature matrix.
    """
    X = dataset["data"]
    y = dataset["target"]

    # Only the statistics are kept; the accompanying p-values are discarded.
    f_test_scores, _ = f_classif(X, y)
    mi = mutual_info_classif(X, y, random_state=random_state)
    chi_scores, _ = chi2(X, y)

    # Class membership does not depend on the feature, so look it up once
    # instead of once per loop iteration.
    indices_good = np.where(y == 1)[0]
    indices_bad = np.where(y == 0)[0]

    data = []
    for i in range(len(fnames)):
        column = X[:, i]
        good = column[indices_good]
        bad = column[indices_bad]
        ranksum, _ = stats.ranksums(good, bad)  # Wilcoxon rank-sum test

        data.append([i, abs(f_test_scores[i]), abs(ranksum), abs(chi_scores[i]), abs(mi[i])])

    row_label = fnames
    col_label = ["index", "f", "ranksum", "chi", "mi"]
    data = np.array(data)
    df = pd.DataFrame(data, index=row_label, columns=col_label)
    df.to_csv(out, sep='\t')

    return data

def getTopFeatures(data):
    """Rank features per score column and by the sum of their per-score ranks.

    Parameters
    ----------
    data : 2-D array-like, shape (n_features, 1 + n_scores)
        Column 0 holds the feature index of each row; every further column
        holds one score per feature, where a larger value means a better
        feature.  Rows are expected in ascending feature-index order, which
        is how ``compute_scores`` builds them.

    Returns
    -------
    numpy.ndarray, shape (n_features, n_scores + 2)
        Column 0 is a copy of the feature indices.  Column ``k`` (one for
        each score column ``k`` of ``data``) lists the feature indices
        ordered best-first by that score.  The last column lists the feature
        indices ordered by ascending rank sum over all scores, i.e. the best
        overall feature comes first.

    Notes
    -----
    NaN scores are ranked last.  A plain ``argsort()[::-1]`` would put them
    first, which promotes features whose score could not be computed (for
    instance constant features) to the top of the ranking.

    Every score column present in ``data`` is ranked; for the five-column
    layout produced by ``compute_scores`` this is the same set of columns
    (1 to 4) that used to be hard-coded.
    """
    data = np.asarray(data, dtype=float)
    n_rows, n_cols = data.shape

    ranks = np.zeros(shape=(n_rows, n_cols + 1))
    ranks[:, 0] = data[:, 0]

    positions = np.arange(1, n_rows + 1)  # rank values 1..n, best first
    rank_sums = np.zeros(n_rows)  # per-row sum of ranks over all scores

    for col in range(1, n_cols):
        scores = data[:, col]
        # Undefined scores must lose against every real score.
        scores = np.where(np.isnan(scores), -np.inf, scores)

        order = scores.argsort()[::-1]  # row positions, best score first
        ranks[:, col] = data[order, 0]  # feature indices, best first
        rank_sums[order] += positions  # `order` is a permutation: no repeats

    # Lowest rank sum wins: feature indices sorted by overall rank.
    ranks[:, -1] = data[rank_sums.argsort(), 0]

    return ranks
    
        
# Load the dataset and write the per-feature score table to disk.
dataset = get_dataset("dataset.pickle")
scores = compute_scores(dataset, 'scores-dataset09indices.csv')

# Turn the raw scores into best-first feature rankings.
ranks = getTopFeatures(scores)

# usage: the first 30 entries of each ranking column
# (columns 1-4 follow the score order f, ranksum, chi, mi; the last column
# is the ranking by summed ranks over all scores)
best30_features_fscore = ranks[:30, 1]
best30_features_ranksum = ranks[:30, 2]
best30_features_chi = ranks[:30, 3]
best30_features_mi = ranks[:30, 4]
best30_features_allscores = ranks[:30, -1]


 UserWarning: Features [25] are constant.
 Bedeutet, dass das Feature mit Index 25 (m-dash count) konstant und daher unbrauchbar ist.