In [14]:
from __future__ import division
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
import numpy as np
import pandas as pd
from scipy import stats
from setupDataset import get_dataset
from feature_vector import get_feature_names



In [15]:
# Feature names, kept at module level: compute_scores() reads len(fnames) to
# decide how many feature columns it scores.
fnames = get_feature_names()

In [5]:
# Sanity check: how many feature names were loaded.
len(fnames)

68

# f-test, chi2, mutual information
Input: X, y  (X... die feature vectors in einem array; y... das Target passend zu X)

NOTE: Mutual information (mi) scores will vary for each function call!!! This influences the mi ranking and therefore also the combined ("all") ranking.
<hr>
per Feature:
# Wilcoxon rank-sum test
Input: good, bad (sprich pro Array die Samples einer Kategorie)

# Correlation Coefficient
Input: feature, target (ergo: alle Samples einer Feature Spalte, das zugehörige Target)

In [16]:
def compute_scores(dataset, random_state=None):
    """Score every feature column with four univariate measures.

    Parameters
    ----------
    dataset : mapping
        Must provide ``dataset["data"]`` (feature matrix ``X``, indexed as
        ``X[:, i]`` per feature) and ``dataset["target"]`` (labels ``y``).
        Samples labelled ``1`` form the "good" group and samples labelled
        ``0`` the "bad" group for the rank-sum test.
    random_state : optional
        Forwarded to ``mutual_info_classif``.  With the default ``None`` the
        mutual information scores differ from call to call; pass a fixed seed
        to make them (and every ranking derived from them) reproducible.

    Returns
    -------
    numpy.ndarray
        One row per feature, in the order
        ``[index, |F score|, |rank-sum statistic|, |chi2|, |mutual info|]``.

    Notes
    -----
    The number of scored features is ``len(fnames)``, i.e. it is taken from
    the module-level ``fnames`` and not from the shape of ``X``.
    """
    X = dataset["data"]
    y = dataset["target"]

    f_test_scores, _ = f_classif(X, y)
    mi = mutual_info_classif(X, y, random_state=random_state)
    chi_scores, _ = chi2(X, y)

    # Class membership is the same for every feature, so the row indices of
    # both groups are computed once instead of once per loop iteration.
    labels = np.asarray(y)
    indices_good = np.where(labels == 1)[0]
    indices_bad = np.where(labels == 0)[0]

    data = []
    for i in range(len(fnames)):
        column = X[:, i]
        good = column[indices_good]
        bad = column[indices_bad]
        ranksum, _ = stats.ranksums(good, bad)  # Wilcoxon rank-sum test

        # Only the magnitude of each score is kept.
        data.append([i, abs(f_test_scores[i]), abs(ranksum), abs(chi_scores[i]), abs(mi[i])])

    return np.array(data)



def getTopFeatures(data):
    """Rank the features per score and by the sum of their per-score ranks.

    Parameters
    ----------
    data : array-like, shape (n_features, n_columns)
        Column 0 holds the feature index; every further column holds one
        score for which a larger value means a better feature
        (``compute_scores`` yields ``[index, f, ranksum, chi, mi]``).
        The row re-ordering below assumes the rows are sorted by ascending,
        unique feature index.

    Returns
    -------
    numpy.ndarray, shape (n_features, n_columns + 1)
        Column 0 is the feature index.  Column ``i`` (one per score column of
        ``data``) lists the feature indices ordered best-first by score ``i``.
        The last column lists the feature indices ordered by ascending sum of
        their per-score ranks, i.e. overall best first.

    Notes
    -----
    NaN scores (e.g. the F score of a constant feature) are ranked last.
    A plain ``argsort()[::-1]`` would put them first and thereby declare the
    unusable feature the best one.
    """
    data = np.asarray(data, dtype=float)
    n_rows, n_cols = data.shape

    ranks = np.zeros(shape=(n_rows, n_cols + 1))
    ranks[:, 0] = data[:, 0]

    rankrange = np.arange(1, n_rows + 1)  # rank 1 = best ... n_rows = worst
    sums = np.zeros(n_rows)  # summed ranks, aligned with ascending feature index

    for i in range(1, n_cols):
        column = data[:, i]

        # Descending order by score; NaN entries are moved to the end while
        # the relative order of all other entries is left untouched.
        order = column.argsort()[::-1]
        nan_mask = np.isnan(column[order])
        order = np.concatenate((order[~nan_mask], order[nan_mask]))

        ranks = ranks[order]
        best_first = ranks[:, 0].copy()  # feature indices, best first, for score i

        # Position of every feature index inside the best-first order; adding
        # 1 (via rankrange) turns that position into the feature's rank.
        by_index = ranks[:, 0].argsort()
        sums += rankrange[by_index]

        ranks = ranks[by_index]  # back to ascending feature index
        ranks[:, i] = best_first

    ranks[:, -1] = sums

    ranks = ranks[ranks[:, -1].argsort()]  # smallest rank sum first
    overall_best_first = ranks[:, 0].copy()
    ranks = ranks[ranks[:, 0].argsort()]  # back to ascending feature index
    ranks[:, -1] = overall_best_first

    return ranks
   
def __save_topScores(ranks, out):

    data = ranks[:,1:].astype(int)
    col_label = ["f", "ranksum", "chi", "mi", "all"]

    df = pd.DataFrame(data, columns=col_label)
    df.to_csv(out, sep='\t')
    print df
    return df

def __save_scores(data, out):
    """Store the raw score table as a tab-separated file and return it.

    The rows are labelled with the names from ``get_feature_names()`` and the
    columns with ``index``, ``f``, ``ranksum``, ``chi`` and ``mi``.  ``out`` is
    passed on to ``DataFrame.to_csv``.
    """
    feature_labels = get_feature_names()
    frame = pd.DataFrame(
        np.array(data),
        index=feature_labels,
        columns=["index", "f", "ranksum", "chi", "mi"],
    )
    frame.to_csv(out, sep='\t')
    return frame
    

def save_scores(dataset, out):
    """Compute the feature scores of ``dataset`` and write them to ``out``.

    ``out`` is the destination for the tab-separated score table and belongs
    to ``__save_scores``; ``compute_scores`` only receives the dataset.
    Returns the DataFrame that was written.
    """
    return __save_scores(compute_scores(dataset), out)

# Load the pickled dataset, score every feature and turn the scores into rankings.
dataset = get_dataset("dataset.pickle")
scores = compute_scores(dataset)
ranks = getTopFeatures(scores)


# usage: columns 1-4 of `ranks` list the feature indices best-first per score,
# the last column lists them by summed rank; take the first 30 of each.
best30_features_fscore = ranks[:30, 1]
best30_features_ranksum = ranks[:30, 2]
best30_features_chi = ranks[:30, 3]
best30_features_mi = ranks[:30, 4]
best30_features_allscores = ranks[:30, -1]


__save_topScores(ranks, "ranktest.csv")

     f  ranksum  chi  mi  all
0    1        1   43  59   43
1   43       43   10  12   10
2   40       10    1  10    1
3   10       26   18  61   19
4   59       11   20  60   40
5   11       35   19  43   56
6   26       12   40  19   12
7   35       59   34   1   20
8   56       19    8  56   59
9   24       62   16  62   61
10  60       24    3  33   18
11  62       20   56  24    2
12   0       18   46  20   60
13  61       60    2  58   26
14   2       40    7   8   24
15  12       56   47  35   62
16  18        2   54  22   35
17  34       52   21  46   11
18  19       23   42  11    0
19  20       61    0  16   46
20  46        9   12  40   34
21  53        0   53  26   47
22  58       21   38  23    8
23  52       55   61   2   52
24  54       51   36  38   16
25  47       13   55  47   55
26  42       33   66  51    7
27   3       50   45  32    3
28  23       48    9  13   53
29   4       32   17  18   54
..  ..      ...  ...  ..  ...
38  13       53   50  53   21
39   7    

Unnamed: 0,f,ranksum,chi,mi,all
0,1,1,43,59,43
1,43,43,10,12,10
2,40,10,1,10,1
3,10,26,18,61,19
4,59,11,20,60,40
5,11,35,19,43,56
6,26,12,40,19,12
7,35,59,34,1,20
8,56,19,8,56,59
9,24,62,16,62,61


 UserWarning: Features [25] are constant.
 Bedeutet dass Feature mit index 25 (m-dash count) konstant ist und daher unbrauchbar. 