In [1]:
from operator import itemgetter
import matplotlib.pyplot as plt

def Pareto(data, labels=[], cumPlot=True, limit=1.0, axes=None):
    """Draw a Pareto chart: bars sorted largest-first plus a cumulative-percentage line.

    Parameters
    ----------
    data : iterable of numbers
        Bar heights.  The iterable is copied into a list and sorted in
        descending order.
    labels : sequence, optional
        One tick label per data value.  When the number of labels does not
        match the number of values (including the default empty list), the
        original positions ``0..n-1`` are used as labels instead.  The
        default list is never mutated.
    cumPlot : bool, optional
        When true, a twin y-axis carrying the cumulative percentage line is
        added and the left axis is scaled so that its top equals the total.
    limit : float, optional
        Fraction in ``[0.0, 1.0]``.  Bars whose cumulative share is below
        ``limit`` are kept, plus the one bar that reaches it.  When
        ``limit < 1.0`` and ``cumPlot`` is true, a dashed red horizontal line
        marks the limit on the percentage axis.
    axes : matplotlib Axes, optional
        Axes to draw on.  When omitted, the current pyplot figure and axes
        are used.

    Returns
    -------
    tuple
        ``(fig, ax1, ax2)`` when ``cumPlot`` is true, otherwise ``(fig, ax1)``.

    Raises
    ------
    AssertionError
        If ``limit`` is outside ``[0.0, 1.0]``.
    ValueError
        If ``data`` is empty or sums to zero (no percentages can be computed).
    """
    assert 0.0 <= limit <= 1.0, 'limit must be a positive scalar between 0.0 and 1.0'

    data = list(data)
    n = len(data)
    if n == 0:
        raise ValueError('data must contain at least one value')
    labels = [] if labels is None else list(labels)
    if len(labels) != n:
        labels = list(range(n))

    # re-order the data in descending order (the sort is stable, so equal
    # values keep their original relative order)
    ordered = sorted(zip(data, labels), key=itemgetter(0), reverse=True)
    ordData = [dat for dat, _ in ordered]
    ordLabels = [lab for _, lab in ordered]

    # create the cumulative line data with a single running total
    running = 0
    partial_sums = []
    for dat in ordData:
        running += dat
        partial_sums.append(running)
    total_data = float(running)
    if total_data == 0.0:
        raise ValueError('data must not sum to zero')
    line_data = [partial / total_data for partial in partial_sums]

    # determine where the data will be trimmed based on the limit: keep every
    # bar below the limit plus the bar that reaches it, never more than n bars
    below = sum(1 for ld in line_data if ld < limit)
    shown = min(below + 1, n)
    limLoc = list(range(shown))
    limData = ordData[:shown]
    limLabels = ordLabels[:shown]
    limLine = line_data[:shown]

    # if axes is specified, make it current and use its figure; otherwise
    # draw on the current pyplot figure/axes
    if axes is not None:
        plt.sca(axes)
        ax1 = axes
        fig = plt.gcf()
    else:
        fig = plt.gcf()
        ax1 = plt.gca()

    # create the second axis for the cumulative percentage line
    ax2 = ax1.twinx() if cumPlot else None

    # plotting
    ax1.bar(limLoc, limData, align='center', width=0.9)
    if cumPlot:
        ax2.plot(limLoc, [ld * 100 for ld in limLine], 'g')
    ax1.set_xticks(limLoc)
    ax1.set_xlim(-0.5, shown - 0.5)

    # formatting
    if cumPlot:
        # the sum-total is unlikely to be one of the automatic ticks, so make
        # it the top-most one; ticks outside [0, total) are dropped so the
        # total is not duplicated and the view limits are not widened
        ax1.set_ylim(0, total_data)
        kept = [tick for tick in ax1.get_yticks() if 0 <= tick < total_data]
        ax1.set_yticks(kept + [total_data])
        ax2.set_ylim(0, 100)
        if limit < 1.0:
            # axhline's xmin/xmax are axes fractions (0..1), not data
            # coordinates; the defaults already span the full axes width
            ax2.axhline(limit * 100, linestyle='--', color='r')

    # set the x-axis labels
    ax1.set_xticklabels(limLabels)

    # label the second axis in percent; pin the tick positions first so the
    # fixed labels cannot drift away from the ticks they describe
    if cumPlot:
        pct_ticks = [tick for tick in ax2.get_yticks() if 0 <= tick <= 100]
        ax2.set_yticks(pct_ticks)
        ax2.set_yticklabels([str(int(tick)) + r'%' for tick in pct_ticks])

    if cumPlot:
        return fig, ax1, ax2
    return fig, ax1

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier


# Feature importance from a fitted extra-trees classifier
def FeatureImportance(X, Y, random_state=None):
    """Fit an extra-trees classifier and return its per-feature importances.

    Parameters
    ----------
    X : array-like
        Training inputs, passed unchanged to ``ExtraTreesClassifier.fit``.
    Y : array-like
        Target values, passed unchanged to ``ExtraTreesClassifier.fit``.
    random_state : optional
        Forwarded to ``ExtraTreesClassifier``.  The default ``None`` keeps
        the original unseeded behaviour; pass an integer to make repeated
        runs return the same importances.

    Returns
    -------
    The fitted model's ``feature_importances_`` attribute.
    """
    model = ExtraTreesClassifier(random_state=random_state)
    model.fit(X, Y)
    return model.feature_importances_



# Feature Selection with Univariate Statistical Tests (Chi-squared for classification)
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Read the diabetes CSV from the URL; column names are supplied explicitly
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
dataframe = pandas.read_csv(url, names=names)

# Split the raw values: columns 0-7 are the inputs, column 8 is the target
array = dataframe.values
X, Y = array[:, 0:8], array[:, 8]

# Score every input column against the target with chi-squared, keeping the best four
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# Print the per-feature scores with three digits of precision
numpy.set_printoptions(precision=3)
print(fit.scores_)

# Reduce X to the selected columns and show the first five rows
features = fit.transform(X)
print('\nFeatures:\n')
print(features[:5, :])


# Feature Selection with RFE (recursive feature elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# load data: read the diabetes CSV from the URL, supplying the column names explicitly
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# columns 0-7 are the inputs, column 8 is the target
X = array[:,0:8]
Y = array[:,8]

# feature selection: recursively eliminate features until three remain,
# ranking them with a logistic-regression model
model = LogisticRegression()
# n_features_to_select is passed by keyword because current scikit-learn
# releases no longer accept it as a positional argument
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]

Features:

[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]
Num Features:  3
Selected Features:  [ True False False False False  True  True False]
Feature Ranking:  [1 2 3 5 6 1 1 4]


