# Position and Dispersion

# Position

## Mean, mode and median

### Ungrouped data

In [117]:
import numpy as np
import statistics
from scipy import stats
import pandas as pd
from scipy.stats.mstats import gmean, hmean
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
from tqdm import tqdm

In [3]:
data_ungrouped = np.array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
                  157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
                  163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
                  173])

#### Simple mean

In [4]:
data_ungrouped.sum() / len(data_ungrouped)

160.375

In [5]:
data_ungrouped.mean()

160.375

In [6]:
statistics.mean(data_ungrouped)

160

#### Mode

In [7]:
statistics.mode(data_ungrouped)

160

In [8]:
stats.mode(data_ungrouped)

  stats.mode(data_ungrouped)


ModeResult(mode=array([160]), count=array([5]))

#### Median

In [9]:
index_median = len(data_ungrouped) / 2
 
if index_median % 2 == 0:
    median = (data_ungrouped[int(index_median) -1] + data_ungrouped[int(index_median)]) / 2
else:
    median = data_ungrouped[np.ceil(index_median)]

median

160.0

In [10]:
np.median(data_ungrouped)

160.0

In [11]:
statistics.median(data_ungrouped)

160.0

#### Weighted arithmetic mean

In [12]:
grades = np.array([9, 8, 7, 3])
weights = np.array([1, 2, 3, 4])

In [13]:
weighted_mean = (grades * weights).sum() / weights.sum()
weighted_mean

5.8

In [14]:
np.average(grades, weights=weights)

5.8

### Grouped data

In [15]:
data = {'lower': [150, 154, 158, 162, 166, 170],
         'higher': [154, 158, 162, 166, 170, 174],
         'fi': [5, 9, 11, 7, 5, 3]}

data

{'lower': [150, 154, 158, 162, 166, 170],
 'higher': [154, 158, 162, 166, 170, 174],
 'fi': [5, 9, 11, 7, 5, 3]}

In [16]:
df = pd.DataFrame(data)
df

Unnamed: 0,lower,higher,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5
5,170,174,3


$ xi = \frac{Li + li}{2} $

$ xi = $ midpoint

$ Li = $ higher limit

$ li = $ lower limit

In [17]:
df['xi'] = (df['higher'] + df['lower']) / 2
df

Unnamed: 0,lower,higher,fi,xi
0,150,154,5,152.0
1,154,158,9,156.0
2,158,162,11,160.0
3,162,166,7,164.0
4,166,170,5,168.0
5,170,174,3,172.0


$ fi * xi $

In [18]:
df['fi.xi'] = df['fi'] * df['xi']
df

Unnamed: 0,lower,higher,fi,xi,fi.xi
0,150,154,5,152.0,760.0
1,154,158,9,156.0,1404.0
2,158,162,11,160.0,1760.0
3,162,166,7,164.0,1148.0
4,166,170,5,168.0,840.0
5,170,174,3,172.0,516.0


In [19]:
freq_accumulated = []
freq_sum = 0
for i, _ in enumerate(df['fi']):
    freq_sum += df['fi'][i]
    freq_accumulated.append(freq_sum)

freq_accumulated

[5, 14, 25, 32, 37, 40]

In [20]:
df['Fi'] = freq_accumulated
df

Unnamed: 0,lower,higher,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5
1,154,158,9,156.0,1404.0,14
2,158,162,11,160.0,1760.0,25
3,162,166,7,164.0,1148.0,32
4,166,170,5,168.0,840.0,37
5,170,174,3,172.0,516.0,40


#### Mean

$ x = \frac{\sum fi * xi}{\sum fi} $

In [21]:
df['fi.xi'].sum(), df['fi'].sum()

(6428.0, 40)

In [22]:
df['fi.xi'].sum() / df['fi.xi'].sum()

1.0

#### Mode

Highest frenquency occurence in the database

In [23]:
df['fi'].max()

11

In [24]:
df[df['fi'] == df['fi'].max()]['xi'].values[0]

160.0

#### Median

$ md = l + \frac{(\frac{\sum fi}{2} - F_{ant}) . h} {fi} $

$ l $ = Lower limit

$ h $ = Range width

In [25]:
df

Unnamed: 0,lower,higher,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5
1,154,158,9,156.0,1404.0,14
2,158,162,11,160.0,1760.0,25
3,162,166,7,164.0,1148.0,32
4,166,170,5,168.0,840.0,37
5,170,174,3,172.0,516.0,40


In [26]:
def get_median(df:pd.DataFrame):
    required_columns = ['lower', 'higher', 'fi', 'xi', 'fi.xi', 'Fi']
    assert set(required_columns).issubset(df.columns)

    md_fi_sum = df['fi'].sum()
    md_idx = df[df['Fi'] >= (md_fi_sum / 2)].iloc[0].name
    md_ant_idx = md_idx - 1

    md_fi = df.loc[md_idx, 'fi']
    md_l = df.loc[md_idx, 'lower']
    md_F_ant = df.loc[md_ant_idx, 'Fi']
    md_h = abs(np.subtract(*df.loc[md_idx, ['lower', 'higher']].values))

    return md_l + ((((md_fi_sum / 2) - md_F_ant) * md_h) / md_fi)

get_median(df)

160.1818181818182

In [27]:
def get_statistic(df):
    mean = df['fi.xi'].sum() / df['fi'].sum()
    mode = df[df['fi'] == df['fi'].max()]['xi'].values[0]
    median = get_median(df)

    return mean, mode, median

get_statistic(df)

(160.7, 160.0, 160.1818181818182)

## Geometric mean, Harmonic mean and Quadratic mean

### Ungrouped data

#### Geometric mean

In [28]:
hmean(data_ungrouped)

160.1647194799467

#### Harmonic mean

In [29]:
hmean(data_ungrouped)

160.1647194799467

#### Quadratic mean

In [30]:
import math

def quadratic_mean(data:list):
    return math.sqrt(sum(n**2 for n in data) / len(data))

quadratic_mean(data=data_ungrouped)

160.48091786876097

## Quartiles

### Ungrouped data

Manual approach

In [31]:
median_idx = math.floor(len(data_ungrouped) / 2)
print(f'50 % : {data_ungrouped[median_idx]}')

50 % : 160


In [32]:
left_list = data_ungrouped[:median_idx]
print(f'25% : {np.median(left_list)}')

25% : 155.5


In [33]:
right_list = data_ungrouped[median_idx:]
print(f'75% : {np.median(right_list)}')

75% : 164.0


Using library

numpy

In [34]:
print(f'50% : {np.quantile(data_ungrouped, q=0.5)}')

50% : 160.0


In [35]:
print(f'25% : {np.quantile(data_ungrouped, q=0.25)}')

25% : 155.75


In [36]:
print(f'75% : {np.quantile(data_ungrouped, q=0.75)}')

75% : 164.0


scipy

In [37]:
print(f'50% : {stats.scoreatpercentile(data_ungrouped, 50)}')

50% : 160.0


In [38]:
print(f'25% : {stats.scoreatpercentile(data_ungrouped, 25)}')


25% : 155.75


In [39]:
print(f'75% : {stats.scoreatpercentile(data_ungrouped, 75)}')

75% : 164.0


pandas

In [40]:
df_data_ungrouped = pd.DataFrame(data_ungrouped)
df_data_ungrouped.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [41]:
df_data_ungrouped.quantile(q=0.25)

0    155.75
Name: 0.25, dtype: float64

In [42]:
df_data_ungrouped.quantile(q=0.50)

0    160.0
Name: 0.5, dtype: float64

In [43]:
df_data_ungrouped.quantile(q=0.75)

0    164.0
Name: 0.75, dtype: float64

In [44]:
df_data_ungrouped.describe()

Unnamed: 0,0
count,40.0
mean,160.375
std,5.903877
min,150.0
25%,155.75
50%,160.0
75%,164.0
max,173.0


### Grouped data

In [45]:
df

Unnamed: 0,lower,higher,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5
1,154,158,9,156.0,1404.0,14
2,158,162,11,160.0,1760.0,25
3,162,166,7,164.0,1148.0,32
4,166,170,5,168.0,840.0,37
5,170,174,3,172.0,516.0,40


$ Q1 = l + \frac{(\frac{\sum f_{i}}{4} - F_{ant}) * h}{f_{i}} $

$ Q3 = l + \frac{(\frac{\sum f_{i}}{3} - F_{ant}) * h}{f_{i}} $

$ Q2 $ = **get_median()** function

In [46]:
def get_quantile(df:pd.DataFrame, q1=False):
    required_columns = ['lower', 'higher', 'fi', 'xi', 'fi.xi', 'Fi']
    assert set(required_columns).issubset(df.columns)

    if q1 == True:
        Fi = df['fi'].sum() / 4
    else:
        Fi = (3 * df['fi'].sum()) / 4

    q_idx = df[df['Fi'] >= Fi].iloc[0].name
    q_ant_idx = q_idx - 1

    q = df.loc[q_idx, 'Fi']
    q_ant = df.loc[q_ant_idx, 'Fi']
    l = df.loc[q_idx, 'lower']
    fi = df.loc[q_idx, 'fi']
    h = abs(np.subtract(*df.loc[q_idx, ['lower', 'higher']].values))

    return l + (((Fi - q_ant) * h) / fi)

get_quantile(df, q1=True), get_quantile(df, q1=False)

(156.22222222222223, 164.85714285714286)

## Percentiles

In [47]:
np.median(data_ungrouped)

160.0

In [48]:
np.quantile(data_ungrouped, 0.5)

160.0

In [49]:
np.percentile(data_ungrouped, 50)

160.0

In [50]:
np.percentile(data_ungrouped, [10, 30, 60, 90])

array([152.9, 156. , 161. , 168.1])

In [51]:
df_data_ungrouped.quantile([0.10, 0.30, 0.60, 0.90])

Unnamed: 0,0
0.1,152.9
0.3,156.0
0.6,161.0
0.9,168.1


## Using a csv database to get the statistics from age column

In [52]:
census = pd.read_csv('../Bases de dados/census.csv')
census.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Arithmetic mean

In [53]:
census['age'].sum() / len(census['age'])

38.58164675532078

In [54]:
census['age'].mean()

38.58164675532078

Harmonic mean

In [55]:
hmean(census['age'])

33.91874139089839

Geometric mean

In [56]:
gmean(census['age'])

36.210879158177256

Quadratic mean

In [57]:
quadratic_mean(census['age'].tolist())

40.9218664329987

Median

In [58]:
np.median(census['age'])

37.0

Mode

In [59]:
stats.mode(census['age'])

  stats.mode(census['age'])


ModeResult(mode=array([36]), count=array([898]))

In [60]:
statistics.mode(census['age'])

36

# Dispertion

## Full amplitude

In [61]:
data_ungrouped

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [62]:
print(f'Full amplitude: {data_ungrouped.max() - data_ungrouped.min()}')

Full amplitude: 23


## Interquartile range

In [63]:
q1 = np.quantile(data_ungrouped, q=0.25)
q3 = np.quantile(data_ungrouped, q=0.75)

q1, q3

(155.75, 164.0)

In [64]:
ir = q3 - q1

print(f'Interquartile range: {ir}')

Interquartile range: 8.25


### Detecting outliers through quartiles and interquartile range values

#### Lower outlier 

$ lo = q1 - (1.5 * ir) $

$ lo $ = Lower outlier

$ ir $ = Interquartile range

In [65]:
lo = q1 - (1.5 * ir)

print(f'Lower outlier: {lo}')

Lower outlier: 143.375


#### Higher outlier

$ ho = q1 + (1.5 * ir) $

$ lo $ = Lower outlier

$ ir $ = Interquartile range

In [66]:
ho = q1 + (1.5 * ir)

print(f'Higher outlier: {ho}')

Higher outlier: 168.125


## Variance | ungrouped data

mean

$ \overline{x} = \frac{\sum x_{ì}}{n} $

In [67]:
data_mean = data_ungrouped.sum() / len(data_ungrouped)
data_mean

160.375

variance

$ \sigma = \frac{\sum (x_{i} - \overline{x})^{2}}{N} $

### Manual approach

In [68]:
deviation = abs(data_ungrouped - data_mean)
deviation

array([10.375,  9.375,  8.375,  8.375,  7.375,  6.375,  5.375,  5.375,
        5.375,  5.375,  4.375,  4.375,  4.375,  3.375,  2.375,  2.375,
        0.375,  0.375,  0.375,  0.375,  0.375,  0.625,  0.625,  0.625,
        0.625,  1.625,  2.625,  2.625,  3.625,  3.625,  3.625,  4.625,
        5.625,  6.625,  7.625,  7.625,  8.625,  9.625, 11.625, 12.625])

In [69]:
deviation = deviation ** 2
sum_deviation = deviation.sum()

var = sum_deviation / len(data_ungrouped)
var

33.984375

In [70]:
def variance(data:np.array)->float:
    mean = data.sum() / len(data)
    deviation = abs(data - mean) ** 2
    variance = deviation.sum() / len(data)
    return variance

variance(data_ungrouped)

33.984375

### Using library

In [71]:
np.var(data_ungrouped)

33.984375

## Standart deviation | ungrouped data

$ std = \sqrt{\sigma^{2}} $

$ std $ = standart deviation

### Manual approach

In [72]:
var = variance(data_ungrouped)
var

33.984375

In [73]:
print(f'Standart deviation: {math.sqrt(var)}')

Standart deviation: 5.829611908180509


In [74]:
def std(data:np.array)->float:
    var = variance(data)
    return math.sqrt(var)

std(data_ungrouped)

5.829611908180509

### Using library

In [75]:
np.std(data_ungrouped)

5.829611908180509

## Coefficient of variation

$ cv = \frac{\sigma}{\overline{x}} * 100$

In [76]:
cv = (std(data_ungrouped) / data_ungrouped.mean()) * 100
cv

3.634987939629312

In [77]:
stats.variation(data_ungrouped) * 100

3.634987939629312

## Standart deviation | grouped data

$ sdt = \sqrt{\frac{\sum f_{i} * x_{i}^{2}}{\sum f_{i}} - (\frac{\sum f_{i} * x_{i}}{\sum f_{i}})^{2}} $

Preparing dataset

In [78]:
df

Unnamed: 0,lower,higher,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5
1,154,158,9,156.0,1404.0,14
2,158,162,11,160.0,1760.0,25
3,162,166,7,164.0,1148.0,32
4,166,170,5,168.0,840.0,37
5,170,174,3,172.0,516.0,40


Adding xi_2 and fi.xi_2 columns

In [79]:
df['xi_2'] = df['xi'] ** 2
df

Unnamed: 0,lower,higher,fi,xi,fi.xi,Fi,xi_2
0,150,154,5,152.0,760.0,5,23104.0
1,154,158,9,156.0,1404.0,14,24336.0
2,158,162,11,160.0,1760.0,25,25600.0
3,162,166,7,164.0,1148.0,32,26896.0
4,166,170,5,168.0,840.0,37,28224.0
5,170,174,3,172.0,516.0,40,29584.0


In [80]:
df['fi.xi_2'] = df['fi'] * df['xi_2']
df

Unnamed: 0,lower,higher,fi,xi,fi.xi,Fi,xi_2,fi.xi_2
0,150,154,5,152.0,760.0,5,23104.0,115520.0
1,154,158,9,156.0,1404.0,14,24336.0,219024.0
2,158,162,11,160.0,1760.0,25,25600.0,281600.0
3,162,166,7,164.0,1148.0,32,26896.0,188272.0
4,166,170,5,168.0,840.0,37,28224.0,141120.0
5,170,174,3,172.0,516.0,40,29584.0,88752.0


Ordering columns

In [81]:
df.columns
ordered_columns = ['lower', 'higher', 'fi', 'xi', 'fi.xi', 'xi_2', 'fi.xi_2', 'Fi']
df = df[ordered_columns]
df

Unnamed: 0,lower,higher,fi,xi,fi.xi,xi_2,fi.xi_2,Fi
0,150,154,5,152.0,760.0,23104.0,115520.0,5
1,154,158,9,156.0,1404.0,24336.0,219024.0,14
2,158,162,11,160.0,1760.0,25600.0,281600.0,25
3,162,166,7,164.0,1148.0,26896.0,188272.0,32
4,166,170,5,168.0,840.0,28224.0,141120.0,37
5,170,174,3,172.0,516.0,29584.0,88752.0,40


Calculating standard deviation using the formula above

In [82]:
def std_grouped_data(df:pd.DataFrame)->float:
    required_columns = ['lower', 'higher', 'fi', 'xi', 'fi.xi', 'xi_2', 'fi.xi_2', 'Fi']
    assert set(required_columns).issubset(df.columns)

    return math.sqrt((df['fi.xi_2'].sum()/df['fi'].sum()) - math.pow(df['fi.xi'].sum() / df['fi'].sum(), 2))

std_grouped_data(df)

5.719265687131764

# Testing some classification algorithm

In [83]:
credit_df = pd.read_csv('../Bases de dados/credit_data.csv')
credit_df.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [84]:
credit_df.dropna(inplace=True)
credit_df.shape

(1997, 5)

In [85]:
X = credit_df.iloc[:, 1:4].values
X

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [86]:
y = credit_df.iloc[:, 4].values
y

array([0, 0, 0, ..., 1, 0, 0])

In [87]:
nb_accuracies = []
rf_accuracies = []
lr_accuracies = []

def train_model(model, X_train, y_train, X_test, y_test):
    m = model()
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    return accuracy_score(y_test, y_pred)

for i in tqdm(range(30)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=i)

    nb_accuracies.append(train_model(GaussianNB, X_train, y_train, X_test, y_test))
    rf_accuracies.append(train_model(RandomForestClassifier, X_train, y_train, X_test, y_test))
    lr_accuracies.append(train_model(LogisticRegression, X_train, y_train, X_test, y_test))

100%|██████████| 30/30 [00:03<00:00,  7.83it/s]


## Statistic analysis from output accuracy scores

In [88]:
nb_accuracies = np.array(nb_accuracies)
rf_accuracies = np.array(rf_accuracies)
lr_accuracies = np.array(lr_accuracies)

### Mean

In [89]:
nb_accuracies.mean(), rf_accuracies.mean(), lr_accuracies.mean()

(0.92425, 0.9842500000000002, 0.9145)

### Mode

In [90]:
stats.mode(nb_accuracies), stats.mode(rf_accuracies), stats.mode(lr_accuracies)

  stats.mode(nb_accuracies), stats.mode(rf_accuracies), stats.mode(lr_accuracies)


(ModeResult(mode=array([0.9175]), count=array([5])),
 ModeResult(mode=array([0.9825]), count=array([7])),
 ModeResult(mode=array([0.9075]), count=array([4])))

### Median

In [91]:
np.median(nb_accuracies), np.median(rf_accuracies), np.median(lr_accuracies)

(0.925, 0.98375, 0.9125)

### Variance

In [92]:
np.var(nb_accuracies), np.var(rf_accuracies), np.var(lr_accuracies)

(8.756250000000001e-05, 2.839583333333338e-05, 0.00020933333333333337)

In [93]:
np.min([8.756250000000001e-05, 2.920138888888887e-05, 0.00020933333333333337])

2.920138888888887e-05

In [94]:
np.max([8.756250000000001e-05, 2.920138888888887e-05, 0.00020933333333333337])

0.00020933333333333337

### Standard deviation

In [95]:
np.std(nb_accuracies), np.std(rf_accuracies), np.std(lr_accuracies)

(0.00935748363610645, 0.00532877409291606, 0.014468356276140472)

### Coefficient of variation

In [96]:
stats.variation(nb_accuracies) * 100, stats.variation(rf_accuracies) * 100, stats.variation(lr_accuracies) * 100

(1.0124407504578252, 0.5414045306493329, 1.5821056616884057)

## Cross validation with statistic analysis

In [97]:
def train_model_cross_validation(model, X, y, random_state):
    kfold = KFold(n_splits=10, shuffle=True, random_state=random_state)
    m = model()
    return cross_val_score(m, X, y, cv=kfold).mean()

nb_accuracies = []
rf_accuracies = []
lr_accuracies = []

for i in tqdm(range(30)):
    nb_accuracies.append(train_model_cross_validation(GaussianNB, X, y, random_state=i).mean())
    rf_accuracies.append(train_model_cross_validation(RandomForestClassifier, X, y, random_state=i).mean())
    lr_accuracies.append(train_model_cross_validation(LogisticRegression, X, y, random_state=i).mean())

nb_accuracies = np.array(nb_accuracies)
rf_accuracies = np.array(rf_accuracies)
lr_accuracies = np.array(lr_accuracies)

100%|██████████| 30/30 [00:35<00:00,  1.18s/it]


### Mean

In [98]:
nb_accuracies.mean(), rf_accuracies.mean(), lr_accuracies.mean()

(0.9249359296482411, 0.9871649078726967, 0.9137817420435511)

### Mode

In [99]:
stats.mode(nb_accuracies), stats.mode(rf_accuracies), stats.mode(lr_accuracies)

  stats.mode(nb_accuracies), stats.mode(rf_accuracies), stats.mode(lr_accuracies)


(ModeResult(mode=array([0.92487186]), count=array([2])),
 ModeResult(mode=array([0.98648241]), count=array([2])),
 ModeResult(mode=array([0.90984422]), count=array([1])))

### Median

In [100]:
np.median(nb_accuracies), np.median(rf_accuracies), np.median(lr_accuracies)

(0.9248919597989949, 0.9869811557788946, 0.9130979899497487)

### Variance

In [101]:
np.var(nb_accuracies), np.var(rf_accuracies), np.var(lr_accuracies)

(6.387905818876295e-07, 1.9761350653322292e-06, 1.2571044810877418e-05)

In [102]:
np.min([6.387905818876295e-07, 1.3641491937633798e-06, 1.2571044810877418e-05])

6.387905818876295e-07

In [103]:
np.max([6.387905818876295e-07, 1.3641491937633798e-06, 1.2571044810877418e-05])

1.2571044810877418e-05

### Standard deviation

In [104]:
np.std(nb_accuracies), np.std(rf_accuracies), np.std(lr_accuracies)

(0.0007992437562393775, 0.001405750712371233, 0.0035455669237623223)

### Coefficient of variation

In [105]:
stats.variation(nb_accuracies) * 100, stats.variation(rf_accuracies) * 100, stats.variation(lr_accuracies) * 100

(0.08641071566366061, 0.14240282461018322, 0.38801026116292653)

## Atributes selection using variance

In [106]:
base = {
    'a' : np.random.rand(20),
    'b' : np.array([0.5] * 20),
    'target' : np.random.randint(0, 2, size=20)
}

base

{'a': array([0.20592338, 0.56324971, 0.5649898 , 0.92372129, 0.32172094,
        0.92464307, 0.41054838, 0.58772357, 0.79480249, 0.68618412,
        0.50576825, 0.88582357, 0.97763642, 0.048412  , 0.2299179 ,
        0.30933069, 0.67791577, 0.78571979, 0.78557816, 0.48596358]),
 'b': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]),
 'target': array([1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0])}

In [107]:
df_base = pd.DataFrame(base)
df_base.head()

Unnamed: 0,a,b,target
0,0.205923,0.5,1
1,0.56325,0.5,0
2,0.56499,0.5,1
3,0.923721,0.5,1
4,0.321721,0.5,0


In [108]:
df_base.describe()

Unnamed: 0,a,b,target
count,20.0,20.0,20.0
mean,0.583779,0.5,0.5
std,0.268382,0.0,0.512989
min,0.048412,0.5,0.0
25%,0.388342,0.5,0.0
50%,0.576357,0.5,0.5
75%,0.78799,0.5,1.0
max,0.977636,0.5,1.0


In [109]:
np.var(df_base['a']), np.var(df_base['b'])

(0.06842726022408246, 0.0)

In [113]:
X = df_base.iloc[:, :2].values
X

array([[0.20592338, 0.5       ],
       [0.56324971, 0.5       ],
       [0.5649898 , 0.5       ],
       [0.92372129, 0.5       ],
       [0.32172094, 0.5       ],
       [0.92464307, 0.5       ],
       [0.41054838, 0.5       ],
       [0.58772357, 0.5       ],
       [0.79480249, 0.5       ],
       [0.68618412, 0.5       ],
       [0.50576825, 0.5       ],
       [0.88582357, 0.5       ],
       [0.97763642, 0.5       ],
       [0.048412  , 0.5       ],
       [0.2299179 , 0.5       ],
       [0.30933069, 0.5       ],
       [0.67791577, 0.5       ],
       [0.78571979, 0.5       ],
       [0.78557816, 0.5       ],
       [0.48596358, 0.5       ]])

In [125]:
var_selection = VarianceThreshold(threshold=0.06)
X_new = var_selection.fit_transform(X)
X_new, X_new.shape

(array([[0.20592338],
        [0.56324971],
        [0.5649898 ],
        [0.92372129],
        [0.32172094],
        [0.92464307],
        [0.41054838],
        [0.58772357],
        [0.79480249],
        [0.68618412],
        [0.50576825],
        [0.88582357],
        [0.97763642],
        [0.048412  ],
        [0.2299179 ],
        [0.30933069],
        [0.67791577],
        [0.78571979],
        [0.78557816],
        [0.48596358]]),
 (20, 1))

In [126]:
var_selection.variances_

array([0.06842726, 0.        ])

In [128]:
np.where(var_selection.variances_ > 0.06)

(array([0]),)