In [6]:
import pandas as pd
import numpy as np
from numpy.random import RandomState

# data for control group A
samples_A = pd.Series([
     98.24,  97.77,  95.56,  99.49, 101.4 , 105.35,  95.83,  93.02,
    101.37,  95.66,  98.34, 100.75, 104.93,  97.  ,  95.46, 100.03,
    102.34,  98.23,  97.05,  97.76,  98.63,  98.82,  99.51,  99.31,
     98.58,  96.84,  93.71, 101.38, 100.6 , 103.68, 104.78, 101.51,
    100.89, 102.27,  99.87,  94.83,  95.95, 105.2 ,  97.  ,  95.54,
     98.38,  99.81, 103.34, 101.14, 102.19,  94.77,  94.74,  99.56,
    102.  , 100.95, 102.19, 103.75, 103.65,  95.07, 103.53, 100.42,
     98.09,  94.86, 101.47, 103.07, 100.15, 100.32, 100.89, 101.23,
     95.95, 103.69, 100.09,  96.28,  96.11,  97.63,  99.45, 100.81,
    102.18,  94.92,  98.89, 101.48, 101.29,  94.43, 101.55,  95.85,
    100.16,  97.49, 105.17, 104.83, 101.9 , 100.56, 104.91,  94.17,
    103.48, 100.55, 102.66, 100.62,  96.93, 102.67, 101.27,  98.56,
    102.41, 100.69,  99.67, 100.99])

# data for treatment group B
samples_B = pd.Series([
    101.67, 102.27,  97.01, 103.46, 100.76, 101.19,  99.11,  97.59,
    101.01, 101.45,  94.8 , 101.55,  96.38,  99.03, 102.83,  97.32,
     98.25,  97.17, 101.1 , 102.57, 104.59, 105.63,  98.93, 103.87,
     98.48, 101.14, 102.24,  98.55, 105.61, 100.06,  99.  , 102.53,
    101.56, 102.68, 103.26,  96.62,  99.48, 107.6 ,  99.87, 103.58,
    105.05, 105.69,  94.52,  99.51,  99.81,  99.44,  97.35, 102.97,
     99.77,  99.59, 102.12, 104.29,  98.31,  98.83,  96.83,  99.2 ,
     97.88, 102.34, 102.04,  99.88,  99.69, 103.43, 100.71,  92.71,
     99.99,  99.39,  99.19,  99.29, 100.34, 101.08, 100.29,  93.83,
    103.63,  98.88, 105.36, 101.82, 100.86, 100.75,  99.4 ,  95.37,
    107.96,  97.69, 102.17,  99.41,  98.97,  97.96,  98.31,  97.09,
    103.92, 100.98, 102.76,  98.24,  97.  ,  98.99, 103.54,  99.72,
    101.62, 100.62, 102.79, 104.19])

# actual difference between the means in the groups
AB_difference = samples_A.mean() - samples_B.mean()
print("Difference between average purchase amounts:", AB_difference)

alpha = 0.05
    
state = np.random.RandomState(12345)

bootstrap_samples = 1000
count = 0
for i in range(bootstrap_samples):
    # concatenate samples
    united_samples = pd.concat([samples_A, samples_B], axis=0, ignore_index=True)

    # create subsample
    state = RandomState(1995)
    subsample = united_samples.sample(frac=1, replace=True, random_state=state)
    
    # split subsample in half
    subsample_A = subsample[:len(samples_A)]
    subsample_B = subsample[len(samples_A+1):]

    # find the difference between the means
    bootstrap_difference = subsample_A.mean() - subsample_B.mean()
    
    # if the difference is not less than actual difference, add "1" to the counter
    if bootstrap_difference >= AB_difference:
        count += 1

# p-value is equal to the percentage of excess values
pvalue = 1. * count / bootstrap_samples
print('p-value =', pvalue)

if pvalue < alpha:
    print("Reject null hypothesis: average purchase amount is likely to increase")
else:
    print("Failed to reject null hypothesis: average purchase amount is unlikely to increase")

Difference between average purchase amounts: -0.7682000000000357
p-value = 1.0
Failed to reject null hypothesis: average purchase amount is unlikely to increase


In [78]:
import pandas as pd

def revenue(target, probabilities, count):
    probs = probabilities.sort_values(ascending=False)
    result = 0
    probs = pd.concat([probs, target[probs.index]], axis=1).reset_index(drop=True)
    for i in range(count):
        result += int(probs.values[i, 1])
    print(probs)
    return 10 * result

target = pd.Series([1,   1,   0,  0,   1,    0])
probab = pd.Series([0.2, 0.9, 0.8, 0.3, 0.5, 0.1])

res = revenue(target, probab, 2)

print(res)

     0  1
0  0.9  1
1  0.8  0
2  0.5  1
3  0.3  0
4  0.2  1
5  0.1  0
10


In [82]:
import pandas as pd
import numpy as np

# open files
# take '0' index to convert data to pd.Series
target = pd.read_csv('../datasets/eng_target.csv')['0']
probabilities = pd.read_csv('../datasets/eng_probabilites.csv')['0']

def revenue(target, probabilities, count):
    probs_sorted = probabilities.sort_values(ascending=False)
    selected = target[probs_sorted.index][:count]
    return 10 * selected.sum()

state = np.random.RandomState(12345)
    
values = []
for i in range(1000):
    target_subsample = target.sample(frac=0.99, replace=True, random_state=state)
    probs_subsample = probabilities[target_subsample.index]
    values.append( revenue(target_subsample, probs_subsample, 25) )

values = pd.Series(values)
lower = values.quantile(0.01)

mean = values.mean()
print("Average revenue:", mean)
print("1% quantile:", lower)

Average revenue: 212.83
1% quantile: 80.0


In [3]:
import pandas as pd
import numpy as np
data = pd.read_csv('../datasets/heart_labeled.csv')

data['target'] = np.floor(((data['label_1'] + data['label_2'] + data['label_3'])/2)).astype(int)

print(data.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   50    1   0       150   243    0        0      128      0      2.6      1   
1   55    0   1       135   250    0        0      161      0      1.4      1   
2   54    1   0       140   239    0        1      160      0      1.2      2   
3   59    1   3       178   270    0        0      145      0      4.2      0   
4   70    1   2       160   269    0        1      112      1      2.9      1   

   ca  thal  label_1  label_2  label_3  target  
0   0     3        0        0        1       0  
1   0     2        1        1        1       1  
2   0     2        1        0        1       1  
3   0     3        1        0        1       1  
4   1     3        1        0        0       0  


In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv('../datasets/heart.csv')
features = data.drop(['target'], axis=1)
target = data['target']

scores = []

# set the block size if there are only three of them
sample_size = int(len(data)/3)

for i in range(0, len(data), sample_size):
    valid_indexes = range(i, i+sample_size)
    train_indexes = [x for x in range(len(data)) if x not in valid_indexes]
    
	# Split variables features and target into samples features_train, target_train, features_valid, target_valid
    target_valid = data.loc[valid_indexes, 'target']
    features_valid = data.loc[valid_indexes, :].drop(['target'], axis=1)
    
    target_train = data.loc[train_indexes, 'target']
    features_train = data.loc[train_indexes, :].drop(['target'], axis=1)

    model = DecisionTreeClassifier(random_state=0)
    model = model.fit(features_train, target_train)
    score = model.score(features_valid, target_valid)
    
    scores.append(score)
 
final_score = np.mean(scores) 
print('Average model quality score:', final_score)

Average model quality score: 0.7689768976897691


In [13]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

data = pd.read_csv('../datasets/heart.csv')
features = data.drop(['target'], axis=1)
target = data['target']

model = DecisionTreeClassifier(random_state=0)

np.mean(cross_val_score(model, features, target, cv=10, scoring='accuracy'))

print('Average model evaluation score:', final_score)

Average model evaluation score: 0.7689768976897691
