# Generating samples

## Generating data for regression

In [1]:
from sklearn.datasets import make_regression

X, y = make_regression(**{
    'n_samples': 1000, 
    'n_features': 50, 
    'n_informative': 10, 
    'n_targets': 1, 
    'bias': 5.3, 
    'random_state': 37
})

print(f'X shape = {X.shape}, y shape {y.shape}')

X shape = (1000, 50), y shape (1000,)


In [2]:
from sklearn.linear_model import Lasso
from sklearn.metrics import explained_variance_score

lasso = Lasso()
lasso.fit(X, y)

explained_variance_score(y, lasso.predict(X))

0.9997115301602587

## Generating data for classification

In [3]:
from sklearn.datasets import make_classification

X, y = make_classification(**{
    'n_samples': 2000, 
    'n_features': 20, 
    'n_informative': 2, 
    'n_redundant': 2, 
    'n_repeated': 0, 
    'n_classes': 2, 
    'n_clusters_per_class': 2, 
    'random_state': 37
})

print(f'X shape = {X.shape}, y shape {y.shape}')

X shape = (2000, 20), y shape (2000,)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(n_estimators=2)
rf.fit(X, y)

roc_auc_score(y, rf.predict(X))

0.9585334585334585

## Generating data for multilabel classification

In [5]:
from sklearn.datasets import make_multilabel_classification

X, Y = make_multilabel_classification(**{
    'n_samples': 2000, 
    'n_features': 20, 
    'n_classes': 5, 
    'n_labels': 2, 
    'length': 50, 
    'allow_unlabeled': False, 
    'sparse': False, 
    'return_indicator': 'dense', 
    'return_distributions': False, 
    'random_state': 37
})

print(f'X shape = {X.shape}, Y shape {Y.shape}')

X shape = (2000, 20), Y shape (2000, 5)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(n_estimators=2)
rf.fit(X, Y)

roc_auc_score(Y, rf.predict(X))

0.8600929093307099

In [7]:
import numpy as np
from sklearn.metrics import coverage_error, \
    label_ranking_average_precision_score, \
    label_ranking_loss

probs = rf.predict_proba(X)
n_cols = len(probs)
n_rows = probs[0].shape[0]

Y_preds = np.array([[probs[c][r][1] for c in range(n_cols)] for r in range(n_rows)])

for metric in [coverage_error, label_ranking_average_precision_score, label_ranking_loss]:
    print(f'{metric(Y, Y_preds):.5f} : {metric.__name__}')

2.57650 : coverage_error
0.93474 : label_ranking_average_precision_score
0.09792 : label_ranking_loss


## Generating data for clustering

In [8]:
from sklearn.datasets import make_blobs

X, y = make_blobs(**{
    'n_samples': 3000, 
    'n_features': 15, 
    'centers': 3, 
    'cluster_std': 1.0,
    'center_box': (-10.0, 10.0),
    'random_state': 37
})

print(f'X shape = {X.shape}, y shape {y.shape}')

X shape = (3000, 15), y shape (3000,)


In [9]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

km = KMeans(n_clusters=3, random_state=37)
km.fit(X)

silhouette_score(X, km.predict(X))

0.8286949308825501