In [1]:
from sklearn import datasets

# Load scikit-learn's bundled breast-cancer dataset and report the shapes
# of its feature matrix and its target vector.
cancer_dataset = datasets.load_breast_cancer()
data_shape, target_shape = cancer_dataset.data.shape, cancer_dataset.target.shape
print(f"Data shape -> {data_shape} - target shape -> {target_shape}")

Data shape -> (569, 30) - target shape -> (569,)


In [2]:
from sklearn.datasets import make_regression

# Generate a synthetic regression problem. The fixed seed keeps the sample
# printed below reproducible across runs.
X, y = make_regression(
    n_samples=1500,  # number of rows to generate
    n_features=10,  # columns per row
    n_informative=7,  # columns that actually influence the target
    bias=150.0,  # bias term of the underlying model
    noise=35.0,  # noise added so the data scatters
    shuffle=True,  # shuffle the generated data
    random_state=123,  # seed for the random generator
)
print(f"X.shape: {X.shape} - y.shape: {y.shape}")
# Show only the first 3 rows instead of dumping the full arrays.
print(f"X -> {X[:3]}", f"y -> {y[:3]}", sep="\n")

X.shape: (1500, 10) - y.shape: (1500,)
X -> [[ 2.34935569  1.9412493   1.93282058 -0.1412172  -2.67449029  1.44096288
   0.05552553  0.69815678 -0.07810129  0.56583355]
 [-0.66962298  0.69421567  0.60554621 -0.97593655  1.43355657 -1.41702087
   0.16956646 -1.46890673 -1.98606025 -0.26890822]
 [ 1.63036469  0.25957123 -0.11459239 -0.33095209  0.89533409 -1.64769326
  -1.09217036  1.03439292  1.49681839 -0.31563242]]
y -> [333.81156804  55.81358322  84.41010317]


In [3]:
# @see: https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original
from ucimlrepo import fetch_ucirepo

# Fetch UCI repository dataset #15 and keep its feature / target tables.
dataset = fetch_ucirepo(id=15)
X = dataset.data.features
y = dataset.data.targets
# Report the container types of the fetched object and its two tables.
print(" - ".join(f"{type(obj)}" for obj in (dataset, X, y)))

<class 'ucimlrepo.dotdict.dotdict'> - <class 'pandas.core.frame.DataFrame'> - <class 'pandas.core.frame.DataFrame'>


In [5]:
# Dataset-level metadata of the fetched UCI dataset (name, URLs, task, ...).
print(dataset.metadata)
# Per-variable description table. The attribute is spelled `variables`.
# NOTE(review): `dataset` is a dict-like dotdict, so a misspelled attribute
# may silently yield None instead of raising -- confirm against ucimlrepo.
print(dataset.variables)

{'uci_id': 15, 'name': 'Breast Cancer Wisconsin (Original)', 'repository_url': 'https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original', 'data_url': 'https://archive.ics.uci.edu/static/public/15/data.csv', 'abstract': 'Original Wisconsin Breast Cancer Database', 'area': 'Life Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 699, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': ['Sample_code_number'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1990, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5HP4Z', 'creators': ['WIlliam Wolberg'], 'intro_paper': None, 'additional_info': {'summary': "Samples arrive periodically as Dr. Wolberg reports his clinical cases. The database therefore reflects this chronological grouping of the data. This grouping information appears immediately below, having been removed from the d

In [6]:
import pandas as pd

# Read the raw CSV for the same UCI dataset directly with pandas, using the
# file's first column as the row index, then preview the first five rows.
wis_url = "https://archive.ics.uci.edu/static/public/15/data.csv"
df = pd.read_csv(filepath_or_buffer=wis_url, index_col=0)
df.head(n=5)

Unnamed: 0_level_0,Clump_thickness,Uniformity_of_cell_size,Uniformity_of_cell_shape,Marginal_adhesion,Single_epithelial_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
Sample_code_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2
