In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml, make_blobs # make_moons, make_circles
from sklearn.preprocessing import StandardScaler

# Classification

### UCI Iris Dataset - Multiclass Classification

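The objective is to predict the flower species (three classes). Note that the raw UCI file stores the species as string labels (e.g. `Iris-setosa`), so an integer encoding such as {0,1,2} has to be applied downstream.  
It is a small dataset, well known in the ML community, containing only 4 features.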

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# The file is read with header=None (its first line is treated as data),
# so the column names are supplied here: four measurements, then the target.
feature_names = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width"]
target_name = "Species"
columns = feature_names + [target_name]

df = pd.read_csv(url, names=columns, header=None)

In [None]:
# Preview the first five rows of the Iris frame.
df.head(n=5)

In [None]:
# Write the Iris frame to disk; index=False keeps the row index out of the CSV.
iris_csv_path = "../data/iris.csv"
df.to_csv(iris_csv_path, index=False)

### UCI Wine Dataset - Multiclass Classification

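The objective is to predict the wine type (three cultivars from the same region in Italy). The `Class` column in the raw file takes the values {1,2,3}; re-encode it to {0,1,2} downstream if zero-based labels are required.  
It is a small, well-structured dataset with 13 features.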

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

# The file is read with header=None, so names are supplied explicitly.
# The target comes first, followed by the 13 feature columns.
target_name = "Class"
feature_names = [
    "Alcohol",
    "Malic Acid",
    "Ash",
    "Alcalinity of Ash",
    "Magnesium",
    "Total Phenols",
    "Flavanoids",
    "Nonflavanoid Phenols",
    "Proanthocyanins",
    "Color Intensity",
    "Hue",
    "OD280/OD315",
    "Proline",
]
columns = [target_name, *feature_names]

df = pd.read_csv(url, names=columns, header=None)

In [None]:
# Preview the first five rows of the Wine frame.
df.head(n=5)

In [None]:
# Write the Wine frame to disk; index=False keeps the row index out of the CSV.
wine_csv_path = "../data/wine.csv"
df.to_csv(wine_csv_path, index=False)

# Regression

### Sklearn Boston Housing Dataset 

The objective is to predict the house price; the target is `MEDV`, the median value of owner-occupied homes in \$1000s.  
It is one of the most widely used regression datasets in ML research with 13 features.

In [None]:
# Fetch version 1 of the "boston" dataset from OpenML and keep the DataFrame
# exposed by the returned object's `.frame` attribute.
boston_bunch = fetch_openml(name="boston", version=1, as_frame=True)
df = boston_bunch.frame

In [None]:
# Preview the first five rows of the Boston Housing frame.
df.head(n=5)

In [None]:
# Write the Boston Housing frame to disk; index=False keeps the row index out of the CSV.
boston_csv_path = "../data/boston_housing.csv"
df.to_csv(boston_csv_path, index=False)

### UCI Energy Efficiency Dataset - Multi-Output Regression

The objective is to predict energy efficiency of buildings through two targets: (i) ```Heating Load```, the amount of energy required for heating, and (ii) ```Cooling Load```, the amount of energy required for cooling.
It is composed of 8 features.

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"

# Readable replacements for the spreadsheet's own column headers:
# eight features followed by the two regression targets.
energy_columns = [
    "Relative Compactness",
    "Surface Area",
    "Wall Area",
    "Roof Area",
    "Overall Height",
    "Orientation",
    "Glazing Area",
    "Glazing Area Distribution",
    "Heating Load",
    "Cooling Load",
]

df = pd.read_excel(url)
# Plain assignment raises if the sheet does not have exactly ten columns.
df.columns = energy_columns

In [None]:
# Preview the first five rows of the Energy Efficiency frame.
df.head(n=5)

In [None]:
# Write the Energy Efficiency frame to disk; index=False keeps the row index out of the CSV.
energy_csv_path = "../data/energy_efficiency.csv"
df.to_csv(energy_csv_path, index=False)

# Clustering

In [2]:
# Size presets for the synthetic blobs dataset:
#
#   preset | n_samples | n_features | n_clusters | cluster_std
#   -------+-----------+------------+------------+------------
#   Small  |       300 |          4 |          3 |         0.5
#   Medium |      5000 |         50 |         10 |         1.0
#   Large  |     20000 |        300 |         50 |         2.0
#
# NOTE(review): the values below follow the Small preset except for
# cluster_std, which is set to the Medium value (1.0) -- confirm this is intended.
n_samples, n_features = 300, 4
n_clusters, cluster_std = 3, 1.0

In [3]:
# Generate the synthetic blobs; `y` holds the cluster label of every row of `X`.
blob_options = dict(
    n_samples=n_samples,        # number of samples
    n_features=n_features,      # number of features per sample
    centers=n_clusters,         # number of clusters to generate
    cluster_std=cluster_std,    # cluster spread: higher values spread the clusters out and make them harder to separate
    center_box=(-10.0, 10.0),   # range within which the centers are placed
    shuffle=True,               # shuffle the samples
    random_state=42,            # fixed seed
    return_centers=True,        # also return the cluster centers
)
X, y, centers = make_blobs(**blob_options)

In [4]:
# One column per generated feature, named "Feature 1" .. "Feature <n>".
blob_feature_names = [f"Feature {idx}" for idx in range(1, X.shape[1] + 1)]
df = pd.DataFrame(X, columns=blob_feature_names)

# Store the cluster labels shifted by one (so they start at 1) under "Species",
# the same target-column name used for the Iris data.
df["Species"] = [int(cluster_id) + 1 for cluster_id in y]

In [5]:
# Preview the first five rows of the blobs frame before scaling.
df.head(n=5)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Species
0,0.741996,5.033909,-8.938109,9.299021,3
1,-1.651538,8.854348,4.620863,0.97064,1
2,2.126501,4.098858,-10.342275,9.117522,3
3,-6.400647,-6.546447,-7.800788,6.813507,2
4,3.071853,3.626216,-8.270916,9.595797,3


In [6]:
# Standardise every column except the "Species" label column.
feature_columns = [name for name in df.columns if name != "Species"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[feature_columns])
# Overwrite the feature columns in place with their scaled values.
df[feature_columns] = scaled_values

In [7]:
# Preview the first five rows of the blobs frame after scaling.
df.head(n=5)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Species
0,0.811794,0.431383,-0.667598,0.919633,3
1,0.195979,0.995486,1.395606,-1.606078,1
2,1.168003,0.293319,-0.881263,0.86459,3
3,-1.025886,-1.278504,-0.494537,0.165862,2
4,1.411226,0.223531,-0.566074,1.009634,3


In [8]:
# The file name records the generation settings: samples, features, clusters, std.
blobs_csv_path = f"../data/blobs_{n_samples}_{n_features}_{n_clusters}_{cluster_std}.csv"
df.to_csv(blobs_csv_path, index=False)