In [1]:
from pathlib import Path

import pandas as pd
from sklearn.datasets import fetch_openml

# Classification

### UCI Iris Dataset - Multiclass Classification

The objective is to predict the flower species, one of three classes stored as string labels (e.g. `Iris-setosa`) in the raw file.  
It is a small dataset with only 4 features and is well known in the ML community.

In [2]:
# The raw file is read without a header row, so the column names are
# supplied explicitly; the last column is the prediction target.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = [
    "Sepal Length", "Sepal Width",
    "Petal Length", "Petal Width",
    "Species",  # Target
]
df = pd.read_csv(url, names=columns, header=None)

In [3]:
# Preview the first five rows to sanity-check the column names.
df.head(n=5)

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Persist the Iris frame as CSV without the index column.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (e.g. on a fresh checkout).
out_path = Path("../data/iris.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

### UCI Wine Dataset - Multiclass Classification

The objective is to predict the wine cultivar (the wines were grown in the same region of Italy but derive from three different cultivars), encoded as {1,2,3}.  
It is a small, well-structured dataset with 13 features.

In [5]:
# The raw file is read without a header row; the class label is the first
# column, followed by the 13 measurements, so names are given in that order.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
columns = [
    "Class",  # Target
    "Alcohol", "Malic Acid", "Ash", "Alcalinity of Ash", "Magnesium",
    "Total Phenols", "Flavanoids", "Nonflavanoid Phenols", "Proanthocyanins",
    "Color Intensity", "Hue", "OD280/OD315", "Proline",
]

df = pd.read_csv(url, names=columns, header=None)

In [6]:
# Preview the first five rows to sanity-check the column names.
df.head(n=5)

Unnamed: 0,Class,Alcohol,Malic Acid,Ash,Alcalinity of Ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid Phenols,Proanthocyanins,Color Intensity,Hue,OD280/OD315,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [7]:
# Persist the Wine frame as CSV without the index column.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (e.g. on a fresh checkout).
out_path = Path("../data/wine.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

# Regression

### Sklearn Boston Housing Dataset 

The objective is to predict the house price; the target is `MEDV`, the median value of owner-occupied homes in $1000s.  
It is one of the most widely used regression datasets in ML research, with 13 features.

In [8]:
# Fetch version 1 of the "boston" dataset from OpenML and keep the single
# DataFrame (feature columns plus MEDV) exposed through `.frame`.
df = fetch_openml(
    "boston",
    as_frame=True,
    version=1,
).frame

In [9]:
# Preview the first five rows of the fetched frame.
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [10]:
# Persist the Boston Housing frame as CSV without the index column.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (e.g. on a fresh checkout).
out_path = Path("../data/boston_housing.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

### UCI Energy Efficiency Dataset - Multi Regression

The objective is to predict energy efficiency of buildings through two targets: (i) ```Heating Load```, the amount of energy required for heating, and (ii) ```Cooling Load```, the amount of energy required for cooling.
It is composed of 8 features.

In [11]:
# Read the spreadsheet, then replace its column labels with descriptive
# names (positional: eight features followed by the two targets).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
df = pd.read_excel(url)
df.columns = [
    # Features
    "Relative Compactness", "Surface Area", "Wall Area", "Roof Area",
    "Overall Height", "Orientation", "Glazing Area", "Glazing Area Distribution",
    # Targets
    "Heating Load", "Cooling Load",
]

In [12]:
# Preview the first five rows to sanity-check the renamed columns.
df.head(n=5)

Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [13]:
# Persist the Energy Efficiency frame as CSV without the index column.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (e.g. on a fresh checkout).
out_path = Path("../data/energy_efficiency.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)