In [11]:
import requests
import zipfile
import io
import pandas as pd
from pyprojroot.here import here
from duckdb import sql

def download_and_load_csvs_from_zip(zip_url, csv_paths: str | list[str]) -> list:
    if type(csv_paths) is str: csv_paths = [csv_paths]
    response = requests.get(zip_url)
    assert response.status_code == 200, response.status_code
    with zipfile.ZipFile(io.BytesIO(response.content)) as the_zip:
        dataframes = []
        for path in csv_paths:
            with the_zip.open(path) as csv_file:
                csv_content = csv_file.read().decode('utf-8')
                df = pd.read_csv(io.StringIO(csv_content), delimiter=";")
                dataframes.append(df)
    return dataframes

red, white = download_and_load_csvs_from_zip("https://archive.ics.uci.edu/static/public/186/wine+quality.zip", ["winequality-red.csv", "winequality-white.csv"])

red["type"] = "red"
white["type"] = "white"
wine = pd.concat([red, white], axis=0)
wine


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [16]:
# normalize columns
typ = wine["type"]
quality = wine["quality"]
wine_cont = wine.drop(columns = ["type", "quality"])
wine_norm = (wine_cont - wine_cont.mean()) / wine_cont.std()
wine_norm["type"] = typ
wine_norm["quality"] = quality
wine_norm

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,quality
0,0.142462,2.188664,-2.192664,-0.744721,0.569914,-1.100055,-1.446247,1.034913,1.812950,0.193082,-0.915394,red,5
1,0.451001,3.281982,-2.192664,-0.597594,1.197883,-0.311296,-0.862402,0.701432,-0.115064,0.999502,-0.580023,red,5
2,0.451001,2.553104,-1.917405,-0.660648,1.026618,-0.874695,-1.092402,0.768128,0.258100,0.797897,-0.580023,red,5
3,3.073580,-0.362411,1.660957,-0.744721,0.541370,-0.762016,-0.986248,1.101609,-0.363840,0.327485,-0.580023,red,6
4,0.142462,2.188664,-2.192664,-0.744721,0.569914,-1.100055,-1.446247,1.034913,1.812950,0.193082,-0.915394,red,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,-0.783154,-0.787590,-0.197039,-0.807775,-0.486215,-0.367636,-0.420095,-1.186069,0.320294,-0.210128,0.593772,white,6
4894,-0.474615,-0.119451,0.284664,0.537383,-0.257863,1.491582,0.924517,0.067819,-0.426034,-0.478935,-0.747709,white,5
4895,-0.551750,-0.605370,-0.885185,-0.891847,-0.429127,-0.029596,-0.083942,-0.719196,-1.421138,-0.478935,-0.915394,white,6
4896,-1.323097,-0.301671,-0.128224,-0.912866,-0.971463,-0.592996,-0.101635,-2.003097,0.755652,-1.016548,1.935253,white,7


In [35]:
# train and validation split
from sklearn.model_selection import train_test_split
from numpy.random import seed
seed(1)

train, val = train_test_split(wine_norm, test_size=0.1)
train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,quality
136,0.836674,2.279774,-1.160444,-0.765739,0.940986,-1.156395,-1.127786,0.701432,0.071518,1.604317,-0.831551,red,5
313,1.068078,0.791647,-0.128224,-0.513522,0.569914,-0.029596,0.340672,0.968217,0.506876,-0.008523,-0.915394,red,5
955,0.990944,-0.787590,1.385699,-0.744721,0.969530,-1.212735,-1.640862,0.594718,0.880040,0.932300,-0.076968,red,5
4299,-0.706019,-0.362411,-0.128224,0.243130,4.337725,1.660602,0.676825,0.027801,-0.737004,-0.882144,-0.915394,white,6
52,-0.783154,-1.091289,0.078220,-0.912866,0.027577,-0.536656,-0.597018,-1.232757,0.631264,-0.478935,0.342245,white,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,1.530886,1.459786,-0.816370,-0.513522,0.712634,-0.874695,-0.013173,1.101609,0.071518,0.394687,-0.831551,red,5
3593,-1.245962,-0.362411,-0.265853,-0.261304,-0.343495,1.209882,0.747594,-0.899276,0.817846,-0.613338,0.174560,white,7
2381,-0.166076,-0.666110,0.697552,-0.072142,-0.400583,0.364783,0.499903,0.161193,1.750756,0.394687,-0.244653,white,6
235,-0.011807,1.763485,-2.192664,-0.744721,1.169338,-0.931035,-1.375478,0.684758,0.942234,0.327485,-1.250764,red,6


In [None]:
# shuffle
seed(1)
train_s = train.sample(frac=1).reset_index(drop=True)
# 5 fold CV
folds = [train_s.iloc[i*len(train_s)//5:(i+1)*len(train_s)//5].reset_index(drop=True) for i in range(5)]
[len(f) for f in folds]

[1169, 1169, 1170, 1169, 1170]