In [1]:
import requests
import zipfile
import io
import pandas as pd
from pyprojroot.here import here
from duckdb import sql
import os
import joblib

def download_and_load_csvs_from_zip(zip_url, csv_paths: str | list[str]) -> list:
    if type(csv_paths) is str: csv_paths = [csv_paths]
    response = requests.get(zip_url)
    assert response.status_code == 200, response.status_code
    with zipfile.ZipFile(io.BytesIO(response.content)) as the_zip:
        dataframes = []
        for path in csv_paths:
            with the_zip.open(path) as csv_file:
                csv_content = csv_file.read().decode('utf-8')
                df = pd.read_csv(io.StringIO(csv_content), delimiter=";")
                dataframes.append(df)
    return dataframes
# Load the combined red + white wine table from the local joblib cache, or
# build it from the UCI archive on the first run and cache it for later runs.
wine_cache = here("data/wine.joblib")
if not os.path.exists(wine_cache):
    red, white = download_and_load_csvs_from_zip(
        "https://archive.ics.uci.edu/static/public/186/wine+quality.zip",
        ["winequality-red.csv", "winequality-white.csv"],
    )
    red["type"] = "red"
    white["type"] = "white"
    # Concatenated without ignore_index, so each frame keeps its own row
    # labels and the labels repeat across the red and white parts.
    wine = pd.concat([red, white], axis=0)
    # Make sure the cache directory exists; otherwise the dump fails on a
    # fresh checkout after the download has already been done.
    os.makedirs(os.path.dirname(wine_cache), exist_ok=True)
    joblib.dump(wine, wine_cache)
else:
    # NOTE(review): joblib.load unpickles the file; only load caches written
    # by this notebook.
    wine = joblib.load(wine_cache)
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [2]:
# Replace the string `type` column with a boolean `is_red` flag, leaving
# `wine` itself untouched.
wine_dum = (
    wine
    .assign(is_red=wine["type"] == "red")
    .drop(columns=["type"])
)

wine_dum

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,True
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,True
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,True
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,True
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,False
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,False
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,False
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,False


In [3]:
from numpy.random import seed

# Seed NumPy's global RNG before shuffling so the row order is reproducible.
seed(1)
wine_shuffled = wine_dum.sample(frac=1).reset_index(drop=True)

# 5-fold CV: cut the shuffled frame into contiguous slices at integer edges.
fold_n = 5
n_rows = len(wine_shuffled)
fold_edges = [k * n_rows // fold_n for k in range(fold_n + 1)]
# Each fold keeps the row labels of `wine_shuffled` (no per-fold reset_index).
folds = [
    wine_shuffled.iloc[start:stop]
    for start, stop in zip(fold_edges[:-1], fold_edges[1:])
]
list(map(len, folds))

[1299, 1299, 1300, 1299, 1300]

In [4]:
# Preview the split for the first fold only: the loop exits after one pass,
# so `test_cv` is fold 0 and `train_cv` is every other fold stacked together.
for i, test_cv in enumerate(folds):
    train_cv = pd.concat(fold for j, fold in enumerate(folds) if j != i)
    break
train_cv

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
1299,6.9,0.22,0.37,15.00,0.053,59.0,178.0,0.99920,3.37,0.82,9.5,7,white
1300,6.3,0.13,0.42,1.10,0.043,63.0,146.0,0.99066,3.13,0.72,11.2,7,white
1301,7.4,0.61,0.01,2.00,0.074,13.0,38.0,0.99748,3.48,0.65,9.8,5,red
1302,6.4,0.28,0.27,11.00,0.042,45.0,148.0,0.99786,3.14,0.46,8.7,5,white
1303,8.1,0.20,0.40,2.00,0.037,19.0,87.0,0.99210,3.12,0.54,11.2,6,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,9.2,0.58,0.20,3.00,0.081,15.0,115.0,0.99800,3.23,0.59,9.5,5,red
6493,5.6,0.28,0.28,4.20,0.044,52.0,158.0,0.99200,3.35,0.44,10.7,7,white
6494,7.0,0.23,0.42,5.10,0.042,37.0,144.0,0.99518,3.50,0.59,10.2,6,white
6495,7.2,0.63,0.00,1.90,0.097,14.0,38.0,0.99675,3.37,0.58,9.0,6,red


In [6]:
# Persist the list of fold DataFrames under the project's data directory.
# The trailing semicolon keeps the notebook from echoing the call's return value.
joblib.dump(folds, here("data/folds.joblib"));