In [2]:
import pandas as pd
import numpy as np

In [5]:
def assign_quant (dataset, col, data_type=None):
    """
    Compute the four cut points (0.2, 0.4, 0.6 and 0.8 quantiles) of a column,
    which divide its values into five equally sized groups.

    Args:
    dataset : DataFrame holding the data
    col : str, name of the column whose quantiles are computed
    data_type : optional value of the "type" column; when truthy, only rows
        whose "type" equals it are used

    Returns: list of 4 values in ascending quantile order
    """
    subset = dataset if not data_type else dataset[dataset["type"] == data_type]
    values = subset[col]

    cut_points = []
    for tenths in (2, 4, 6, 8):
        cut_points.append(np.quantile(values, tenths / 10))
    return cut_points


In [30]:
def _quantile_score(value, quants):
    """Score value against ascending cut points; lower values get higher scores."""
    top_score = len(quants)
    if value < quants[0]:
        return top_score
    for idx in range(1, top_score):
        if quants[idx] > value >= quants[idx - 1]:
            return top_score - idx
    # At or above the last cut point. NaN also ends up here, because every
    # comparison against NaN is False.
    return 0


def split_data(row, attribute, quants_red, quants_white, data_type=None):
    """
    Score one row's attribute from 0-4 according to where it falls among the
    quantile cut points for its wine type. Meant to be applied to a data set
    through .apply(..., axis=1).

    The scale is inverted: values below the first cut point score 4, and values
    at or above the last cut point score 0.

    Args:
    row : row of the dataset to be evaluated (anything indexable by column name)
    attribute : str, name of the column to be scored
    quants_red : list of 4 ascending cut points (the 0.2, 0.4, 0.6 and 0.8
        quantiles) used for red wines
    quants_white : same as quants_red, but used for white wines
    data_type : str, 'red' or 'white'. When given, it decides which cut points
        are used and row["type"] is never read. Any other value is ignored and
        the row's own "type" is used instead.

    Returns: int score 0-4, or None when the wine type is neither 'red' nor 'white'

    Why use data_type? First, if the dataset holds only red or only white wine
    and has no "type" column, passing data_type avoids a KeyError. Second, if
    you want one set of cut points for every row, set data_type to 'red' (or
    'white'); the other quantile list is then never touched and may be None.
    """
    # Only fall back to the row when data_type is not a usable override, so a
    # dataset without a "type" column works as documented.
    wine_type = data_type if data_type in ("red", "white") else row["type"]

    if wine_type == "red":
        return _quantile_score(row[attribute], quants_red)
    if wine_type == "white":
        return _quantile_score(row[attribute], quants_white)
    # Unknown wine type: no score.
    return None
    

In [18]:
# TEST: load the wine CSV that the cells below use to exercise assign_quant and split_data.
# The path is relative, so the file must sit in the notebook's working directory.
wine = pd.read_csv("wine-quality-white-and-red.csv")

In [19]:
# pH cut points, computed separately for red and white wines.
red_quants, white_quants = (
    assign_quant(wine, col="pH", data_type=wine_type)
    for wine_type in ("red", "white")
)
# Bare tuple as the last expression so the notebook displays both lists.
(red_quants, white_quants)

([3.18, 3.28, 3.35, 3.424000000000001], [3.06, 3.14, 3.22, 3.31])

In [29]:
# Score each row's pH against the cut points for that row's wine type.
wine["new_col"] = wine.apply(
    lambda row: split_data(row, "pH", red_quants, white_quants),
    axis=1,
)
wine.head()

white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
white
whit

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,new_col
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,4
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,1
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,2
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,2
