In [None]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.colors as mcolors

# Prep data

In [10]:
body_order = [
    'Very light-bodied',
    'Light-bodied',
    'Medium-bodied',
    'Full-bodied',
    'Very full-bodied',
]

acidity_order = [
    'Low',
    'Medium',
    'High'
]

food_pairings_cat = {
    'Meat': [
        'Beef',
        'Cured Meat',
        'Game Meat',
        'Lamb',
        'Pork',
        'Poultry',
        'Veal',
    ],
    'Seafood': [
        'Lean Fish',
        'Rich Fish',
        'Shellfish',
    ],
    'Dairy': [
        'Blue Cheese',
        'Goat Cheese',
        'Hard Cheese',
        'Maturated Cheese',
        'Soft Cheese',
    ],
    'Spicy Food': ['Spicy Food'],
    'Pasta': ['Pasta'],
    'Mushrooms': ['Mushrooms'],
    'Vegetables':['Vegetarian'],
    'Sweet':[
        'Sweet Dessert',
        'Fruit Dessert',
    ]
}

cmap_abv = mcolors.LinearSegmentedColormap.from_list(
    "ibu_blue_red",
    ["#9caaea", "#000623"],  # darker blue to darker red
    N=120
)
border_col = "#e3d7ff"  # dark blue
grid_col = "#e3d7ff"  # light purple

In [None]:
# Load the data. from https://github.com/rogerioxavier/X-Wines

data = pd.read_csv('./data/All-XWines_Full_100K_wines_21M_ratings/XWines_Full_100K_wines.csv')
data = data[data['Type'].isin(['White', 'Red'])]  # filter for only white and red wines

# Convert 'Grapes' column to a list of lists
data['Grapes'] = data['Grapes'].apply(lambda x: eval(x) if isinstance(x, str) else x)
# remove rows with more than 1 grape
data = data[data['Grapes'].apply(lambda x: isinstance(x, list) and len(x) == 1)]
# convert 'Grapes' column to string
data['Grapes'] = data['Grapes'].apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)

# get the top 25 most common grapes for white and red wines
grapes_to_keep = []
for wt in ['White', 'Red']:
    freq = data[data['Type'] == wt].value_counts('Grapes').head(25)
    grapes_to_keep.extend(freq.index.tolist())

data = data[data['Grapes'].isin(grapes_to_keep)]


# collate food pairings
data['Harmonize'] = data['Harmonize'].apply(lambda x: eval(x) if isinstance(x, str) else x)

# get unique list of food pairings
food_pairings = set()
for harmonize in data['Harmonize']:
    if isinstance(harmonize, list):
        food_pairings.update(harmonize)
food_pairings = list(food_pairings)

# add indicator columns for each food pairing
for food in food_pairings:
    data[food] = data['Harmonize'].apply(lambda x: 1 if isinstance(x, list) and food in x else 0)
    
# remove food pairings that are not present in at least 50 wines
food_pairings = [food for food in food_pairings if data[food].sum() >= 50]

# get rid of 'Appetizer' and 'Snack' food pairings
food_pairings = [food for food in food_pairings if food not in ['Appetizer', 'Snack']]

print(data.shape)

(54518, 60)


In [7]:
data.head()

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,...,Spicy Food,Lean Fish,Game Meat,Maturated Cheese,Dessert,Shellfish,Sushi,Chicken,Cheese,Soft Cheese
2,100003,Cabernet Sauvignon,Red,Varietal/100%,Cabernet Sauvignon,"[Beef, Lamb, Poultry]",12.0,Full-bodied,High,BR,...,0,0,0,0,0,0,0,0,0,0
5,100006,Reserva Cabernet Sauvignon,Red,Varietal/100%,Cabernet Sauvignon,"[Beef, Lamb, Poultry]",12.5,Full-bodied,High,BR,...,0,0,0,0,0,0,0,0,0,0
7,100008,Paradoxo Cabernet Sauvignon,Red,Varietal/100%,Cabernet Sauvignon,"[Beef, Lamb, Poultry]",13.5,Full-bodied,High,BR,...,0,0,0,0,0,0,0,0,0,0
17,100018,Cave Cabernet Sauvignon,Red,Varietal/100%,Cabernet Sauvignon,"[Beef, Lamb, Poultry]",13.0,Full-bodied,High,BR,...,0,0,0,0,0,0,0,0,0,0
18,100019,Singular Nebbiolo,Red,Varietal/100%,Nebbiolo,"[Beef, Lamb, Veal, Maturated Cheese, Hard Chee...",14.9,Full-bodied,High,BR,...,0,0,0,1,0,0,0,0,0,0


In [9]:
data.value_counts('Acidity')

Acidity
High      44568
Medium     8301
Low        1649
Name: count, dtype: int64

In [8]:
data.value_counts('Body')

Body
Full-bodied          23560
Medium-bodied        17168
Very full-bodied      7694
Light-bodied          6009
Very light-bodied       87
Name: count, dtype: int64

# Create plot

In [11]:
row_heights= [
    data[data['Type'] == 'Red']['Grapes'].nunique(),
    data[data['Type'] == 'White']['Grapes'].nunique(),
]
row_heights = np.array(row_heights) / np.sum(row_heights)
row_heights = row_heights.tolist()


fig = make_subplots(
    rows=2,
    cols=3,
    row_heights=row_heights,
    column_widths=[0.5,0.25, 0.25],
    vertical_spacing=0.02,
    horizontal_spacing=0.03,
)

for k, wine_type in enumerate(['Red', 'White']):
    
    data_s = data[data['Type'] == wine_type]
    
    # group by grape type, and get sum of pairings
    tmp = data_s.groupby('Grapes')[food_pairings].sum()

    # divide each row by the row total
    tmp = tmp.div(tmp.sum(axis=1), axis=0)
    
    # get average body by grape type
    body = []
    acidity = []
    avg = []
    for cat, tmp_i in data_s.groupby('Grapes'):
        row = {
            'Grape': cat
        }
        avg_i = []
        for j, body_i in enumerate(body_order):
            n = tmp_i[tmp_i['Body'] == body_i].shape[0]
            row[body_i] = n
            body.append({
                'Grape': cat,
                'Body': body_i,
                'Percent': 100 * (n / tmp_i.shape[0])
            })
            avg_i += [j] * n

        for j, acidity_i in enumerate(acidity_order):
            n = tmp_i[tmp_i['Acidity'] == acidity_i].shape[0]
            row[acidity_i] = n
            acidity.append({
                'Grape': cat,
                'Acidity': acidity_i,
                'Percent': 100 * (n / tmp_i.shape[0])
            })

        avg.append({
            'Grape': cat,
            'Average Body': float(np.mean(avg_i))
        })
    avg = pd.DataFrame(avg)
    avg = avg.set_index('Grape')
    order = avg.sort_values('Average Body', ascending=True).index 

    body = pd.DataFrame(body)
    body['Grape'] = pd.Categorical(body['Grape'], categories=order, ordered=True)
    body['Body'] = pd.Categorical(body['Body'], categories=body_order, ordered=True)
    body = body.sort_values(['Grape', 'Body'])

    acidity = pd.DataFrame(acidity)
    acidity['Grape'] = pd.Categorical(acidity['Grape'], categories=order, ordered=True)
    acidity['Acidity'] = pd.Categorical(acidity['Acidity'], categories=acidity_order, ordered=True)
    acidity = acidity.sort_values(['Grape', 'Acidity'])
        
    # stacked bar plot of body
    fig.add_trace(
        go.Bar(
            y=body['Grape'],
            x=body['Percent'],
            name=wine_type,
            # make horizontal
            orientation='h',
            marker=dict(
                color=body['Body'].map({
                    'Very light-bodied': "#ffc4cd",
                    'Light-bodied': "#f48999",
                    'Medium-bodied': "#E4627A",
                    'Full-bodied': "#82192A",
                    'Very full-bodied': "#440612"
                })
            ),
            showlegend=False
        ),
        row=k+1, col=1
    )
    fig.update_xaxes(
        showticklabels=True if k == 0 else False,
        side='top',
        row=k+1, col=1,
        range=[0, 100],
        linewidth=1, linecolor=border_col, mirror=True
    )
    fig.update_yaxes(
        # showticklabels=False,
        linewidth=1, linecolor=border_col, mirror=True,
        row=k+1, col=1
    )
    
    # stacked bar plot of acidity
    fig.add_trace(
        go.Bar(
            y=acidity['Grape'],
            x=acidity['Percent'],
            name=wine_type,
            # make horizontal
            orientation='h',
            marker=dict(
                color=acidity['Acidity'].map({
                    'Low': "#fff0c0",
                    'Medium': "#e0c25c",
                    'High': "#c29604"
                })
            ),
            showlegend=False
        ),
        row=k+1, col=2
    )
    
    # hide x and y axis ticks and labels
    fig.update_xaxes(
        showticklabels=True if k == 0 else False,
        side='top',
        range=[0, 100],
        row=k+1, col=2,
        linewidth=1, linecolor=border_col, mirror=True
    )
    fig.update_yaxes(
        showticklabels=False,
        linewidth=1, linecolor=border_col, mirror=True,
        row=k+1, col=2
    )
    
    # abv box plot horizontal
    
    data_s['Grapes'] = pd.Categorical(data_s['Grapes'], categories=order, ordered=True)
    data_s = data_s.sort_values('Grapes')
    for grape_type in order:
        data_s_grape = data_s[data_s['Grapes'] == grape_type]
        
        col_i = (data_s_grape['ABV'].median() - 12.5) / (15 - 12.5)
        col_i = mcolors.to_hex(cmap_abv(col_i))
        
        fig.add_trace(
            go.Box(
                x=data_s_grape['ABV'],
                y=data_s_grape['Grapes'],
                name=wine_type,
                orientation='h',
                marker=dict(
                    size=2,
                    color=col_i
                ),
                showlegend=False
            ),
            row=k+1, col=3
        )
    # range from 5 - 20
    fig.update_xaxes(
        range=[8, 18],
        # move ticks to the top of the plot
        side='top',
        showticklabels=True if k == 0 else False,
        row=k+1, col=3,
        linewidth=1, linecolor=border_col, mirror=True
    )
    fig.update_yaxes(
        showticklabels=False,
        row=k+1, col=3,
        linewidth=1, linecolor=border_col, mirror=True
    )

fig.update_layout(
    showlegend=False,
    paper_bgcolor="#f5edff",  # very light yellow
    plot_bgcolor="#fef4fa",   # very light yellow
    font=dict(family="Helvetica, sans-serif", size=20, color="#000", weight='bold'),
)

fig.update_yaxes(gridcolor=grid_col)
fig.update_xaxes(gridcolor=grid_col, zerolinecolor=grid_col)

fig.write_image('./plots/wines.svg', width=1000, height=1900)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

