# Libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Personas

In [2]:
# Persona display names, in the row order used by the persona table.
persona_names = ['Caffeine Junkie Richard', 'Purist Pierce', 'Stingy Steve', 'Hippie Hendrix']

# 0-10 ratings per feature; every list is aligned position-by-position with persona_names.
feature_ratings = {
    'Caffeine':             [10,  4,  7,  3],
    'Sugar':                [ 5,  3,  7,  0],
    'Artificial Sweetener': [ 6,  0,  5,  0],
    'Price':                [10,  7,  0,  8],
    'Size':                 [ 4,  4, 10,  3],
    'Calories':             [ 4,  3,  7,  2],
    'Energy Drink':         [ 8,  0,  5,  0],
    'Coffee':               [ 5, 10,  5,  8],
    'Tea':                  [ 0,  6,  5, 10],
}

# feature -> {persona name -> rating}
personas = {
    feature: dict(zip(persona_names, ratings))
    for feature, ratings in feature_ratings.items()
}

# One row per persona: the name in a 'Persona' column, followed by the nine feature columns.
df_persona = pd.DataFrame(personas).rename_axis('Persona').reset_index()
df_persona.head()


Unnamed: 0,Persona,Caffeine,Sugar,Artificial Sweetener,Price,Size,Calories,Energy Drink,Coffee,Tea
0,Caffeine Junkie Richard,10,5,6,10,4,4,8,5,0
1,Purist Pierce,4,3,0,7,4,3,0,10,6
2,Stingy Steve,7,7,5,0,10,7,5,5,5
3,Hippie Hendrix,3,0,0,8,3,2,0,8,10


In [3]:
# Standardize the persona ratings against each other: every column after
# 'Persona' is rescaled by a StandardScaler fit on the persona rows only.
features_to_standardize = df_persona.columns[1:]
scaler = StandardScaler()
scaled_ratings = scaler.fit_transform(df_persona[features_to_standardize])

# Same layout as df_persona, with each rating column replaced by its
# standardized values (scaled_ratings.T yields one array per feature column).
df_persona_std = df_persona.assign(**dict(zip(features_to_standardize, scaled_ratings.T)))
df_persona_std.head()

Unnamed: 0,Persona,Caffeine,Sugar,Artificial Sweetener,Price,Size,Calories,Energy Drink,Coffee,Tea
0,Caffeine Junkie Richard,1.460593,0.483368,1.172171,0.995585,-0.450835,0.0,1.389418,-0.942809,-1.473911
1,Purist Pierce,-0.730297,-0.290021,-0.991837,0.199117,-0.450835,-0.534522,-0.950654,1.414214,0.210559
2,Stingy Steve,0.365148,1.256757,0.811503,-1.659308,1.713172,1.603567,0.511891,-0.942809,-0.070186
3,Hippie Hendrix,-1.095445,-1.450105,-0.991837,0.464606,-0.811503,-1.069045,-0.950654,0.471405,1.333539


# Caffeine

In [4]:
# Load the drink catalogue from the CSV that sits next to the notebook.
caffeine_csv = './Caffeine.csv'
df_caffeine = pd.read_csv(caffeine_csv)
df_caffeine.head()

Unnamed: 0,Location,Drink Name,Type,Price ($),Caffeine (mg),Calories,Volume (oz),Sugar (g),Artificial Sweetner,Hot/Cold
0,Campus Market,Redbull,Energy Drink,4.85,80.0,110,8.4,26,N,Cold
1,Campus Market,Redbull (large),Energy Drink,6.8,114.0,160,12.0,38,N,Cold
2,Campus Market,Redbull Sugarfree,Energy Drink,4.85,80.0,10,8.4,0,Y,Cold
3,Campus Market,Redbull Sugarfree (large),Energy Drink,6.8,114.0,20,12.0,0,Y,Cold
4,Campus Market,Yerba Mate (Normal),Tea,4.2,150.0,120,16.0,27,N,Cold


In [5]:
# Encode the categorical columns as integer 0/1 flags.
# Vectorized comparisons replace the row-wise apply(lambda ...) calls, and the
# flags stay integer-typed (the drink-type flags were previously cast to object
# dtype, which only forced downstream numeric code to coerce them back).
# Note the source column is spelled 'Artificial Sweetner'; the new column fixes the spelling.
df_caffeine['Artificial Sweetener'] = df_caffeine['Artificial Sweetner'].eq('Y').astype(int)
df_caffeine['Cold'] = df_caffeine['Hot/Cold'].eq('Cold').astype(int)

# One-hot encode the drink type: one flag column per category used by the personas.
for drink_type in ('Energy Drink', 'Coffee', 'Tea'):
    df_caffeine[drink_type] = df_caffeine['Type'].eq(drink_type).astype(int)

# Drop the raw categorical columns and strip the units from the numeric column
# names so they match the persona feature names.
df_caffeine = (
    df_caffeine
    .drop(columns=['Artificial Sweetner', 'Hot/Cold', 'Type'])
    .rename(columns={"Price ($)": "Price",
                     "Caffeine (mg)": "Caffeine",
                     "Sugar (g)": "Sugar",
                     "Volume (oz)": "Size"})
)
df_caffeine.head()

Unnamed: 0,Location,Drink Name,Price,Caffeine,Calories,Size,Sugar,Artificial Sweetener,Cold,Energy Drink,Coffee,Tea
0,Campus Market,Redbull,4.85,80.0,110,8.4,26,0,1,1,0,0
1,Campus Market,Redbull (large),6.8,114.0,160,12.0,38,0,1,1,0,0
2,Campus Market,Redbull Sugarfree,4.85,80.0,10,8.4,0,1,1,1,0,0
3,Campus Market,Redbull Sugarfree (large),6.8,114.0,20,12.0,0,1,1,1,0,0
4,Campus Market,Yerba Mate (Normal),4.2,150.0,120,16.0,27,0,1,0,0,1


In [6]:
# Features shared by the drink table and the persona ratings, in the column
# order the scaler is fit on.
feature_cols = ['Caffeine', 'Sugar', 'Artificial Sweetener', 'Price', 'Size',
                'Calories', 'Energy Drink', 'Coffee', 'Tea']
relevant_features = df_caffeine[feature_cols]

# Fit the scaler on the drinks, then push the personas through the same transform.
scaler = StandardScaler()
standardized_drinks = scaler.fit_transform(relevant_features)

# Select the persona features by name (not by position) so the columns are
# guaranteed to line up with what the scaler was fit on.
# df_persona_std is rebound here to a NumPy array (one row per persona); later
# cells index it and do arithmetic on it as an array.
# NOTE(review): the persona values are 0-10 preference ratings, while the scaler
# was fit on drink attributes in their own units (price, caffeine, 0/1 type
# flags, ...). Transforming the ratings with it treats e.g. a rating of 10 as a
# raw attribute value of 10 — confirm this is the intended comparison.
df_persona_std = scaler.transform(df_persona[feature_cols])

drink_names = df_caffeine['Drink Name'].values

# Euclidean distance from every persona to every drink in one broadcasted step:
# (1, drinks, features) - (personas, 1, features) -> (personas, drinks, features).
persona_drink_diffs = standardized_drinks[np.newaxis, :, :] - df_persona_std[:, np.newaxis, :]
persona_drink_distances = np.sqrt((persona_drink_diffs ** 2).sum(axis=2))

# Rows are personas, columns are drinks. Building the frame from the finished
# matrix keeps the distances as floats instead of filling an object-dtype frame
# row by row.
df_final = pd.DataFrame(persona_drink_distances,
                        index=df_persona['Persona'],
                        columns=df_caffeine['Drink Name'])

# The index is named 'Persona', so resetting it yields a 'Persona' column directly.
df_final = df_final.reset_index()

df_final.head()

Drink Name,Persona,Redbull,Redbull (large),Redbull Sugarfree,Redbull Sugarfree (large),Yerba Mate (Normal),Yerba Mate (Tropical Uprising),Yerba Mate (Berry Lemonade),Yerba Mate (Bottle),Yerba Mate (Classic Gold),...,Iced Caramel Macchiato,Matcha Tea Latte,Honey Citrus Mint Tea,Chai Tea Latte,Earl Grey Tea,Royal English Breakfast Tea,Iced Black Tea Lemonade,Iced Chai Tea Latte,Iced Matcha Tea Latte,Iced Peach Green Tea
0,Caffeine Junkie Richard,28.108665,27.929457,25.920335,25.667693,29.950137,29.913226,29.907216,29.933066,29.904738,...,29.096582,29.808794,29.893394,29.819063,30.008792,30.008792,29.884644,29.824989,29.800688,29.881555
1,Purist Pierce,24.225698,24.267037,24.391083,24.368111,23.282814,23.231442,23.221497,23.259675,23.183002,...,22.742947,23.236779,23.211328,23.250796,23.248919,23.248919,23.200116,23.259307,23.226337,23.195267
2,Stingy Steve,24.503364,25.048301,22.441233,22.973258,24.654948,24.61424,24.609343,24.634753,24.528424,...,25.179926,24.909129,24.603103,24.918795,24.422201,24.422201,24.59392,24.92631,24.899549,24.589756
3,Hippie Hendrix,27.417116,27.392528,27.555572,27.470715,25.907368,25.856109,25.844279,25.885556,25.820021,...,26.409306,25.826445,25.841283,25.841858,25.896805,25.896805,25.830105,25.849652,25.817011,25.82562


In [7]:
drink_names = df_caffeine['Drink Name'].values

# Euclidean distance from every persona to every drink, computed in one
# broadcasted step instead of growing the frame one row at a time:
# (1, drinks, features) - (personas, 1, features) -> (personas, drinks, features).
persona_drink_diffs = standardized_drinks[np.newaxis, :, :] - df_persona_std[:, np.newaxis, :]
persona_drink_distances = np.sqrt((persona_drink_diffs ** 2).sum(axis=2))

# One row per persona, one float column per drink, with the persona name as
# the leading 'Persona' column.
df_distances = pd.DataFrame(persona_drink_distances, columns=drink_names)
df_distances.insert(0, 'Persona', df_persona['Persona'].values)

def print_top_n_closest_drinks(df, n, persona_col='Persona'):
    """Print, for every row of ``df``, the ``n`` drinks with the smallest distance.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per persona. Holds the persona name in ``persona_col`` and one
        column of distances per drink (the column label is the drink name).
    n : int
        Number of closest drinks to print per persona.
    persona_col : str, default 'Persona'
        Label of the column holding the persona name. It is removed by label
        before ranking, so it may sit at any position in ``df``.

    Returns
    -------
    None
        Output goes to stdout: a ``Persona: <name>`` line, one line per drink
        in ascending distance order, then a blank line.
    """
    for _, row in df.iterrows():
        # Drop the name by label rather than slicing off position 0, so the
        # ranking never mixes the persona string in with the distances.
        closest = row.drop(labels=persona_col).sort_values().head(n)
        print(f"Persona: {row[persona_col]}")
        for drink in closest.index:
            print(drink)
        print()

# List the five nearest drinks for each persona.
print_top_n_closest_drinks(df_distances, n=5)


Persona: Caffeine Junkie Richard
Redbull Sugarfree (large)
Redbull Sugarfree
Reign Storm 12oz
Mega Monster Energy Zero Ultra
Monster Energy Zero Ultra

Persona: Purist Pierce
Rise Brewing Oat Milk Vanilla
Rise Brewing Oat Milk Mocha
Califia farms salted carmel almond latte
Califia farms mocha almond latte
Califia farms espresso almond latte

Persona: Stingy Steve
Monster Energy Zero Ultra
Monster Energy Ultra Sunrise
Reign Storm 12oz
Redbull Sugarfree
Reign 16oz

Persona: Hippie Hendrix
Yerba Mate (Cranberry Pomegranate)
Iced Matcha Tea Latte
Yerba Mate (Classic Gold)
Iced Peach Green Tea
Matcha Tea Latte



In [97]:
# Persist the persona-to-drink distance table. With to_csv's default settings
# the row index is written as well, as the first (unnamed) column.
similarities_csv = './Similarities.csv'
df_distances.to_csv(similarities_csv)

# Recs

In [8]:
# Nick's 0-10 ratings as a single-row frame, labelled with the persona feature columns.
nick = pd.DataFrame([[7, 5, 2, 6, 7, 5, 4, 10, 5]], columns=df_persona.columns[1:])

# Put Nick through the same scaler that produced standardized_drinks / df_persona_std.
test_subject_std = scaler.transform(nick)

# Closest persona: smallest Euclidean distance to any standardized persona row.
persona_gaps = np.sqrt(((df_persona_std - test_subject_std) ** 2).sum(axis=1))
distances_series = pd.Series(persona_gaps)
closest_persona = df_persona.loc[distances_series.idxmin(), 'Persona']
print(f"Nick: {closest_persona}")

# Closest drinks: same distance measure against every standardized drink row.
distances = np.sqrt(((standardized_drinks - test_subject_std) ** 2).sum(axis=1))
df_distances = pd.DataFrame([distances], columns=drink_names)

n = 5

sorted_drinks = df_distances.iloc[0].sort_values().head(n)
for drink in sorted_drinks.index:
    print(drink)

Nick: Purist Pierce
Rise Brewing Oat Milk Vanilla
Califia farms salted carmel almond latte
Califia farms mocha almond latte
Rise Brewing Oat Milk Mocha
Califia farms espresso almond latte


# Demo

In [9]:
# The professor's 0-10 ratings as a single-row (1, 9) array.
professor = np.array([[7, 0, 0, 9, 3, 3, 0, 10, 7]])

In [10]:
# Label the professor's ratings with the persona feature columns.
# np.asarray keeps this cell re-runnable: on a second run `professor` is
# already a DataFrame, which has no .reshape method.
professor = pd.DataFrame(np.asarray(professor).reshape(1, -1), columns=df_persona.columns[1:])

# Put the ratings through the same scaler as the personas, then pick the
# persona row at the smallest Euclidean distance.
professor_std = scaler.transform(professor)
distances = np.sqrt(np.sum((df_persona_std - professor_std) ** 2, axis=1))
distances_series = pd.Series(distances)
closest_persona = df_persona.loc[distances_series.idxmin(), 'Persona']
print(f"Professor: {closest_persona}")

Professor: Purist Pierce


In [11]:
# Euclidean distance from the professor's standardized ratings to every drink.
distances = np.sqrt(((standardized_drinks - professor_std) ** 2).sum(axis=1))
df_distances = pd.DataFrame([distances], columns=drink_names)

n = 5

# The n nearest drinks, closest first.
sorted_drinks = df_distances.iloc[0].sort_values().head(n)
for drink in sorted_drinks.index:
    print(drink)

Rise Brewing Oat Milk Vanilla
Rise Brewing Oat Milk Mocha
Califia farms salted carmel almond latte
Califia farms mocha almond latte
Califia farms espresso almond latte


In [12]:
# Show the catalogue rows of the recommended drinks. The boolean mask keeps
# df_caffeine's row order rather than the ranking order.
is_recommended = df_caffeine['Drink Name'].isin(sorted_drinks.index)
df_caffeine[is_recommended]

Unnamed: 0,Location,Drink Name,Price,Caffeine,Calories,Size,Sugar,Artificial Sweetener,Cold,Energy Drink,Coffee,Tea
18,Campus Market,Califia farms mocha almond latte,4.79,40.0,110,10.5,13,0,1,0,1,0
19,Campus Market,Califia farms espresso almond latte,4.79,115.0,100,10.5,13,0,1,0,1,0
20,Campus Market,Califia farms salted carmel almond latte,4.79,50.0,100,10.5,13,0,1,0,1,0
21,Campus Market,Rise Brewing Oat Milk Vanilla,4.95,70.0,120,7.0,10,0,1,0,1,0
22,Campus Market,Rise Brewing Oat Milk Mocha,4.95,70.0,150,7.0,10,0,1,0,1,0


# Try Your Own

Caffeine: 0-10 = LOW-HIGH\
Sugar: 0-10 = LOW-HIGH\
Artificial Sweetener: 0-10 = HATE-LOVE\
Price: 0-10 = CHEAP-EXPENSIVE\
Size: 0-10 = SMALL-LARGE\
Calories: 0-10 = LOW-HIGH\
Energy Drink: 0-10 = HATE-LOVE\
Coffee: 0-10 = HATE-LOVE\
Tea: 0-10 = HATE-LOVE

In [13]:
# Enter your nine 0-10 ratings in the order listed above:
# [Caffeine, Sugar, Artificial Sweetener, ..., Tea]
test_subject = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [14]:
# Label the ratings with the persona feature columns.
# np.asarray keeps this cell re-runnable: on a second run `test_subject` is
# already a DataFrame, which has no .reshape method.
test_subject = pd.DataFrame(np.asarray(test_subject).reshape(1, -1), columns=df_persona.columns[1:])

# Put the ratings through the same scaler as the drinks and measure the
# Euclidean distance to every standardized drink row.
your_name_std = scaler.transform(test_subject)
distances = np.sqrt(np.sum((standardized_drinks - your_name_std) ** 2, axis=1))
df_distances = pd.DataFrame([distances], columns=drink_names)

# Number of drink recs
n = 5

# The n nearest drinks, closest first.
sorted_drinks = df_distances.iloc[0].sort_values().head(n)
for drink in sorted_drinks.index:
    print(drink)

Iced Espresso
Espresso
Espresso Macchiato
Starbucks Pike Place Roast
Royal English Breakfast Tea


In [15]:
# Show the catalogue rows of the recommended drinks. The boolean mask keeps
# df_caffeine's row order rather than the ranking order.
is_recommended = df_caffeine['Drink Name'].isin(sorted_drinks.index)
df_caffeine[is_recommended]

Unnamed: 0,Location,Drink Name,Price,Caffeine,Calories,Size,Sugar,Artificial Sweetener,Cold,Energy Drink,Coffee,Tea
35,Campus Market,Starbucks Pike Place Roast,2.45,155.0,5,12.0,0,0,0,0,1,0
41,Starbucks,Espresso,2.95,150.0,5,1.5,0,0,0,0,1,0
44,Starbucks,Espresso Macchiato,3.05,150.0,15,1.5,0,0,0,0,1,0
52,Starbucks,Iced Espresso,2.95,150.0,5,1.5,0,0,1,0,1,0
61,Starbucks,Royal English Breakfast Tea,3.45,40.0,0,16.0,0,0,0,0,0,1
