In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import random

In [2]:
# Import the DF created by Antonio where we get the address, lat and lon for each row

geolocation_df = pd.read_csv('data/latlon_wine_data.csv')

In [3]:
# Creating an extra column to add the address of each row in the original dataset, so that it can be used as key for the merge in the following step

expanded_df = pd.read_csv('data/expanded_dataframe.csv')
expanded_df['address'] = expanded_df['region'] + ', ' + expanded_df['province'] + ', ' + expanded_df['country']

In [4]:
# Check if the column was added correctly
expanded_df.columns

Index(['Unnamed: 0', 'country', 'description', 'points', 'price', 'province',
       'region', 'title', 'variety', 'winery', 'processed_description',
       'dry_wine', 'sweet_wine', 'fruity_aroma', 'spicy_aroma', 'herb_aroma',
       'oak_aroma', 'chocolate_aroma', 'floral_aroma', 'body_light',
       'body_medium', 'body_full', 'soft_tex', 'creamy_tex', 'structured_tex',
       'silky_tex', 'wine_type', 'address'],
      dtype='object')

In [5]:
# Merge/join the two dataframes on the address, so that for each wine we have speicfic lat and lon and we can use it later on in the UI to show the location
# of the recommended wines visually in a map

expanded_df= expanded_df.merge(geolocation_df[['address','lat','lon']], how = 'left', on='address' )
expanded_df.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,points,price,province,region,title,variety,winery,...,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type,address,lat,lon
0,0,US,The vineyard is one of the better Chardonnay s...,92,36.0,California,"Alexander Valley, Sonoma",Matrix 2007 Stuhlmuller Vineyard Chardonnay (A...,chardonnay,Matrix,...,0,1,0,0,0,0,white,"Alexander Valley, Sonoma, California, US",38.612965,-122.769435
1,1,US,Defines Rockpile Zinfandel in intensity of fru...,92,39.0,California,"Rockpile, Sonoma",Mauritson 2007 Rockpile Cemetary Vineyard Zinf...,zinfandel,Mauritson,...,0,1,0,0,0,0,red,"Rockpile, Sonoma, California, US",38.608666,-122.869832
2,2,US,This sophisticated wine is mostly Cabernet Sau...,92,45.0,California,"Napa Valley, Napa",Silverado 2006 Cabernet Sauvignon (Napa Valley),cabernet sauvignon,Silverado,...,0,1,0,0,1,0,red,"Napa Valley, Napa, California, US",38.297538,-122.286865


In [6]:
# Rename the index column
expanded_df= expanded_df.rename(columns={'Unnamed: 0': 'index'})

In [7]:
# Fit the KNN model

X = expanded_df[['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full']]
y = expanded_df['index']

neigh = KNeighborsRegressor(n_neighbors=15)
neigh.fit(X, y)

In [8]:
test = expanded_df.iloc[[1726]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full']]

neigh.kneighbors(test, n_neighbors=15, return_distance=True)

(array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[201, 329, 169, 307, 134, 142,  58, 238, 115,  19, 135, 189,  23,
         202,  12]]))

In [9]:
# Getting the datapoints of the features of one of the neighbors
neighbor = expanded_df.iloc[[17286]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full']]

In [10]:
# Check the values of my datapoints
test_v = test.values

# Check the values of the neighbor's datapoints
neighbor_v = neighbor.values

In [11]:
# Dot product of the vectors of neighbor and test datapoints to check how accurate the neighbor is compared to the test feature

np.dot(neighbor_v, test_v.T)/((np.dot(neighbor_v, neighbor_v.T)**0.5)*(np.dot(test_v, test_v.T))**0.5)

array([[0.81649658]])

In [12]:
input_row = np.random.randint(0, 55350)
test = expanded_df.iloc[[input_row]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full']]

n_datapoints = neigh.kneighbors(test, n_neighbors=15, return_distance=True)
n_datapoints

(array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[521, 670, 383, 612, 246, 357, 508, 579, 148, 198, 272, 289, 453,
          37,  75]]))

In [13]:
def match_type(x: int):
    '''Input the index of a wine and returns a dataframe whose rows are the
    wine's nearest neighbors whose wine type matches our original wine'''
    # Finding the nearest neighbors
    input = expanded_df.iloc[[x]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma',
                                   'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full']]

    nearest_neighbors = neigh.kneighbors(input, n_neighbors=15, return_distance=True)
    neighs = nearest_neighbors[1][0].tolist()

    # Matching the wine type
    wine_type = expanded_df.iloc[x]['wine_type']

    neighbors = expanded_df.iloc[neighs]
    matching_neighbors = neighbors[neighbors['wine_type'] == wine_type]

    return matching_neighbors

In [14]:
match_type(input_row)

Unnamed: 0,index,country,description,points,price,province,region,title,variety,winery,...,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type,address,lat,lon
521,521,US,"Very light pink, this rosé offers extremely pl...",92,28.0,California,"Santa Ynez Valley, Central Coast",Cebada 2014 Estate Grown Rosé of Pinot Noir (S...,pinot noir,Cebada,...,0,0,0,0,0,0,red,"Santa Ynez Valley, Central Coast, California, US",34.584154,-120.097369
670,670,US,"A study of umami, this wine shows shiitake mus...",93,35.0,California,"Monterey, Central Coast",Wrath 2012 Swan/828 Pinot Noir (Monterey),pinot noir,Wrath,...,0,0,0,0,0,1,red,"Monterey, Central Coast, California, US",36.600238,-121.894676
612,612,US,From the prestigious Rutherford appellation co...,91,36.0,California,"Rutherford, Napa",Del Bondio 2006 Cabernet Sauvignon (Rutherford),cabernet sauvignon,Del Bondio,...,0,0,1,0,0,0,red,"Rutherford, Napa, California, US",38.459101,-122.422564
508,508,US,This excellent Merlot speaks to many of Reinin...,91,39.0,Washington,"Walla Walla Valley (WA), Columbia Valley",Reininger 2010 Merlot (Walla Walla Valley (WA)),merlot,Reininger,...,0,0,0,0,0,0,red,"Walla Walla Valley (WA), Columbia Valley, Wash...",46.064581,-118.343021
272,272,US,A slightly bitter tobacco note frames black-ch...,92,75.0,Oregon,"Willamette Valley,",WillaKenzie Estate 2014 Triple Black Slopes Pi...,pinot noir,WillaKenzie Estate,...,0,0,0,0,0,0,red,"Willamette Valley, , Oregon, US",44.942554,-122.933762
453,453,US,Slightly funky scents of asphalt and sweaty so...,91,60.0,California,"Santa Cruz Mountains, Central Coast",Mount Eden Vineyards 2010 Estate Cabernet Sauv...,cabernet sauvignon,Mount Eden Vineyards,...,0,0,0,0,0,0,red,"Santa Cruz Mountains, Central Coast, Californi...",37.110892,-121.844891
75,75,US,This is one of the best balanced and most eleg...,92,30.0,California,"Mokelumne River, Central Valley",St. Amant 2013 Lodi Native Marian's Vineyard Z...,zinfandel,St. Amant,...,0,0,0,0,0,0,red,"Mokelumne River, Central Valley, California, US",40.680428,-122.370842


In [15]:
len(match_type(input_row))

7

In [16]:
def describe(x: pd.DataFrame):
    '''Returns the descriptions of a df of wines'''
    for i in range(len(x)):
        print(f'Wine number {i+1}:\n')
        print(f"This is a {x.iloc[i]['variety']} from {x.iloc[i]['country']}\n")
        print(x.iloc[i]['description'])
        print('-'*100)

In [17]:
describe(match_type(input_row))

Wine number 1:

This is a pinot noir from US

Very light pink, this rosé offers extremely pleasant and approachable aromas of nectarine, pluot, plum, lime and a dusting of vanilla. These give way to a light and bright palate, where lime zest cuts through a bubble gum core. Yeasty notes adds further nuance.
----------------------------------------------------------------------------------------------------
Wine number 2:

This is a pinot noir from US

A study of umami, this wine shows shiitake mushrooms, decomposing wood, soy and hibiscus on the nose. The palate is silky smooth yet with immediate verve, unfolding with flavors of loam, beef char, teriyaki-laced berries, smashed plums and graphite. It's focused yet expansive.
----------------------------------------------------------------------------------------------------
Wine number 3:

This is a cabernet sauvignon from US

From the prestigious Rutherford appellation comes this soft, smooth and mellow Cabernet, with delicious flavors 

In [18]:
['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma','chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full']

['dry_wine',
 'sweet_wine',
 'fruity_aroma',
 'spicy_aroma',
 'herb_aroma',
 'oak_aroma',
 'chocolate_aroma',
 'floral_aroma',
 'body_light',
 'body_medium',
 'body_full']

In [20]:
X_predict = np.array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])

neigh.predict(X_predict)



array([25587.4])

In [21]:
expanded_df.iloc[[45884]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma',
                                   'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full']]

Unnamed: 0,dry_wine,sweet_wine,fruity_aroma,spicy_aroma,herb_aroma,oak_aroma,chocolate_aroma,floral_aroma,body_light,body_medium,body_full
45884,0,0,1,1,0,0,0,0,1,0,0


In [22]:
neigh.kneighbors(X_predict)



(array([[1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.        , 1.41421356]]),
 array([[ 5292, 55212, 22506, 55349,  9542, 21994, 45074, 25747, 54666,
          5654, 11416,  1842, 42759,   549, 26209]]))

In [23]:
expanded_df.iloc[42759]['description']

'Solid, deep and earthy smelling, with cola, mushroom, violet and tree bark aromas. The palate is smooth and balanced, with dark berry, baked plum, chocolate and spice flavors. Dry, foresty and healthy as can be on the finish. Exemplary for the price.'

In [24]:
import pickle

# Export Pipeline as pickle file
with open("model.pkl", "wb") as file:
    pickle.dump(neigh, file)
