In [97]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import random

In [98]:
# Load the geolocation table created by Antonio. The merge further down reads
# its 'address', 'lat' and 'lon' columns.

geolocation_df = pd.read_csv('data/latlon_wine_data.csv')

In [99]:
# Load the original wine dataset and derive an 'address' column of the form
# "<region>, <province>, <country>"; it is the key for the merge in the next step.

expanded_df = pd.read_csv('data/expanded_dataframe.csv').assign(
    address=lambda wines: wines['region'] + ', ' + wines['province'] + ', ' + wines['country']
)

In [100]:
# Sanity check: 'address' should now be listed among the columns
expanded_df.columns

Index(['Unnamed: 0', 'country', 'description', 'points', 'price', 'province',
       'region', 'title', 'variety', 'winery', 'processed_description',
       'dry_wine', 'sweet_wine', 'fruity_aroma', 'spicy_aroma', 'herb_aroma',
       'oak_aroma', 'chocolate_aroma', 'floral_aroma', 'body_light',
       'body_medium', 'body_full', 'soft_tex', 'creamy_tex', 'structured_tex',
       'silky_tex', 'wine_type', 'address'],
      dtype='object')

In [101]:
# Left-join the geolocation table onto the wines by address, so that each wine
# gets a specific lat and lon that the UI can later use to show the recommended
# wines on a map.
#
# The lookup is first reduced to one row per address: a repeated address would
# otherwise duplicate wine rows in the left join, so the same wine could show
# up several times among the recommendations. `validate` makes pandas raise if
# the right-hand side is still not unique on the key.

address_coordinates = geolocation_df[['address', 'lat', 'lon']].drop_duplicates(subset='address')
expanded_df = expanded_df.merge(address_coordinates, how='left', on='address', validate='many_to_one')
expanded_df.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,points,price,province,region,title,variety,winery,...,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type,address,lat,lon
0,0,US,The vineyard is one of the better Chardonnay s...,92,36.0,California,"Alexander Valley, Sonoma",Matrix 2007 Stuhlmuller Vineyard Chardonnay (A...,chardonnay,Matrix,...,0,1,0,0,0,0,white,"Alexander Valley, Sonoma, California, US",38.612965,-122.769435
1,1,US,Defines Rockpile Zinfandel in intensity of fru...,92,39.0,California,"Rockpile, Sonoma",Mauritson 2007 Rockpile Cemetary Vineyard Zinf...,zinfandel,Mauritson,...,0,1,0,0,0,0,red,"Rockpile, Sonoma, California, US",38.608666,-122.869832
2,2,US,This sophisticated wine is mostly Cabernet Sau...,92,45.0,California,"Napa Valley, Napa",Silverado 2006 Cabernet Sauvignon (Napa Valley),cabernet sauvignon,Silverado,...,0,1,0,0,1,0,red,"Napa Valley, Napa, California, US",38.297538,-122.286865


In [102]:
# Give the 'Unnamed: 0' column a meaningful name ('index'); it is used as the
# KNN target below.
expanded_df = expanded_df.rename({'Unnamed: 0': 'index'}, axis='columns')

In [103]:
# Fit the KNN model on the taste-profile columns plus price, with the 'index'
# column as target.
# NOTE(review): 'price' is fed in unscaled while the other features look like
# 0/1 flags, so it probably dominates the distance -- consider scaling; confirm.

feature_columns = [
    'dry_wine', 'sweet_wine',
    'fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma', 'floral_aroma',
    'body_light', 'body_medium', 'body_full',
    'price',
]
X = expanded_df[feature_columns]
y = expanded_df['index']

neigh = KNeighborsRegressor(n_neighbors=15)
neigh.fit(X, y)

In [104]:
# Query the fitted model with a single wine (row 1726): returns the distances
# to, and the row positions of, its 15 nearest neighbours.
query_columns = [
    'dry_wine', 'sweet_wine',
    'fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma', 'floral_aroma',
    'body_light', 'body_medium', 'body_full',
    'price',
]
test = expanded_df.iloc[[1726]][query_columns]

neigh.kneighbors(test, n_neighbors=15, return_distance=True)

(array([[0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[16315,  1726, 15572, 17724,  6032,  5536,   680,  5123,  3203,
            14,  1783, 21212,  6104,   720, 17286]]))

In [105]:
# Feature values of one of the returned neighbours (row 17286)
neighbor_columns = [
    'dry_wine', 'sweet_wine',
    'fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma', 'floral_aroma',
    'body_light', 'body_medium', 'body_full',
    'price',
]
neighbor = expanded_df.iloc[[17286]][neighbor_columns]

In [106]:
# Raw NumPy arrays holding the feature values of the query wine (test_v) and
# of the chosen neighbour (neighbor_v)
test_v, neighbor_v = test.to_numpy(), neighbor.to_numpy()

In [107]:
# Cosine similarity between the neighbour and the query wine: the dot product
# of the two feature vectors divided by the product of their norms. Used as a
# check of how closely the neighbour matches the query features.

dot_product = np.dot(neighbor_v, test_v.T)
neighbor_norm = np.dot(neighbor_v, neighbor_v.T) ** 0.5
test_norm = np.dot(test_v, test_v.T) ** 0.5
dot_product / (neighbor_norm * test_norm)

array([[0.99994462]])

In [108]:
# Repeat the neighbour lookup for a fixed example wine.
# Alternative for a random row: np.random.randint(0, 55350)
input_row = 29960
input_columns = [
    'dry_wine', 'sweet_wine',
    'fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma', 'floral_aroma',
    'body_light', 'body_medium', 'body_full',
    'price',
]
test = expanded_df.iloc[[input_row]][input_columns]

n_datapoints = neigh.kneighbors(test, n_neighbors=15, return_distance=True)
n_datapoints

(array([[0.        , 1.        , 1.        , 1.        , 1.        ,
         1.        , 1.        , 1.        , 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356]]),
 array([[29960, 29785, 46375, 30414, 51510,  5030, 51289,  8410,  3120,
          7782,  4111, 29997,  1014, 30176,  3337]]))

In [109]:
# Show the full record of the example wine. Uses `input_row` rather than a
# repeated literal so this cell stays in sync when the example row is changed.
expanded_df.iloc[[input_row]]

Unnamed: 0,index,country,description,points,price,province,region,title,variety,winery,...,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type,address,lat,lon
29960,29960,Italy,Prinsi's Gaia Principe cru is located very nea...,90,52.0,Piedmont,"Barbaresco,",Prinsi 2004 Gaia Principe (Barbaresco),nebbiolo,Prinsi,...,0,0,1,0,0,0,red,"Barbaresco, , Piedmont, Italy",44.723262,8.083557


In [110]:
def match_type(x: int, n_neighbors: int = 15) -> pd.DataFrame:
    '''Return the nearest neighbors of a wine that share its wine type.

    Parameters
    ----------
    x : int
        Positional (iloc) row of the query wine in ``expanded_df``.
    n_neighbors : int, default 15
        Number of nearest neighbors requested from the fitted ``neigh`` model
        before the wine-type filter is applied.

    Returns
    -------
    pd.DataFrame
        The rows of ``expanded_df`` for the retrieved neighbors whose
        'wine_type' equals that of the query wine, in the order the model
        returned them. The filter runs after retrieval, so fewer than
        ``n_neighbors`` rows may come back. The query wine is not removed, so
        it is part of the result whenever the model returns it as one of its
        own neighbors.
    '''
    feature_columns = ['dry_wine', 'sweet_wine', 'fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma',
                       'chocolate_aroma', 'floral_aroma', 'body_light', 'body_medium', 'body_full', 'price']

    # Keep the query as a one-row frame (not named `input`, which would shadow
    # the builtin) so the model receives a 2-D input.
    query_wine = expanded_df.iloc[[x]]

    # Only the neighbor positions are needed here, not the distances.
    neighbor_positions = neigh.kneighbors(query_wine[feature_columns], n_neighbors=n_neighbors,
                                          return_distance=False)
    candidates = expanded_df.iloc[neighbor_positions[0].tolist()]

    # Keep only the candidates whose wine type matches the query wine
    wine_type = query_wine['wine_type'].iloc[0]
    return candidates[candidates['wine_type'] == wine_type]

In [111]:
# Same-type nearest neighbours of the example wine selected by `input_row`
match_type(x=input_row)

Unnamed: 0,index,country,description,points,price,province,region,title,variety,winery,...,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type,address,lat,lon
29960,29960,Italy,Prinsi's Gaia Principe cru is located very nea...,90,52.0,Piedmont,"Barbaresco,",Prinsi 2004 Gaia Principe (Barbaresco),nebbiolo,Prinsi,...,0,0,1,0,0,0,red,"Barbaresco, , Piedmont, Italy",44.723262,8.083557
29785,29785,Italy,You'll love this traditional expression of Bar...,90,52.0,Piedmont,"Barbaresco,",Giacosa Fratelli 2007 Basarin (Barbaresco),nebbiolo,Giacosa Fratelli,...,0,0,0,0,0,1,red,"Barbaresco, , Piedmont, Italy",44.723262,8.083557
46375,46375,Argentina,he bouquet smells jammy and not that exciting....,88,52.0,Other,"Río Negro Valley,",Bodega Noemía de Patagonia 2012 J. Alberto Sin...,malbec,Bodega Noemía de Patagonia,...,0,0,0,0,0,0,red,"Río Negro Valley, , Other, Argentina",-40.734434,-66.617645
30414,30414,Italy,"This opens with aromas suggesting resin, overr...",88,52.0,Piedmont,"Barbaresco,",Pietro Rinaldi 2013 San Cristoforo (Barbaresco),nebbiolo,Pietro Rinaldi,...,0,0,0,0,0,0,red,"Barbaresco, , Piedmont, Italy",44.723262,8.083557
5030,5030,US,"A powerful, compelling young Pinot Noir notabl...",94,52.0,California,"Green Valley, Sonoma",Dutton Estate 2008 Dutton-Thomas Road Vineyard...,pinot noir,Dutton Estate,...,0,0,0,0,1,0,red,"Green Valley, Sonoma, California, US",38.292331,-122.459193
3120,3120,US,"Whole-cluster fermented, this wine offers comp...",92,52.0,California,"Russian River Valley, Sonoma",Marimar Estate 2012 Método Antiguo Doña Margar...,pinot noir,Marimar Estate,...,0,0,0,0,1,0,red,"Russian River Valley, Sonoma, California, US",38.483642,-122.817533
7782,7782,US,Classic Cambria Pinot from the Santa Maria Val...,94,52.0,California,"Santa Maria Valley, Central Coast",Cambria 2010 Estate Grown & Bottled Clone No. ...,pinot noir,Cambria,...,0,1,0,0,0,1,red,"Santa Maria Valley, Central Coast, California, US",34.966366,-120.641562
4111,4111,US,Barnett is better known as a producer of very ...,93,52.0,California,"Anderson Valley,",Barnett 2006 Savoy Vineyard Pinot Noir (Anders...,pinot noir,Barnett,...,0,0,0,0,0,0,red,"Anderson Valley, , California, US",39.086566,-123.479454
29997,29997,Italy,Ripe del Falco is a successful experiment in t...,88,52.0,Southern Italy,"Cirò Classico,",Ippolito 1845 1995 Ripe del Falco Riserva Supe...,gaglioppo,Ippolito 1845,...,0,0,0,0,0,0,red,"Cirò Classico, , Southern Italy, Italy",39.381169,17.067309
1014,1014,US,"Baked red plum, black cherry, sagebrush, graph...",92,52.0,California,"Monterey, Central Coast",Talbott 2013 Diamond T Vineyard Estate Grown P...,pinot noir,Talbott,...,0,1,0,0,0,0,red,"Monterey, Central Coast, California, US",36.600238,-121.894676


In [114]:
# Number of neighbours left after the wine-type filter
match_type(input_row).shape[0]

12

In [122]:
def describe(x: pd.DataFrame):
    '''Print variety, country and description for every wine in a dataframe.

    Each wine is introduced by a 1-based "Wine number" header and closed by a
    100-character dashed separator line. Nothing is returned.
    '''
    separator = '-' * 100
    # Fetch each row once and reuse it for all three fields
    wines = (x.iloc[position] for position in range(len(x)))
    for number, wine in enumerate(wines, start=1):
        print(f'Wine number {number}:\n')
        print(f"This is a {wine['variety']} from {wine['country']}\n")
        print(wine['description'])
        print(separator)

In [123]:
# Print the descriptions of the same-type neighbours of the example wine
matching_wines = match_type(input_row)
describe(matching_wines)

Wine number 1:

This is a nebbiolo from Italy

Prinsi's Gaia Principe cru is located very near the Gallina vineyard and shows slightly softer and less intense aromas. Winemaking plays a big role here: This is an oak-driven wine that melds supple aromas of vanilla and sweet spice over a lively base of red fruit. There's also a mineral note of chalk or wet stone that lingers long on the finish.
----------------------------------------------------------------------------------------------------
Wine number 2:

This is a nebbiolo from Italy

You'll love this traditional expression of Barbaresco and its excellent balancing act between the elegance of its natural fruit aromas and the spicy oak-related tones that embellish it. There's a touch of raw fruit or citrus zest as well and the wine closes in a silky, smooth fashion.
----------------------------------------------------------------------------------------------------
Wine number 3:

This is a malbec from Argentina

he bouquet smells ja