In [97]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import random

In [160]:
# Load the geolocation table prepared by Antonio; the merge further down reads
# its 'address', 'lat' and 'lon' columns.

geolocation_df = pd.read_csv('data/latlon_wine_data.csv')

In [161]:
# Load the original wine dataset and derive an 'address' column of the form
# "<region>, <province>, <country>"; it is the key used for the merge in the
# following step.

expanded_df = pd.read_csv('data/expanded_dataframe.csv').assign(
    address=lambda wines: wines['region'] + ', ' + wines['province'] + ', ' + wines['country']
)

In [162]:
# Check if the column was added correctly
expanded_df.columns

Index(['Unnamed: 0', 'country', 'description', 'points', 'price', 'province',
       'region', 'title', 'variety', 'winery', 'processed_description',
       'dry_wine', 'sweet_wine', 'fruity_aroma', 'spicy_aroma', 'herb_aroma',
       'oak_aroma', 'chocolate_aroma', 'floral_aroma', 'body_light',
       'body_medium', 'body_full', 'soft_tex', 'creamy_tex', 'structured_tex',
       'silky_tex', 'wine_type', 'address'],
      dtype='object')

In [163]:
# Merge/join the two dataframes on the address, so that each wine gets a specific
# lat and lon that the UI can later use to show the recommended wines on a map.
#
# - The lookup is de-duplicated on 'address' first: a repeated address would
#   otherwise multiply wine rows in the left join and shift the row positions
#   that the KNN cells below rely on (they look rows up with .iloc).
# - Any 'lat'/'lon' columns left over from a previous run of this cell are
#   dropped first, so re-running it does not create 'lat_x'/'lat_y' columns.

expanded_df = (
    expanded_df
    .drop(columns=['lat', 'lon'], errors='ignore')
    .merge(
        geolocation_df[['address', 'lat', 'lon']].drop_duplicates(subset='address'),
        how='left',
        on='address',
    )
)
expanded_df.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,points,price,province,region,title,variety,winery,...,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type,address,lat,lon
0,0,US,The vineyard is one of the better Chardonnay s...,92,36.0,California,"Alexander Valley, Sonoma",Matrix 2007 Stuhlmuller Vineyard Chardonnay (A...,chardonnay,Matrix,...,0,1,0,0,0,0,white,"Alexander Valley, Sonoma, California, US",38.612965,-122.769435
1,1,US,Defines Rockpile Zinfandel in intensity of fru...,92,39.0,California,"Rockpile, Sonoma",Mauritson 2007 Rockpile Cemetary Vineyard Zinf...,zinfandel,Mauritson,...,0,1,0,0,0,0,red,"Rockpile, Sonoma, California, US",38.608666,-122.869832
2,2,US,This sophisticated wine is mostly Cabernet Sau...,92,45.0,California,"Napa Valley, Napa",Silverado 2006 Cabernet Sauvignon (Napa Valley),cabernet sauvignon,Silverado,...,0,1,0,0,1,0,red,"Napa Valley, Napa, California, US",38.297538,-122.286865


In [164]:
# Rename the 'Unnamed: 0' column (visible in the column listing above) to 'index';
# the KNN cell below uses 'index' as its target column.
expanded_df= expanded_df.rename(columns={'Unnamed: 0': 'index'})

In [165]:
# Fit the KNN model
# Features: the dry/sweet flags, the aroma flags, the body flags and the price.
# Target: the 'index' column. The cells below mostly call neigh.kneighbors(),
# which returns row positions within X, to look up similar wines.
# NOTE(review): 'price' is passed in unscaled next to what look like 0/1 flags,
# so it probably dominates the distance -- confirm this weighting is intended.

X = expanded_df[['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full','price']]
y = expanded_df['index']

neigh = KNeighborsRegressor(n_neighbors=15)
neigh.fit(X, y)

In [166]:
# Sanity check: take the feature values of row 1726 and ask the fitted model for
# its 15 nearest neighbors (distances first, row positions second).
test = expanded_df.iloc[[1726]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full', 'price']]

neigh.kneighbors(test, n_neighbors=15, return_distance=True)

(array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 array([[17308, 18457, 15428, 18109, 15754, 15572, 16318, 17724, 15052,
         12596,   393,   392,  1726, 16315, 16220]]))

In [167]:
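# Getting the feature values of a second wine to compare with the test wine (NOTE: row 17286 does not appear in the neighbor list printed above -- confirm this is the intended row)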
neighbor = expanded_df.iloc[[17286]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full','price']]

In [168]:
# Check the values of my datapoints
test_v = test.values

# Check the values of the neighbor's datapoints
neighbor_v = neighbor.values

In [169]:
# Cosine similarity (dot product divided by the product of the two vector norms) between the neighbor and test feature vectors; a value close to 1 means the two feature vectors point in almost the same direction

np.dot(neighbor_v, test_v.T)/((np.dot(neighbor_v, neighbor_v.T)**0.5)*(np.dot(test_v, test_v.T))**0.5)

array([[0.99994462]])

In [170]:
# Pick a random wine and look up its 15 nearest neighbors.
# The upper bound is taken from the frame itself (np.random.randint excludes it),
# so the draw is always a valid .iloc position even if the dataset size changes.
# NOTE(review): the draw is unseeded, so this cell's output differs between runs.
input_row = np.random.randint(0, len(expanded_df))
test = expanded_df.iloc[[input_row]][[
    'dry_wine', 'sweet_wine', 'fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma',
    'chocolate_aroma', 'floral_aroma', 'body_light', 'body_medium', 'body_full', 'price',
]]

n_datapoints = neigh.kneighbors(test, n_neighbors=15, return_distance=True)
n_datapoints

(array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.]]),
 array([[11522, 12864, 55030, 24219, 54982, 13792, 38573, 55104, 24258,
          9743, 15331, 10687, 15059, 14970,  2892]]))

In [171]:
def match_type(x: int, n_neighbors: int = 15) -> pd.DataFrame:
    '''Return the nearest neighbors of a wine that share its wine type.

    Parameters
    ----------
    x : int
        Row position (as used by ``.iloc``) of the query wine in ``expanded_df``.
    n_neighbors : int, default 15
        Number of neighbors requested from the fitted model ``neigh`` before
        the wine-type filter is applied.

    Returns
    -------
    pd.DataFrame
        The rows of ``expanded_df`` for the retrieved neighbors whose
        ``wine_type`` equals the query wine's, in the order returned by
        ``neigh.kneighbors``. It can hold fewer than ``n_neighbors`` rows, and
        the query wine itself may be one of them.
    '''
    feature_cols = ['dry_wine', 'sweet_wine', 'fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma',
                    'chocolate_aroma', 'floral_aroma', 'body_light', 'body_medium', 'body_full', 'price']

    # Finding the nearest neighbors. The double brackets keep the query as a
    # one-row DataFrame, i.e. the 2D input that kneighbors expects.
    query_features = expanded_df.iloc[[x]][feature_cols]
    _, neighbor_positions = neigh.kneighbors(query_features, n_neighbors=n_neighbors, return_distance=True)

    # kneighbors reports positions within the data the model was fitted on,
    # so the rows are looked up positionally with .iloc.
    neighbors = expanded_df.iloc[neighbor_positions[0].tolist()]

    # Matching the wine type
    wine_type = expanded_df['wine_type'].iloc[x]
    return neighbors[neighbors['wine_type'] == wine_type]

In [172]:
match_type(input_row)

Unnamed: 0,index,country,description,points,price,province,region,title,variety,winery,...,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type,address,lat,lon
11522,11522,US,"A tremendous Petite Sirah, among the finest fr...",93,26.0,California,"Paso Robles, Central Coast",Vina Robles 2007 Jardine Petite Sirah (Paso Ro...,petite sirah,Vina Robles,...,0,0,0,0,0,1,red,"Paso Robles, Central Coast, California, US",35.636876,-120.654502
12864,12864,US,Quivira is assemblng quite a track record with...,92,26.0,California,"Dry Creek Valley, Sonoma",Quivira 2006 Wine Creek Ranch Petite Sirah (Dr...,petite sirah,Quivira,...,0,0,0,0,0,0,red,"Dry Creek Valley, Sonoma, California, US",38.611205,-122.873454
55030,55030,Greece,The nose on this layered Merlot offers dark ch...,88,26.0,Drama,,Domaine Costa Lazaridi 2008 Château Julia Merl...,merlot,Domaine Costa Lazaridi,...,0,0,0,0,0,0,red,,,
24219,24219,Italy,"A blend of Sangiovese, with 10% Cabernet Sauvi...",91,26.0,Tuscany,"Chianti Classico,",Rocca delle Macìe 2012 Famiglia Zingarelli Ris...,red blend,Rocca delle Macìe,...,0,0,0,0,0,0,red,"Chianti Classico, , Tuscany, Italy",43.567115,10.9807
54982,54982,Greece,"Mocha, raspberry, sour cherry and spice are th...",85,26.0,Nemea,,Skouras 2008 Grande Cuvée Agiorgitiko (Nemea),agiorgitiko,Skouras,...,0,0,0,0,0,0,red,,,
13792,13792,US,This wine really captures the personality of t...,91,26.0,California,"Red Hills Lake County,",Prima Materia 2013 Mourvèdre (Red Hills Lake C...,mourvdre,Prima Materia,...,0,0,0,0,0,0,red,"Red Hills Lake County, , California, US",38.917296,-122.751033
55104,55104,Greece,"Mocha, raspberry, sour cherry and spice are th...",85,26.0,Nemea,,Skouras 2008 Grande Cuvée Agiorgitiko (Nemea),agiorgitiko,Skouras,...,0,0,0,0,0,0,red,,,
24258,24258,Italy,"A blend of Sangiovese, with 10% Cabernet Sauvi...",91,26.0,Tuscany,"Chianti Classico,",Rocca delle Macìe 2012 Famiglia Zingarelli Ris...,red blend,Rocca delle Macìe,...,0,0,0,0,0,0,red,"Chianti Classico, , Tuscany, Italy",43.567115,10.9807
9743,9743,US,No matter how hard you try to tame Petite Sira...,91,26.0,California,"Dry Creek Valley, Sonoma",Quivira 2005 Wine Creek Ranch Petite Sirah (Dr...,petite sirah,Quivira,...,0,0,0,0,0,0,red,"Dry Creek Valley, Sonoma, California, US",38.611205,-122.873454
15331,15331,France,"This 1,000-acre estate with 140 acres of vines...",90,26.0,Bordeaux,"Haut-Médoc,",Château de Malleret 2012 Haut-Médoc,bordeauxstyle red blend,Château de Malleret,...,1,0,0,0,0,0,red,"Haut-Médoc, , Bordeaux, France",44.837789,-0.57918


In [173]:
len(match_type(input_row))

13

In [174]:
def describe(x: pd.DataFrame):
    '''Print a numbered summary of every wine in ``x``.

    For each row, in order, this prints a "Wine number N:" header, a sentence
    built from the row's 'variety' and 'country' values, the row's
    'description', and a divider of 100 dashes. Nothing is returned.
    '''
    divider = '-'*100
    for number, (_, wine) in enumerate(x.iterrows(), start=1):
        header = f'Wine number {number}:\n'
        origin = f"This is a {wine['variety']} from {wine['country']}\n"
        # One print with a newline separator emits the same text as four prints.
        print(header, origin, wine['description'], divider, sep='\n')

In [175]:
describe(match_type(input_row))

Wine number 1:

This is a petite sirah from US

A tremendous Petite Sirah, among the finest from Paso Robles. It's a big wine, with massive flavors of blackberries, currants, leather, mocha and smoky oak, with an ultralong, spicy finish. Yet the wine is totally dry, with refined, silky tannins. Yes, the alcohol's high, but it's completely balanced. Best now and through the next decade.
----------------------------------------------------------------------------------------------------
Wine number 2:

This is a petite sirah from US

Quivira is assemblng quite a track record with this bottling, which takes Petite Sirah's irrepressible quality and pounds it into elegance and finesse. Dry and balanced, their '06 shows ripe berry, chocolate and spice flavors wrapped into firm, fine tannins. Now through 2012.
----------------------------------------------------------------------------------------------------
Wine number 3:

This is a merlot from Greece

The nose on this layered Merlot offers

In [176]:
['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma','chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full', 'price']

['dry_wine',
 'sweet_wine',
 'fruity_aroma',
 'spicy_aroma',
 'herb_aroma',
 'oak_aroma',
 'chocolate_aroma',
 'floral_aroma',
 'body_light',
 'body_medium',
 'body_full',
 'price']

In [153]:
# Hand-built query with 12 values -- presumably in the same order as the 12
# feature columns the model was fitted on (last value = price); verify the order.
X_predict = np.array([[0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 15]])

# NOTE(review): the model's target is the 'index' column, so this prediction is an
# aggregate of the neighbors' index values rather than an actual row. Use
# neigh.kneighbors(X_predict) to get the neighbor rows themselves -- confirm intent.
neigh.predict(X_predict)



array([37181.33333333])

In [140]:
expanded_df.iloc[[45884]][['dry_wine', 'sweet_wine', 'fruity_aroma','spicy_aroma', 'herb_aroma', 'oak_aroma',
                                   'chocolate_aroma','floral_aroma', 'body_light', 'body_medium', 'body_full', 'price']]

Unnamed: 0,dry_wine,sweet_wine,fruity_aroma,spicy_aroma,herb_aroma,oak_aroma,chocolate_aroma,floral_aroma,body_light,body_medium,body_full,price
45884,0,1,1,1,0,0,0,0,1,0,0,15.0


In [154]:
neigh.kneighbors(X_predict)



(array([[0.        , 1.        , 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.41421356, 1.41421356, 1.41421356]]),
 array([[42759, 54749, 33969, 50978,  6452, 34407, 25439, 45033, 45019,
         22136, 34803, 33895, 53933, 43875, 30273]]))

In [159]:
expanded_df.iloc[42759]['description']

'Solid, deep and earthy smelling, with cola, mushroom, violet and tree bark aromas. The palate is smooth and balanced, with dark berry, baked plum, chocolate and spice flavors. Dry, foresty and healthy as can be on the finish. Exemplary for the price.'

In [177]:
import pickle

# Export the fitted KNeighborsRegressor (`neigh`) as a pickle file, written to
# model.pkl in the current working directory.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
with open("model.pkl", "wb") as file:
    pickle.dump(neigh, file)
