# Wine Score Predictor

This solution attempts to predict the scores of wines based on the description. This predictor uses TF-IDF (Term Frequency-Inverse Document Frequency) weighting method and KNN, both implemented in pure Python.

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy import sparse
from matplotlib import pyplot as plt
from string import punctuation
from nltk.stem import WordNetLemmatizer
from sparselsh import LSH
import nltk
# Uncomment on the first run
#nltk.download('wordnet')

## Getting the data and visualizing it a little bit
---

In [3]:
raw_data = pd.read_json('./data/winemag-data-130k-v2.json')
raw_data.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [4]:
raw_data.describe()

Unnamed: 0,points,price
count,129971.0,120975.0
mean,88.447138,35.363389
std,3.03973,41.022218
min,80.0,4.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,42.0
max,100.0,3300.0


In [5]:
columns_with_nan = [column for column in raw_data.columns 
                        if raw_data[column].isnull().any()]
columns_with_nan

['country',
 'designation',
 'price',
 'province',
 'region_1',
 'region_2',
 'taster_name',
 'taster_twitter_handle',
 'variety']

---

Description and points columns don't seem to contain any NaN's, so we can continue without worries.

---

## Initializing required funtions

`lemmatize(description)` function takes a list (here a description), and returns it in lemmatized form.

For example, consider this list: 
```python 
['dogs', 'churches', 'is']
```

Function ```lemmatize(description)``` would return the previous list in the following form: 
```python 
['dog', 'church', 'is']
```
---

In [6]:
def lemmatize(description):
    '''Lemmatizes a given list and returns it'''
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for word in description:
        lemmatized.append(lemmatizer.lemmatize(word))
        
    return lemmatized    

---
```get_corpus(raw_descriptions``` and ```get_descriptions(raw_descriptions``` both are helper functions to help clean the data into nice lemmatized, and punctuationless form for further use.

---

In [7]:
def get_corpus(raw_descriptions):
    raw_descriptions_string = ' '.join(raw_descriptions).lower()
    descriptions_string = ''.join([c if c not in punctuation else ' ' for c in raw_descriptions_string])
    corpus = list(filter(lambda x: len(x) > 0, descriptions_string.split(' ')))
    corpus = lemmatize(corpus)
    
    return corpus

In [8]:
def get_descriptions(raw_descriptions):
    '''Returns a list of descriptions that have been stripped of punctuation marks and lemmatized'''
    descriptions = []
    for d in raw_descriptions:
        description_string = ''.join([c if c not in punctuation else ' ' for c in d]).lower()
        description = list(filter(lambda x: len(x) > 0, description_string.split(' ')))
        description = lemmatize(description)
        descriptions.append(description)
        
    return descriptions

---
```get_unique_words(corpus, X)``` returns a list of unique words, known as vocabulary in TF-IDF's context.

---

In [9]:
def get_unique_words(corpus):
    '''Returns vocabulary, which is a list that only consists of unique words'''
    vocabulary = []
    for word in corpus:
        if word not in vocabulary:
                vocabulary.append(word)
            
    return vocabulary

---
```get_term_frequencies(train_X, vocabulary, vocabulary_indexes, rows, columns)``` returns the term frequencies in matrix form.

---

In [10]:
def get_term_frequencies(train_X, vocabulary, vocabulary_indexes, rows, columns):
    '''Returns a matrix full of term frequencies'''
    # Matrix that has as many rows as there are documents, and as many columns as there are words in the vocabulary
    tfs = sparse.lil_matrix((rows, columns))
    for description_index, description in enumerate(train_X):
        for word in description:
            if word in vocabulary:
                word_index = vocabulary_indexes[word]
                tfs[description_index, word_index] += 1
    return tfs

---
```get_inverse_document_frequencies(train_tfs, vocabulary_indexes, rows, columns)```  returns the inverse document frequencies in list form.

---

In [11]:
def get_inverse_document_frequencies(train_tfs, vocabulary_indexes, rows, columns):
    '''Returns a list of inverse document frequencies'''
    idfs = []
    # N = total number of descriptions
    N = columns
    for column in range(columns):
        #n_t = number of descriptions, that contain the term t
        n_t = 0
        for row in range(rows):
            if train_tfs[row, column] > 0:
                n_t += 1
                
        # Inverse Document Frequency = log(N/n_t)
        idf = np.log(N/n_t)
        idfs.append(idf)

    return idfs

In [12]:
raw_descriptions = raw_data['description']
descriptions_corpus = get_corpus(raw_descriptions)

In [13]:
# descriptions = group of descriptions
X = get_descriptions(raw_descriptions)[:10000]
y = raw_data['points'][:10000]
y = [int(point) for point in y]

train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [14]:
# train_vocabulary = unique words present in the train_X
train_vocabulary = get_unique_words(reduce(lambda x, y: x + y, train_X, []))
vocabulary_indexes = dict(map(lambda word: (word[1], word[0]), enumerate(train_vocabulary)))

# n_dimensions = number of dimensions
n_dimensions = len(train_vocabulary)

---

Since TF-IDF matrix contains so many zeroes, it would approximately require 47GB of ram to store it. This is why I decided to use sparse matrix instead.

---

In [15]:
# tf = Term Frequency, e.g. raw frequency of the term t in description d
train_tfs = get_term_frequencies(train_X, train_vocabulary, vocabulary_indexes, len(train_X), n_dimensions)

# idf = Inverse Document Frequency, e.g. weighted value based on the frequency in the whole corpus
idfs = get_inverse_document_frequencies(train_tfs, vocabulary_indexes, len(train_X), n_dimensions)

train_tf_idf = sparse.csr_matrix(train_tfs.multiply(np.array([idfs])))
train_tf_idf.shape

(8000, 10162)

In [16]:
print(train_tf_idf[0])

  (0, 0)	3.0652032304586005
  (0, 1)	5.671062490664264
  (0, 2)	1.3087009185931995
  (0, 3)	4.328570752202766
  (0, 4)	1.5656893418262618
  (0, 5)	2.2755957837110934
  (0, 6)	4.382283905745161
  (0, 7)	2.3752256246599344
  (0, 8)	1.3967801630034844
  (0, 9)	2.8311489540382277
  (0, 10)	3.6060096864365274
  (0, 11)	0.9131853602021226
  (0, 12)	3.4395131707869697
  (0, 13)	4.011474794544692
  (0, 14)	1.4779505282539802
  (0, 15)	0.45240695195274583
  (0, 16)	1.471071739307176
  (0, 17)	1.6498007851806396
  (0, 18)	0.9482362612099396
  (0, 19)	2.2398440927472505
  (0, 20)	4.033453701263467
  (0, 21)	2.495392451671594
  (0, 22)	1.4507148022384317
  (0, 23)	1.5601886264909521
  (0, 24)	4.611290035312417
  (0, 25)	2.5443019547038683
  (0, 26)	4.249676809733103
  (0, 27)	4.235977965374941
  (0, 28)	5.294584919429352
  (0, 29)	4.170330859606373
  (0, 30)	3.280989943547102
  (0, 31)	3.559983864041245
  (0, 32)	1.2290837291555796
  (0, 33)	5.2374265055894025
  (0, 34)	0.5088921795039103
  (0, 35

In [17]:
validation_tfs = get_term_frequencies(validation_X, train_vocabulary, vocabulary_indexes, len(validation_X), n_dimensions)

validation_tf_idf = sparse.csr_matrix(validation_tfs.multiply(np.array([idfs])))

## Calculating distances between vectors

I decided to use a locality-sensitive hashing engine to do the random binary projections, in order to store the vectors, and calculate distances between them efficiently. Later on, I've implemented brute force euclidean distance calculator in case the number of neighbors is less than k when querying the LSH engine.

In [18]:
hyper_plane_count = 1
hash_layers = 8
engine = LSH(hyper_plane_count, train_tf_idf.shape[1], hash_layers)

for vector_id in range(train_tf_idf.shape[0]):
    vector = train_tf_idf.getrow(vector_id)
    engine.index(vector, train_y[vector_id])

In [19]:
def get_K_nearest_neighbours(vector, k):
    '''Brute force approach to get k nearest neighbours'''
    distances = []
    for vector_id in range(validation_tf_idf.shape[0]):
        current_vector = validation_tf_idf[vector_id]
        label = train_y[vector_id]
        sum_of_squares = 0
        for i in range(validation_tf_idf.shape[1]):
            sum_of_squares += np.square(vector[i] - current_vector[i])
        distance = np.sqrt(sum_of_squares)
        distances.append(([current_vector, label], distance))
        
        if len(distances) > k:
            distances.sort(key = lambda x: x[1])
            distances = distances[:k]
    
    distances.sort(key = lambda x: x[1])
    
    k_closest_neighbours = []
    for i in range(k):
        k_closest_neighbours.append(distances[i][0])

    return k_closest_neighbours

In [20]:
def get_predictions(X, k):
    predictions = []
    for vector in X:
        neighbours = engine.query(vector, num_results = k)
        neighbours.sort(key = lambda neighbour: neighbour[1])
        nearest_points = list(map(lambda neighbour: neighbour[0][1], neighbours))
        nearest_points = [int(point) for point in nearest_points]
        if len(neighbours) < k:
            neighbours = get_K_nearest_neighbours(vector, k)
            nearest_points = list(map(lambda neighbour: neighbour[1], neighbours))

        prediction = sum(nearest_points) / len(nearest_points)

        predictions.append(prediction)
    
    return predictions

## Predictions
Since I don't have any test data (this dataset didn't include any competitions), validation data will have to do.

-- Calculations haven't finished so far, I'll update this when they're done. --

In [None]:
predictions = get_predictions(validation_tf_idf, 3)
print("r2 score: ", r2_score(validation_y, predictions))

plt.scatter(validation_y, predictions)