In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
%matplotlib inline 

## Initial Overview of Data
I plan to parse the data and view it in its most raw form.

In [2]:
init_data = pd.read_csv("data/train.csv")

In [3]:

print("Length of dataframe before duplicates are removed:", len(init_data))
init_data.head()

Length of dataframe before duplicates are removed: 175000


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos,32027
1,France,"A solid, chunky wine, with a structure that is...",,88.041695,28.0,Bordeaux,Lalande de Pomerol,,,,,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier,71079
2,France,"This is powerful and concentrated, with the hi...",,94.085021,130.0,Bordeaux,Saint-Émilion,,,,,BORDEAUX-STYLE RED BLEND,Château Figeac,32440
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,89.869797,34.0,California,Santa Barbara County,Central Coast,,,Jaffurs 2010 Thompson Vineyard Petite Sirah (S...,PETITE SIRAH,Jaffurs,124405
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,89.017651,24.0,Washington,Horse Heaven Hills,Columbia Valley,Sean P. Sullivan,@wawinereport,Syncline 2016 McKinley Springs Vineyard Rosé (...,ROSé,Syncline,33649


### Drop Duplicates and NaNs (nulls)
I need to drop the duplicates and null values from the data.

In [4]:
parsed_data = init_data[init_data.duplicated('description', keep=False)]
print("Length of dataframe after duplicates are removed:", len(parsed_data))

"""
Hey man be lazy and stick to 3 features first decription, points and price
Come back to this place to pick say the variety feature.
"""
parsed_data.dropna(subset=['description', 'points', 'price'])
print("Length of dataframe after NaNs are removed:", len(parsed_data))

parsed_data.head()

Length of dataframe after duplicates are removed: 92017
Length of dataframe after NaNs are removed: 92017


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,id
0,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,20.0,Alentejano,,,,,,PORTUGUESE RED,J. Portugal Ramos,32027
1,France,"A solid, chunky wine, with a structure that is...",,88.041695,28.0,Bordeaux,Lalande de Pomerol,,,,,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier,71079
2,France,"This is powerful and concentrated, with the hi...",,94.085021,130.0,Bordeaux,Saint-Émilion,,,,,BORDEAUX-STYLE RED BLEND,Château Figeac,32440
3,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,89.869797,34.0,California,Santa Barbara County,Central Coast,,,Jaffurs 2010 Thompson Vineyard Petite Sirah (S...,PETITE SIRAH,Jaffurs,124405
4,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,89.017651,24.0,Washington,Horse Heaven Hills,Columbia Valley,Sean P. Sullivan,@wawinereport,Syncline 2016 McKinley Springs Vineyard Rosé (...,ROSé,Syncline,33649


## First Data Look

Let's take a look at our data "description" vs "points":

In [5]:
dp = parsed_data[['description','points', 'price']]
dp.info()
dp.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92017 entries, 0 to 174998
Data columns (total 3 columns):
description    92017 non-null object
points         92017 non-null float64
price          92017 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.8+ MB


Unnamed: 0,description,points,price
0,This is a fine rich balanced wine. It has ripe...,88.870874,20.0
1,"A solid, chunky wine, with a structure that is...",88.041695,28.0
2,"This is powerful and concentrated, with the hi...",94.085021,130.0
3,"Rich, ripe and oaky, this Petite Sirah charms ...",89.869797,34.0
4,This wine is a unique in the state blend and f...,89.017651,24.0


## Taking a look at Description length vs points

In [6]:
dp = dp.assign(description_length = dp['description'].apply(len))
dp.info()
dp.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92017 entries, 0 to 174998
Data columns (total 4 columns):
description           92017 non-null object
points                92017 non-null float64
price                 92017 non-null float64
description_length    92017 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 3.5+ MB


Unnamed: 0,description,points,price,description_length
0,This is a fine rich balanced wine. It has ripe...,88.870874,20.0,267
1,"A solid, chunky wine, with a structure that is...",88.041695,28.0,225
2,"This is powerful and concentrated, with the hi...",94.085021,130.0,199
3,"Rich, ripe and oaky, this Petite Sirah charms ...",89.869797,34.0,187
4,This wine is a unique in the state blend and f...,89.017651,24.0,290


## Simplifying the model

Having too many different possibility for "points" would burden our model.
A 90 points wine is not that different from a 91 points wine, the description is probably not that different also. We can this throughout the descriptions' length as well.

Let's try to simplify the model with 5 different values:

1 -> Points 80 to 84 (Under Average wines)

2 -> Points 84 to 88 (Average wines)

3 -> Points 88 to 92 (Good wines)

4 -> Points 92 to 96 (Very Good wines)

5 -> Points 96 to 100 (Excellent wines)

In [7]:
#Transform method taking points as param
def transform_points_simplified(points):
    if points < 84:
        return 1
    elif points >= 84 and points < 88:
        return 2 
    elif points >= 88 and points < 92:
        return 3 
    elif points >= 92 and points < 96:
        return 4 
    else:
        return 5

#Applying transform method and assigning result to new column "points_simplified"
dp = dp.assign(points_simplified = dp['points'].apply(transform_points_simplified))
dp.head()

Unnamed: 0,description,points,price,description_length,points_simplified
0,This is a fine rich balanced wine. It has ripe...,88.870874,20.0,267,3
1,"A solid, chunky wine, with a structure that is...",88.041695,28.0,225,3
2,"This is powerful and concentrated, with the hi...",94.085021,130.0,199,4
3,"Rich, ripe and oaky, this Petite Sirah charms ...",89.869797,34.0,187,3
4,This wine is a unique in the state blend and f...,89.017651,24.0,290,3


## Description Vectorization

Let's see what CountVectorizer output on this collection

In [8]:
X = dp['description']
y = dp['price']

vectorizer = CountVectorizer()
vectorizer.fit(X)
#print(vectorizer.vocabulary_)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

Let's vectorize X based on the trained data.

In [9]:
X = vectorizer.transform(X)
print('Shape of Sparse Matrix: ', X.shape)
print('Amount of Non-Zero occurrences: ', X.nnz)
# Percentage of non-zero values
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))
print('Density: {}'.format((density)))

Shape of Sparse Matrix:  (92017, 21513)
Amount of Non-Zero occurrences:  3166279
Density: 0.15994848589012498


## Training data and test data

90% of the dataset will be used for training. 10% of the dataset will be used for testing.

In [10]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Training the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

poly = PolynomialFeatures(degree=3)
X_ = poly.fit_transform(X_train)
predict_ = poly.fit_transform(X_test)

clf = linear_model.LinearRegression(n_jobs=4).fit(X_, y_train)

# Testing the model
X_test_ = poly.fit_transform(X_test) #transform the test
predictions = clf.predict(X_test_)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)

print(rmse)
print(r2)


Multiple hypothesis to improve this model: 
1. We could try a different vectorizer (TfidfVectorizer for example)
2. We could try cleaning the dataset using stopwords
3. We could add other metrics like price, description length etc... to our model

## Trying TfidfVectorizer

TfidfVectorizer has some advantages over the simpler CountVectorizer.

CountVectorizer just counts the word frequencies. That's all.

With TfidfVectorizer the value increases proportionally to count, but is offset by the frequency of the word in the total corpus. This is called the IDF (inverse document frequency part).
This allow the Vectorizer to adjust with frequent words like "the", "a" etc...

In [21]:
dp.head(5)

Unnamed: 0,description,points,price,description_length,points_simplified
0,This is a fine rich balanced wine. It has ripe...,88.870874,20.0,267,3
1,"A solid, chunky wine, with a structure that is...",88.041695,28.0,225,3
2,"This is powerful and concentrated, with the hi...",94.085021,130.0,199,4
3,"Rich, ripe and oaky, this Petite Sirah charms ...",89.869797,34.0,187,3
4,This wine is a unique in the state blend and f...,89.017651,24.0,290,3


""" But before the 3 features, lets see how working only with the description performs on a model.

In [22]:
X = dp['description']
y = dp['price']

# Vectorizing model
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [23]:
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)


In [None]:
poly = PolynomialFeatures(degree=4)
X_ = poly.fit_transform(X_train)
predict_ = poly.fit_transform(X_test)

clf = linear_model.LinearRegression(n_jobs=4)

clf.fit(X_, y_train)
print(clf.predict(predict_))

In [None]:
# Testing model
predict_ = poly.fit_transform(X_test)

predictions = clf.predict(predict_)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)

print(rmse)
print(r2)
