In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt

df = pd.read_csv("wine-reviews.csv")
df['alcohol_int'] = df.alcohol.str.extract("(\d+)", expand=False).dropna().astype(int)
df['price_int'] = df.price.str.extract("(\d+)", expand=False).dropna().astype(int)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
#clean up data
df.dropna(subset = ['price_int', 'wine_desc'], how = 'any', inplace = True)

#token pattern changed from '\b\w\w+\b' (default) to '\b[a-zA-Z][a-zA-Z]+\b' to avoid data leak.
#If the price were in the review, that would be easy to classify.
vec = CountVectorizer(stop_words = 'english', token_pattern=r'\b[a-zA-Z][a-zA-Z]+\b')
vec.fit(df['wine_desc'])
X = vec.transform(df['wine_desc'])
y = df['price_int']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3131, 6166) (783, 6166) (3131,) (783,)


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import  mean_squared_error, r2_score

lr = LinearRegression(fit_intercept = 0)
lr.fit(X_train, y_train)

print('R2 on train:', r2_score(y_train, lr.predict(X_train)), 
      '\nMean Squared Error on train:', mean_squared_error(y_train, lr.predict(X_train)))

print('R2 on test:', r2_score(y_test, lr.predict(X_test)), 
      '\nMean Squared Error on test:', mean_squared_error(y_test, lr.predict(X_test)))

R2 on train: 0.999999999945 
Mean Squared Error on train: 4.68084469366e-08
R2 on test: -0.846137875035 
Mean Squared Error on test: 1298.29550531


This classifier is great for interpolation, ie predicting prices based on reviews which lie within the set of reviews the model was trained on.  This is due to the varied vocabulary of the sommelier.  The model fails to extrapolate to the test set, also due to the varied nature of the vocabulary.  A negative R2 score reported by sklearn means the model performs arbitrarily worse than the mean price predictor.

In [8]:
def print_top_words(model, feature_names, n_top_words):
    print('\n--------------------------------\n')
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % (topic_idx+1))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
n = 10        
bottom_n = np.argsort(lr.coef_)[n::-1]
top_n = np.argsort(lr.coef_)[-n:]

feature_names = vec.get_feature_names()

print('Largest Contributors to High Price\n')
print("\n".join(feature_names[i] + " " + str(lr.coef_[i]) for i in top_n))

print('\n\nLargest Contributors to Low Price\n')
print("\n".join(feature_names[i] + " " + str(lr.coef_[i]) for i in bottom_n))

Largest Contributors to High Price

alpine 64.2668179491
seriously 69.8956331282
individual 70.2345560249
powerhouse 72.9926056999
hallowed 74.7463065432
ability 78.4165984052
constructed 79.7706834233
blockbuster 92.5654515964
parcel 96.3371734782
maintaining 107.994797587


Largest Contributors to Low Price

falls -39.2061883452
amber -39.6222340231
oodles -40.0331896038
monolithic -41.478531196
paso -41.5149207827
twice -42.5507793016
soaks -43.5891150345
radiant -44.2865491428
requisite -44.9345115271
coolly -48.6337853171
metallic -58.2305545734
