In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import median_absolute_error, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import RegexpTokenizer
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

wine = pd.read_csv('winemag-data_first150k.csv', index_col=0, encoding="utf-8", engine="python")
wine = wine[['description', 'price', 'points']]
wine.sample(3)

def make_lower_case(text):
    return text.lower()

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

wine["description"] = wine["description"].str.replace('\d+', '')
wine["description"] = wine.description.apply(func=remove_punctuation)
wine["description"] = wine.description.apply(func=make_lower_case)

#Train tfidf and svd
tf = TfidfVectorizer(analyzer='word', 
                     min_df=10,
                     ngram_range=(1, 2),
                     stop_words='english')
svd = TruncatedSVD(n_components=5)

#Fit tfidf and svd, and transform training data
tfidf_matrix = tf.fit_transform(wine.description)
lsa_features = pd.DataFrame(svd.fit_transform(tfidf_matrix))

#Creat meaningful column names
collist = map(str, range(0, 5))
collist = ["latent_description_" + s for s in collist]
lsa_features.columns = collist
lsa_features.head()

#Make atest train split
X = pd.concat([lsa_features, wine["price"]], axis=1)
y = wine["points"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Format data
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

#Set xgboost parameters
param = {'max_depth': 3, 'eta': 1, 'subsample':0.5, 'alpha':1}
param['nthread'] = 4
param['eval_metric'] = 'mae'
param['objective'] = 'reg:linear'
param['silent'] = 1
evallist = [(dtrain, 'train')]
num_round = 10

#Train model
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)

pred = pd.DataFrame(y_test.copy())
pred["prediction"] = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

mean_ae = np.round(mean_absolute_error(y_true=pred["points"], y_pred=pred["prediction"]),2)
median_ae = np.round(median_absolute_error(y_true=pred["points"], y_pred=pred["prediction"]),2)
print ("Mean Absolute Error: {} points".format(mean_ae))


xgb.plot_importance(bst,max_num_features=15)

[0]	train-mae:2.12114
Will train until train-mae hasn't improved in 10 rounds.
[1]	train-mae:2.02557
[2]	train-mae:1.96304
[3]	train-mae:1.93751
[4]	train-mae:1.9197
[5]	train-mae:1.90718
[6]	train-mae:1.89269
[7]	train-mae:1.88126
[8]	train-mae:1.87098
[9]	train-mae:1.86809
Mean Absolute Error: 1.88 points
