In [1]:
from collections import Counter
import numpy as np
import nltk
import re
import sklearn.manifold
import multiprocessing
import pandas as pd
import gensim
import gensim.models.word2vec as w2v
import xgboost as xgb
from sklearn import linear_model



In [2]:
data = pd.read_csv('winemag-data_first150k.csv')
data=data.dropna(subset=['price'])
data=data[data.price<200]

In [3]:
labels = data['variety']
descriptions = data['description']

In [72]:
mean = sum([int(d['price']) for a,d in data.iterrows()])/len(data)
variance = sum([(int(d['price']) - mean)**2 for a,d in data.iterrows()])/len(data)

print('Mean: ', mean, ' Variance: ', variance)


Mean:  31.272754605185966  Variance:  559.8094559754887


In [4]:
corpus_raw = ""
for description in descriptions:
    corpus_raw += description

In [5]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [7]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

In [8]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [9]:
token_count = sum([len(sentence) for sentence in sentences])
print('The wine corpus contains {0:,} tokens'.format(token_count))

The wine corpus contains 5,650,564 tokens


In [10]:
num_features = 50
min_word_count = 10
num_workers = multiprocessing.cpu_count()
context_size = 10
downsampling = 1e-3
seed=1993

In [11]:
wine2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [12]:
wine2vec.build_vocab(sentences)

In [13]:
print('Word2Vec vocabulary length:', len(wine2vec.wv.vocab))
print(wine2vec.corpus_count)

Word2Vec vocabulary length: 10970
241132


In [14]:
wine2vec.train(sentences, total_examples=wine2vec.corpus_count, epochs=wine2vec.iter)

20361606

In [None]:
index = gensim.matutils.Dense2Corpus(wine2vec.wv.syn0); 

In [15]:
provinces = set()
regions = set()
varieties = set()
wineries = set()

for a,d in data.iterrows():
    provinces.add(d['province'])
    regions.add(d['region_1'])
    varieties.add(d['variety'])
    wineries.add(d['winery'])
    
    
province_dict = {p: i for p,i in zip(provinces, range(len(provinces)))}
region_dict = {r: i for r,i in zip(regions, range(len(regions)))}
variety_dict = {v: i for v,i in zip(varieties, range(len(varieties)))}
winery_dict = {w: i for w,i in zip(wineries, range(len(wineries)))}

In [35]:
def feature2(datum):
    feat = [0] * (len(provinces) + len(regions)+ len(varieties))
    feat[province_dict[datum['province']]] = 1
    feat[region_dict[datum['region_1']]+len(provinces)] = 1
    feat[variety_dict[datum['variety']]+len(provinces)+len(regions)] = 1
#     feat[winery_dict[datum['winery']]+len(provinces)+len(regions)+len(varieties)] = 1
    feat.append(1)
    return feat

In [17]:
wine2vec['wine']

array([ 0.08857089, -0.02673696, -0.29440403, -0.11652523, -0.0165947 ,
        0.34733692,  0.12119724, -0.13085279, -0.06005537,  0.47735712,
       -0.29315394,  0.09941099, -0.22264618, -0.05525788,  0.03922538,
       -0.32957935, -0.07000276,  0.13175999,  0.19198187, -0.39061537,
        0.00567096, -0.09007689, -0.37076762, -0.05523124,  0.06314998,
       -0.06796336,  0.1323534 , -0.05944381, -0.06760232,  0.33408388,
        0.34509084,  0.13783219,  0.24588385,  0.14711963, -0.34159037,
       -0.05395457,  0.07144741, -0.23288313,  0.14549492, -0.01733204,
       -0.16669391,  0.13092579,  0.15604024, -0.428776  ,  0.34741983,
       -0.087455  ,  0.54692185,  0.02605283, -0.1368254 ,  0.24426316], dtype=float32)

In [18]:
def getdocvector(sentence):
    sentence=sentence.split()
#     print(sentence)
    vec=np.zeros((1,50))
    total=0
    for word in sentence:
        try:
            vec+=wine2vec[word].reshape(1,50)
            total+=1
        except:pass
        
#     print(vec)
    if total!=0:
        vec=vec*1.0/total
    return vec[0]

In [None]:
a=getdocvector('wine')
a

In [19]:
# train = descriptions[:2*len(data)//3]
# valid = descriptions[2*len(data)//3:]
train = data[:2*len(data)//3]
valid = data[2*len(data)//3:]
# for x in train:
#     print (x)

In [34]:
# a=feature2(data.loc[[2]])
b=data.loc[[2]]
# print(b)
a=feature2(b)

   Unnamed: 0 country                                        description  \
2           2      US  Mac Watson honors the memory of a wine once ma...   

                     designation  points  price    province        region_1  \
2  Special Selected Late Harvest      96   90.0  California  Knights Valley   

  region_2          variety    winery  
2   Sonoma  Sauvignon Blanc  Macauley  


TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [36]:
# a=[getdocvector(train[0]).reshape(1,90000)]
# a
X_train=[np.concatenate((getdocvector(datum['description']),feature2(datum))) for a,datum in train.iterrows()]
X_valid=[np.concatenate((getdocvector(datum['description']),feature2(datum)))for a,datum in valid.iterrows()]


In [37]:
y_train=data[:2*len(data)//3]['price']
y_val=data[2*len(data)//3:]['price']
print(len(X_train[0]))

2302


In [67]:
dtrain=xgb.DMatrix(X_train,y_train)
depth={3,4,5}
eta={0.1,0.3,0.5,0.7}
num_round = 10000
for x in depth:
    for y in eta:
        
        param = {'max_depth':x, 'eta':y, 'silent':1,}

        bst = xgb.train(param, dtrain, num_round)
        dtest=xgb.DMatrix(X_valid)
        y_pred=bst.predict(dtest)
        mse=sum([(x-y)**2 for x,y in zip(y_val,y_pred)])*1.0/len(y_pred)
        print("MSE: ", mse, "Depth: ", x, "Eta: ",y)

MSE:  213.0500414 Depth:  3 Eta:  0.1
MSE:  195.609346751 Depth:  3 Eta:  0.5
MSE:  185.942341032 Depth:  3 Eta:  0.3
MSE:  212.140064608 Depth:  3 Eta:  0.7
MSE:  188.390594616 Depth:  4 Eta:  0.1
MSE:  188.803977265 Depth:  4 Eta:  0.5
MSE:  177.089443449 Depth:  4 Eta:  0.3
MSE:  216.56306033 Depth:  4 Eta:  0.7
MSE:  174.536329118 Depth:  5 Eta:  0.1
MSE:  194.586369661 Depth:  5 Eta:  0.5
MSE:  176.06188205 Depth:  5 Eta:  0.3
MSE:  216.886172529 Depth:  5 Eta:  0.7


In [62]:
dtest=xgb.DMatrix(X_valid)
y_pred=bst.predict(dtest)

In [63]:
mse=sum([(x-y)**2 for x,y in zip(y_val,y_pred)])*1.0/len(y_pred)

In [None]:
y_val

In [64]:
mse

176.06188205039527

In [70]:
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_valid)


In [71]:
mean = sum([int(d['price']) for a,d in data.iterrows()])/len(data)
variance = sum([(int(d['price']) - mean)**2 for a,d in data.iterrows()])/len(data)

print('Mean: ', mean, ' Variance: ', variance)

mse = sum([(x-y)**2 for x,y in zip(predictions, y_val)])/len(predictions)
print('MSE: ', mse)
print("lol")

Mean:  31.272754605185966  Variance:  559.8094559754887
MSE:  291.422267445
lol
