In [39]:
from collections import Counter

from sklearn.model_selection import train_test_split


import numpy as np
import pandas as pd
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [40]:
#read in the data
path = '/media/popo/Elements/Datasets/Wine Reviews/'
df = pd.read_csv(f'{path}wine_data.csv')
print('Shape: ', df.shape)

Shape:  (129971, 14)


In [41]:
df.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')

In [42]:
df['variety'].unique().shape

(708,)

In [43]:
#we will take only the top 5 varieties
top_varieties = df['variety'].value_counts().head(5).index
df_top_5 = df[df['variety'].isin(top_varieties)]
df_top_5.shape


(50358, 14)

In [44]:
df_train_test = df_top_5[['description', 'variety']]

In [45]:
df_train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50358 entries, 4 to 129967
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  50358 non-null  object
 1   variety      50358 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [46]:
#preprocessing
def preprocess_text(df):
    #remove special characters
    df['description'] = df['description'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
    #convert to lowercase
    df['description'] = df['description'].apply(lambda x: x.lower())
    #remove stopwords
    stop_words = stopwords.words('english')
    df['description'] = df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    #lemmatization
    lemmatizer = WordNetLemmatizer()
    df['description'] = df['description'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    #encode the labels
    df['variety'] = df['variety'].factorize()[0]
    return df

In [47]:
df_train_test_processed = preprocess_text(df_train_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(lambda x: ' '

In [48]:
df_train_test_processed.head()

Unnamed: 0,description,variety
4,much like regular bottling come across rather ...,0
10,soft supple plum envelope oaky structure caber...,1
12,slightly reduced wine offer chalky tannic back...,1
14,building year six generation winemaking tradit...,2
20,ripe aroma dark berry mingle ample note black ...,3


In [49]:
#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(df_train_test_processed['description'], df_train_test_processed['variety'], test_size=0.2, random_state=42)

In [50]:
#check how many unique words we have
vocab_size = len(set(' '.join(X_train).split()))
print('Vocab size: ', vocab_size)

Vocab size:  16951


In [76]:
#vocab size
vocab_size = 10000
encoded_docs_train = [one_hot(d, vocab_size) for d in X_train]
encoded_docs_test = [one_hot(d, vocab_size) for d in X_test]

In [77]:
len(encoded_docs_test[0])

27

In [78]:
#pad the sequences
max_length = 100
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')


In [79]:
padded_docs_test.shape

(10072, 100)

In [80]:
padded_docs_test

array([[3754, 8464, 2567, ...,    0,    0,    0],
       [1328, 1806, 4417, ...,    0,    0,    0],
       [4496, 8364, 9229, ...,    0,    0,    0],
       ...,
       [8102, 8755, 2567, ...,    0,    0,    0],
       [8115, 9341, 5895, ...,    0,    0,    0],
       [ 622, 7511, 2567, ...,    0,    0,    0]], dtype=int32)

In [81]:
#max length
max_length = 100
#embedding size
embedding_size = 32
#number of classes
num_classes = 5
#model
model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_length))
#bidirectional LSTM
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 32)           320000    
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 5)                 325       
                                                                 
Total params: 336,965
Trainable params: 336,965
Non-trainable params: 0
_________________________________________________________________


In [82]:
#train the model
model.fit(padded_docs_train, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f12c4ee17f0>

In [83]:
#evaluate the model
loss, accuracy = model.evaluate(padded_docs_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 85.236299
