In [None]:
import numpy as np
import pandas as pd

The data files can be found here:

https://www.kaggle.com/zynicide/wine-reviews
The dataset was scraped from Wine Enthusiast.

The file used is:

winemag-data_first150k.csv

In [None]:
# load the wine reviews from disk, then preview the first rows as a sanity check
reviews_csv = 'Data/winemag-data_first150k.csv'
sommData = pd.read_csv(reviews_csv)
sommData.head()

In [None]:
# trim the data to remove the unnamed index column
# The column is selected by name rather than by position (columns[1:11]) so the
# cell is idempotent: re-running it no longer drops the next real column.
# NOTE(review): assumes pandas labelled the header-less column "Unnamed: ..." -- confirm.
unnamed_mask = sommData.columns.str.startswith('Unnamed')
sommData = sommData.loc[:, ~unnamed_mask]
sommData.head()

In [None]:
# replace NaN with the placeholder "none" in each string based column
stringColList = ["country", "description", "designation", "province", "region_1", "region_2", "variety", "winery"]
# Fill through the frame itself and rebind the result. Calling
# fillna(inplace=True) on sommData[col] is chained assignment: it operates on a
# column selection, warns in recent pandas and does nothing under Copy-on-Write.
sommData = sommData.fillna({col: "none" for col in stringColList})

sommData.head()

In [None]:
# replace NaN with 0 for number based columns
numColList = ["points", "price"]
# Fill through the frame itself and rebind the result. Calling
# fillna(inplace=True) on sommData[col] is chained assignment: it operates on a
# column selection, warns in recent pandas and does nothing under Copy-on-Write.
sommData = sommData.fillna({col: 0 for col in numColList})

sommData.head()

In [None]:
# drop reviews whose description repeats an earlier one (the first occurrence is kept)
is_repeat = sommData.duplicated('description')
dups = sommData[is_repeat]          # the discarded rows, kept for inspection
sommData = sommData[~is_repeat]
print('Total unique reviews:', len(sommData))

In [None]:
# keep only the varieties that have more than 3 descriptions
counts = sommData['variety'].value_counts()
frequent_varieties = counts[counts > 3].index
keep_row = sommData['variety'].isin(frequent_varieties)
sommData = sommData[keep_row]
sommData.head()

In [None]:
# the review text is the feature (X); the grape variety is the target (y)
X, y = sommData["description"], sommData["variety"]

print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# split the data into train/test (stratified so each variety appears in both)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)


# vectorize the description data
# The vocabulary is learned from the training split only and persisted so the
# same mapping can be reused at inference time.
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)
joblib.dump(vectorizer, 'vectorizer.pkl')
X_train_vec = vectorizer.transform(X_train)

X_test_vec = vectorizer.transform(X_test)

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
joblib.dump(label_encoder, 'label_encoder.pkl')
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
# Pass the class count explicitly: without it to_categorical infers the width
# from the largest label present, so train and test could end up with
# different numbers of columns.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

print(y_test_categorical.shape, y_train_categorical.shape)

In [None]:
# Create Deep Learning Model
from keras.models import Sequential
from keras.layers import Dense

# One hidden layer over the bag-of-words input, followed by a softmax over the
# varieties. The output width is taken from the one-hot labels instead of being
# hard-coded (it was 364), so the model stays valid when the variety filter or
# the data changes.
n_varieties = y_train_categorical.shape[1]

model = Sequential()
model.add(Dense(units=750, activation='relu', input_dim=X_train_vec.shape[1]))
#model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=n_varieties, activation='softmax'))

In [None]:
# compile the model and fit it
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# verbose was 364, which is not a documented verbosity level (0 = silent,
# 1 = progress bar, 2 = one line per epoch). Use 2 so each epoch's loss and
# accuracy are logged without flooding the notebook output.
model.fit(
    X_train_vec,
    y_train_categorical,
    epochs=10,
    shuffle=True,
    verbose=2
)

In [None]:
# Score the model on the held-out split. verbose was 364, which is not a
# documented verbosity level; 0 keeps evaluation silent because the metrics are
# printed explicitly on the next line.
model_loss, model_accuracy = model.evaluate(X_test_vec, y_test_categorical, verbose=0)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# test the data: compare a handful of held-out predictions with their true labels
# Sequential.predict_classes no longer exists in current Keras; taking the
# argmax of the softmax output is the equivalent for a multi-class model.
encoded_predictions = np.argmax(model.predict(X_test_vec[100:105], verbose=0), axis=-1)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)
print(f"Predicted classes: {prediction_labels}")
# The predictions come from the test split, so the reference labels must too
# (this previously printed y_train[100:105], which are unrelated rows).
print(f"Actual Labels: {list(y_test.iloc[100:105])}")

In [None]:
# write the trained model to ML_Somm.h5 so it can be reloaded later
model.save(filepath="ML_Somm.h5")

In [None]:
from keras.models import load_model

# restore the model from the file written by model.save
model = load_model(filepath="ML_Somm.h5")

In [None]:
# code to run user input
test = ["Currant, Plum, Black Cherry & Spice, with notes of Olive, Vanilla Mint, Tobacco, Toasty Cedar, Anise, Pepper & Herbs"]
# Reuse the fitted vectorizer so the text maps onto the training vocabulary.
test_vec = vectorizer.transform(test)
# Sequential.predict_classes no longer exists in current Keras; taking the
# argmax of the softmax output is the equivalent for a multi-class model.
encoded_test = np.argmax(model.predict(test_vec, verbose=0), axis=-1)
# Map the class index back to the variety name.
predict_label = label_encoder.inverse_transform(encoded_test)
print(predict_label)

In [None]:
from keras.models import load_model

# reload the saved model from ML_Somm.h5 (repeats the earlier load cell)
model = load_model(filepath="ML_Somm.h5")

In [3]:
# code required for the twitter bot:
# fetch the saved model from S3 into the local file somm_model.h5
import boto3
import botocore

BUCKET_NAME = 'mlsomm' # replace with your bucket name
KEY = 'ML_Somm.h5' # replace with your object key

s3 = boto3.resource('s3')

try:
    bucket = s3.Bucket(BUCKET_NAME)
    bucket.download_file(KEY, 'somm_model.h5')
except botocore.exceptions.ClientError as err:
    error_code = err.response['Error']['Code']
    # only a missing object is reported; every other client error propagates
    if error_code != "404":
        raise
    print("The object does not exist.")

In [4]:
# code required for the twitter bot:
# Standalone inference: loads the persisted vectorizer, label encoder and model
# from disk and predicts the variety for one sample description.

# import dependencies
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; use the standalone package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from keras.models import load_model

# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only load .pkl files produced by this project.
vectorizer = joblib.load('vectorizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')



# load the model
model = load_model("somm_model.h5")
# model = load_model("ML_Somm.h5")

# test the data
test = ["Currant, Plum, Black Cherry & Spice, with notes of Olive, Vanilla Mint, Tobacco, Toasty Cedar, Anise, Pepper & Herbs"]
test_vec = vectorizer.transform(test)
# Sequential.predict_classes no longer exists in current Keras; taking the
# argmax of the softmax output is the equivalent for a multi-class model.
encoded_test = np.argmax(model.predict(test_vec, verbose=0), axis=-1)
predict_label = label_encoder.inverse_transform(encoded_test)
print(predict_label)

Using TensorFlow backend.


['Red Blend']
