In [2]:
import pandas as pd
import numpy  as np
import sklearn as sk
import math as ma
import scipy.stats as st
import matplotlib
from matplotlib import pylab as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [27]:
# Load the whisky review table from the project-relative CSV file.
df = pd.read_csv('Data/whiskey_reviews.csv')
# Print (rows, columns) as a quick sanity check that the file was read.
print(df.shape)

(2247, 7)


In [4]:
# Tally how often each currency symbol occurs in the table.
currency_counts = df['currency'].value_counts()
print(currency_counts)

$    2247
Name: currency, dtype: int64


In [5]:
# Remove the 'ID' and 'currency' columns from the working frame.
# Rebinding (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
df = df.drop(columns=['ID', 'currency'], errors='ignore')
df.head()

Unnamed: 0,name,category,review.point,price,description
0,"Johnnie Walker Blue Label, 40%",Blended Scotch Whisky,97,225,"Magnificently powerful and intense. Caramels, ..."
1,"Black Bowmore, 1964 vintage, 42 year old, 40.5%",Single Malt Scotch,97,4500,What impresses me most is how this whisky evol...
2,"Bowmore 46 year old (distilled 1964), 42.9%",Single Malt Scotch,97,13500,There have been some legendary Bowmores from t...
3,"Compass Box The General, 53.4%",Blended Malt Scotch Whisky,96,325,With a name inspired by a 1926 Buster Keaton m...
4,"Chivas Regal Ultis, 40%",Blended Malt Scotch Whisky,96,160,"Captivating, enticing, and wonderfully charmin..."


In [6]:
# Show the number of reviews per whisky category.
category_counts = df['category'].value_counts()
print(category_counts)

Single Malt Scotch            1819
Blended Scotch Whisky          211
Blended Malt Scotch Whisky     132
Single Grain Whisky             57
Grain Scotch Whisky             28
Name: category, dtype: int64


In [7]:
# Summary statistics of the price column before any cleaning.
df.loc[:, 'price'].describe()

count     2247
unique     453
top        100
freq        85
Name: price, dtype: object

In [8]:
# Display the row(s) whose price is the free-text "either/or" set price.
is_set_price = df['price'].eq('$15,000 or $60,000/set')
df[is_set_price]

Unnamed: 0,name,category,review.point,price,description
19,"Balvenie 1973 43 year old, 46.6%",Single Malt Scotch,95,"$15,000 or $60,000/set",This expression was matured in a European oak ...


In [9]:
# Strip separator characters using the vectorized string accessor instead of
# a per-row Python lambda. regex=False is required: interpreted as a regular
# expression, '.' would match (and delete) every character.
# Unlike the lambda form, .str.replace passes missing values through rather
# than raising AttributeError on NaN.
df['price'] = (
    df['price']
    .str.replace(',', '', regex=False)
    # NOTE(review): removing '.' would turn a decimal price such as '39.99'
    # into '3999' - confirm the column holds no decimal prices.
    .str.replace('.', '', regex=False)
)
df['description'] = df['description'].str.replace(',', '', regex=False)
# NOTE(review): removing '.' from names also rewrites ABV figures
# ('40.5%' becomes '405%') - confirm this is intended.
df['name'] = df['name'].str.replace('.', '', regex=False)

In [10]:
# Map the few free-text price entries onto plain digit strings.
# Each pair is (text found in the column, replacement value).
price_fixups = (
    ('$15000 or $60000/set', '15000'),
    ('60000/set', '60000'),
    ('44/liter', '44'),
)
for raw_text, digits in price_fixups:
    df['price'] = df['price'].replace([raw_text], digits)

In [11]:
# Convert the cleaned price strings to integers; other columns are untouched.
df = df.astype({'price': int})

In [12]:
# Summary statistics of the price column after conversion to integers.
df.price.describe()

count      2247.000000
mean        709.205607
std        5660.577607
min          12.000000
25%          70.000000
50%         110.000000
75%         200.000000
max      157000.000000
Name: price, dtype: float64

In [13]:
# Summary statistics of the review score column.
df.loc[:, 'review.point'].describe()

count    2247.000000
mean       86.700045
std         4.054055
min        63.000000
25%        84.000000
50%        87.000000
75%        90.000000
max        97.000000
Name: review.point, dtype: float64

In [14]:
# One-hot encode the 'category' column into a dense indicator array.
# NOTE(review): `sparse=` and `get_feature_names()` belong to the older
# scikit-learn API this notebook was run with; newer releases use
# `sparse_output=` and `get_feature_names_out()` instead.
hotenc = OneHotEncoder(sparse=False)
category_onehot = hotenc.fit_transform(df[['category']])

# Build the indicator frame with the encoder's column names. Reusing
# df.index keeps the rows aligned with df when the two frames are later
# concatenated column-wise, even if df does not have a default 0..n-1 index.
df_cat = pd.DataFrame(
    category_onehot,
    columns=hotenc.get_feature_names(),
    index=df.index,
)

df_cat.head()

Unnamed: 0,x0_Blended Malt Scotch Whisky,x0_Blended Scotch Whisky,x0_Grain Scotch Whisky,x0_Single Grain Whisky,x0_Single Malt Scotch
0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [15]:
# The original text column is no longer needed once it has been encoded.
df = df.drop(columns=['category'])

In [16]:
# Append the one-hot category indicators as extra columns of the main frame.
frames_side_by_side = (df, df_cat)
df = pd.concat(frames_side_by_side, axis='columns')
df.head()

Unnamed: 0,name,review.point,price,description,x0_Blended Malt Scotch Whisky,x0_Blended Scotch Whisky,x0_Grain Scotch Whisky,x0_Single Grain Whisky,x0_Single Malt Scotch
0,"Johnnie Walker Blue Label, 40%",97,225,Magnificently powerful and intense. Caramels d...,0.0,1.0,0.0,0.0,0.0
1,"Black Bowmore, 1964 vintage, 42 year old, 405%",97,4500,What impresses me most is how this whisky evol...,0.0,0.0,0.0,0.0,1.0
2,"Bowmore 46 year old (distilled 1964), 429%",97,13500,There have been some legendary Bowmores from t...,0.0,0.0,0.0,0.0,1.0
3,"Compass Box The General, 534%",96,325,With a name inspired by a 1926 Buster Keaton m...,1.0,0.0,0.0,0.0,0.0
4,"Chivas Regal Ultis, 40%",96,160,Captivating enticing and wonderfully charming ...,1.0,0.0,0.0,0.0,0.0


In [17]:
# Instantiate one scaler of each kind for the numeric columns.
sscaler, mmscaler = StandardScaler(), MinMaxScaler()

In [18]:
# Fit each scaler on its column and compute the transformed values first ...
review_scaled = mmscaler.fit_transform(df[['review.point']])
price_scaled = sscaler.fit_transform(df[['price']])

# ... then write them back over the original columns.
df[['review.point']] = review_scaled
df[['price']] = price_scaled

In [19]:
# Preview the first five rows after scaling.
df.head(n=5)

Unnamed: 0,name,review.point,price,description,x0_Blended Malt Scotch Whisky,x0_Blended Scotch Whisky,x0_Grain Scotch Whisky,x0_Single Grain Whisky,x0_Single Malt Scotch
0,"Johnnie Walker Blue Label, 40%",1.0,-0.085559,Magnificently powerful and intense. Caramels d...,0.0,1.0,0.0,0.0,0.0
1,"Black Bowmore, 1964 vintage, 42 year old, 405%",1.0,0.669832,What impresses me most is how this whisky evol...,0.0,0.0,0.0,0.0,1.0
2,"Bowmore 46 year old (distilled 1964), 429%",1.0,2.26013,There have been some legendary Bowmores from t...,0.0,0.0,0.0,0.0,1.0
3,"Compass Box The General, 534%",0.970588,-0.067889,With a name inspired by a 1926 Buster Keaton m...,1.0,0.0,0.0,0.0,0.0
4,"Chivas Regal Ultis, 40%",0.970588,-0.097044,Captivating enticing and wonderfully charming ...,1.0,0.0,0.0,0.0,0.0


In [20]:
# Using the NLTK library, we can do a lot of text preprocessing
import nltk as nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
# function to split text into words
#tokens = word_tokenize("The quick brown fox jumps over the lazy dog")
#nltk.download('stopwords')
#print(tokens)

In [21]:
# Lower-case both free-text columns so later token matching is case-insensitive.
for text_column in ('description', 'name'):
    df[text_column] = df[text_column].str.lower()

In [22]:
# Collect NLTK's English stop-word list into a set for fast membership checks.
english_stop_list = stopwords.words('english')
stop_words = set(english_stop_list)

In [23]:
from nltk.tokenize import word_tokenize

In [26]:
# Join every review into one space-separated string.
description = df['description'].str.cat(sep=' ')

# Split the combined text into tokens and report the number of distinct ones.
tokens = word_tokenize(description)
vocabulary = set(tokens)
print(len(vocabulary))

# Count occurrences per token, then list the 50 most frequent tokens,
# highest count first.
frequency_dist = nltk.FreqDist(tokens)
ranked_tokens = sorted(
    frequency_dist.items(),
    key=lambda token_and_count: token_and_count[1],
    reverse=True,
)
[token for token, count in ranked_tokens[:50]]

10451


['.',
 'and',
 'the',
 'a',
 'of',
 'with',
 'in',
 'is',
 'this',
 'to',
 'on',
 '’',
 'it',
 'finish',
 '(',
 ')',
 's',
 'palate',
 'notes',
 'nose',
 'whisky',
 'oak',
 'vanilla',
 'but',
 'fruit',
 'sweet',
 'for',
 'sherry',
 'more',
 'smoke',
 'old',
 'that',
 'from',
 'malt',
 'an',
 'chocolate',
 'has',
 'very',
 'its',
 'year',
 'as',
 'toffee',
 'there',
 'peat',
 'cask',
 'spice',
 'fruits',
 'honey',
 'casks',
 'are']