### 1. First things first: importing pandas, numpy, etc

In [1]:
# Preliminary - importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

### 2. Reading the CSV file with the data
This csv file below (jennings.csv) contains a subset of the data - it is the tasting notes of one user (userid=70, username="Richard Jennings") who has written approximately 30,000 notes/reviews

In [4]:
# Reading in the data from the CSV file
bigdf = pd.read_csv('jennings.csv', usecols=[1,2,3,4,5,6,7,8])

In [5]:
bigdf.head()

Unnamed: 0,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text
0,540265,Syrah,2006,94,1255478400,70,Richard Jennings,"Lovely pepper, lavender, mineral nose; tasty ..."
1,111701,Cabernet Sauvignon,1997,86,1077926400,70,Richard Jennings,"Roasted berry, sherry and VA nose; light cher..."
2,1210627,"Menu Pineau, Arbois",2010,91,1330214400,70,Richard Jennings,"Light yellow color; mineral, tart green apple..."
3,881585,Riesling,2005,91,1279929600,70,Richard Jennings,"Light butter yellow color; nice herbal, mint,..."
4,146940,Chardonnay,2001,89,1125187200,70,Richard Jennings,"Nice citrus nose; tasty, citrus, mineral pala..."


In [6]:
# Examine the data
bigdf.count()

w.id          29511
w.type        29511
w.year        29511
r.rating      29511
r.time        29511
r.userid      29511
r.username    29511
r.text        29511
dtype: int64

In [7]:
# We can see that the average rating given is 89.8 points
bigdf.describe()

Unnamed: 0,w.id,w.year,r.rating,r.time,r.userid
count,29511.0,29511.0,29511.0,29511.0,29511.0
mean,605341.6,2002.239233,89.766257,1249956000.0,70.0
std,442597.7,11.429247,3.397797,75244530.0,0.0
min,2.0,1805.0,50.0,1044403000.0,70.0
25%,183129.5,2001.0,88.0,1207958000.0,70.0
50%,592964.0,2006.0,90.0,1268870000.0,70.0
75%,959555.5,2008.0,92.0,1310429000.0,70.0
max,1450596.0,2011.0,100.0,1347494000.0,70.0


In [8]:
#Getting rid of empty data
bigdf = bigdf.dropna()
bigdf.count()

w.id          29511
w.type        29511
w.year        29511
r.rating      29511
r.time        29511
r.userid      29511
r.username    29511
r.text        29511
dtype: int64

In [11]:
# Make a copy
df = bigdf

In [12]:
df.head()

Unnamed: 0,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text
0,540265,Syrah,2006,94,1255478400,70,Richard Jennings,"Lovely pepper, lavender, mineral nose; tasty ..."
1,111701,Cabernet Sauvignon,1997,86,1077926400,70,Richard Jennings,"Roasted berry, sherry and VA nose; light cher..."
2,1210627,"Menu Pineau, Arbois",2010,91,1330214400,70,Richard Jennings,"Light yellow color; mineral, tart green apple..."
3,881585,Riesling,2005,91,1279929600,70,Richard Jennings,"Light butter yellow color; nice herbal, mint,..."
4,146940,Chardonnay,2001,89,1125187200,70,Richard Jennings,"Nice citrus nose; tasty, citrus, mineral pala..."


In [13]:
df = df.reset_index(drop=True)

In [14]:
df['r.rating'] = pd.to_numeric(df['r.rating'])

In [15]:
df['w.year'] = pd.to_numeric(df['w.year'])

In [16]:
df['cat'] = np.where(df['r.rating'] < 80, "OK", np.where(df['r.rating'] < 90, "Good", np.where(df['r.rating'] < 96, "Very Good", "Collectible")))

In [17]:
df.head(3)

Unnamed: 0,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
0,540265,Syrah,2006,94,1255478400,70,Richard Jennings,"Lovely pepper, lavender, mineral nose; tasty ...",Very Good
1,111701,Cabernet Sauvignon,1997,86,1077926400,70,Richard Jennings,"Roasted berry, sherry and VA nose; light cher...",Good
2,1210627,"Menu Pineau, Arbois",2010,91,1330214400,70,Richard Jennings,"Light yellow color; mineral, tart green apple...",Very Good


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29511 entries, 0 to 29510
Data columns (total 9 columns):
w.id          29511 non-null int64
w.type        29511 non-null object
w.year        29511 non-null int64
r.rating      29511 non-null int64
r.time        29511 non-null int64
r.userid      29511 non-null int64
r.username    29511 non-null object
r.text        29511 non-null object
cat           29511 non-null object
dtypes: int64(5), object(4)
memory usage: 2.0+ MB


In [19]:
df_cabs = df[df['w.type'] == " Cabernet Sauvignon"]

In [20]:
df_cabs.head()

Unnamed: 0,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
1,111701,Cabernet Sauvignon,1997,86,1077926400,70,Richard Jennings,"Roasted berry, sherry and VA nose; light cher...",Good
23,1104266,Cabernet Sauvignon,2007,89,1318377600,70,Richard Jennings,Very dark red violet color; tart currant nose...,Good
45,1299253,Cabernet Sauvignon,2006,86,1321142400,70,Richard Jennings,"Dark red violet color; ripe black currant, bl...",Good
68,36950,Cabernet Sauvignon,1988,92,1223510400,70,Richard Jennings,"Group's #3 (my #3) ? 39 pts; 0, 3, 4, 1 ? Dar...",Very Good
69,41880,Cabernet Sauvignon,1984,91,1161907200,70,Richard Jennings,"Dark fig and black fruit nose; concentrated, ...",Very Good


In [21]:
df_cabs.reset_index(drop=True)

Unnamed: 0,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
0,111701,Cabernet Sauvignon,1997,86,1077926400,70,Richard Jennings,"Roasted berry, sherry and VA nose; light cher...",Good
1,1104266,Cabernet Sauvignon,2007,89,1318377600,70,Richard Jennings,Very dark red violet color; tart currant nose...,Good
2,1299253,Cabernet Sauvignon,2006,86,1321142400,70,Richard Jennings,"Dark red violet color; ripe black currant, bl...",Good
3,36950,Cabernet Sauvignon,1988,92,1223510400,70,Richard Jennings,"Group's #3 (my #3) ? 39 pts; 0, 3, 4, 1 ? Dar...",Very Good
4,41880,Cabernet Sauvignon,1984,91,1161907200,70,Richard Jennings,"Dark fig and black fruit nose; concentrated, ...",Very Good
5,563010,Cabernet Sauvignon,2004,90,1219622400,70,Richard Jennings,"Minty, cassis nose; blackberry and cassis pal...",Very Good
6,581721,Cabernet Sauvignon,1976,87,1303084800,70,Richard Jennings,"Group's #7 (my #6) ? 71 pts.; 2, 1, 0, 4 - sl...",Good
7,1353168,Cabernet Sauvignon,2003,89,1240444800,70,Richard Jennings,Tart red fruit and Cab Franc nose; tart Cab F...,Good
8,1036927,Cabernet Sauvignon,2008,91,1290211200,70,Richard Jennings,"Berry, plum, cassis nose; tasty, plush, berry...",Very Good
9,1421695,Cabernet Sauvignon,2009,90,1341964800,70,Richard Jennings,"Dark red violet color; baked black fruit, ber...",Very Good


In [22]:
df_cabs.head()

Unnamed: 0,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
1,111701,Cabernet Sauvignon,1997,86,1077926400,70,Richard Jennings,"Roasted berry, sherry and VA nose; light cher...",Good
23,1104266,Cabernet Sauvignon,2007,89,1318377600,70,Richard Jennings,Very dark red violet color; tart currant nose...,Good
45,1299253,Cabernet Sauvignon,2006,86,1321142400,70,Richard Jennings,"Dark red violet color; ripe black currant, bl...",Good
68,36950,Cabernet Sauvignon,1988,92,1223510400,70,Richard Jennings,"Group's #3 (my #3) ? 39 pts; 0, 3, 4, 1 ? Dar...",Very Good
69,41880,Cabernet Sauvignon,1984,91,1161907200,70,Richard Jennings,"Dark fig and black fruit nose; concentrated, ...",Very Good


In [24]:
X = df_cabs[['r.text']].fillna("")

In [25]:
y = df_cabs['cat']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

In [27]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [28]:
type(X_train)

pandas.core.frame.DataFrame

In [29]:
type(X_test)

pandas.core.frame.DataFrame

In [30]:
type(y_train)

pandas.core.series.Series

In [31]:
type(y_test)

pandas.core.series.Series

### Next steps
Apply NLP model to see if we can predict the rating based on the description

# CountVectorizer

In [33]:
X = X_train['r.text']

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 2000, 
                             ngram_range=(1, 4), 
                             stop_words='english',
                             binary=True,
                             decode_error="ignore")

vectorizer.fit(X)

X_vectorized = vectorizer.transform(X).toarray()


In [37]:
type(X_vectorized)

numpy.ndarray

# RandomForest (model)

In [41]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 2000, n_jobs=2)

In [42]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_vectorized, y_train, scoring='accuracy')

print('CV Accuracy {}, Average Accuracy {}'.format(scores, scores.mean()))

CV Accuracy [ 0.78763867  0.80095541  0.80701754], Average Accuracy 0.798537208884


In [43]:
model.fit(X_vectorized, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

# OneVsRest (model1)

In [44]:
from sklearn.multiclass import OneVsRestClassifier

model2 = OneVsRestClassifier(model)

In [45]:
model2.fit(X_vectorized,y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
          n_jobs=1)

In [46]:
scores2 = cross_val_score(model2, X_vectorized,  y_train, scoring='accuracy')
print('CV Accuracy {}, Average Accuracy {}'.format(scores2, scores2.mean()))

CV Accuracy [ 0.7844691   0.79936306  0.81020734], Average Accuracy 0.798013163507


# Fit & Transform

In [47]:
X_test_transformed = vectorizer.transform(X_test['r.text']).toarray()

In [51]:
y_predicted = model.predict(X_test_transformed)

In [54]:
model.classes_

array(['Collectible', 'Good', 'OK', 'Very Good'], dtype=object)

In [55]:
model.get_params

<bound method RandomForestClassifier.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False)>

In [56]:
model.score

<bound method RandomForestClassifier.score of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False)>

In [57]:
model.predict_proba(X_test_transformed)

array([[  1.00000000e-03,   1.05000000e-02,   0.00000000e+00,
          9.88500000e-01],
       [  5.00000000e-04,   4.70000000e-02,   1.00000000e-03,
          9.51500000e-01],
       [  0.00000000e+00,   4.85000000e-02,   0.00000000e+00,
          9.51500000e-01],
       [  0.00000000e+00,   2.75000000e-02,   5.00000000e-04,
          9.72000000e-01],
       [  5.00000000e-03,   2.26500000e-01,   5.00000000e-04,
          7.68000000e-01],
       [  0.00000000e+00,   9.50500000e-01,   9.50000000e-03,
          4.00000000e-02],
       [  6.50000000e-03,   5.30000000e-01,   7.00000000e-03,
          4.56500000e-01],
       [  2.50000000e-03,   6.74500000e-01,   2.92500000e-01,
          3.05000000e-02],
       [  6.00000000e-03,   9.90000000e-02,   0.00000000e+00,
          8.95000000e-01],
       [  1.30000000e-02,   1.01000000e-01,   0.00000000e+00,
          8.86000000e-01],
       [  1.40000000e-02,   1.51500000e-01,   0.00000000e+00,
          8.34500000e-01],
       [  2.50000000e

In [58]:
model.score(X_test_transformed, y_test)

0.84285714285714286

In [59]:
model2.predict(X_test_transformed)

array(['Very Good', 'Very Good', 'Very Good', 'Very Good', 'Very Good',
       'Good', 'Good', 'Good', 'Very Good', 'Very Good', 'Very Good',
       'Good', 'Very Good', 'Very Good', 'Good', 'Good', 'Good',
       'Very Good', 'Good', 'Very Good', 'Good', 'Good', 'Good',
       'Very Good', 'Very Good', 'Very Good', 'Very Good', 'Very Good',
       'Good', 'Good', 'Very Good', 'Very Good', 'Good', 'Very Good',
       'Very Good', 'Very Good', 'Very Good', 'Very Good', 'Good',
       'Very Good', 'Good', 'Very Good', 'Very Good', 'Very Good',
       'Very Good', 'Very Good', 'Very Good', 'Very Good', 'Good',
       'Very Good', 'Good', 'Good', 'Good', 'Very Good', 'Good',
       'Very Good', 'Good', 'Very Good', 'Very Good', 'Good', 'Very Good',
       'Very Good', 'Very Good', 'Very Good', 'Very Good', 'Good',
       'Very Good', 'Very Good', 'Very Good', 'Very Good', 'Good', 'Good',
       'Very Good', 'Very Good', 'Very Good', 'Good', 'Good', 'Good',
       'Good', 'Very Good', 'Very

In [60]:
model2.predict_proba(X_test_transformed)

array([[  1.50602410e-03,   9.53815261e-03,   2.00803213e-03,
          9.86947791e-01],
       [  3.46878097e-03,   5.45094153e-02,   4.95540139e-04,
          9.41526264e-01],
       [  0.00000000e+00,   4.61080813e-02,   1.98314328e-03,
          9.51908775e-01],
       [  0.00000000e+00,   3.15737543e-02,   1.08534780e-02,
          9.57572768e-01],
       [  3.58056266e-03,   2.31713555e-01,   5.11508951e-04,
          7.64194373e-01],
       [  1.62059104e-02,   9.10390848e-01,   3.14585319e-02,
          4.19447092e-02],
       [  1.36986301e-02,   5.19569472e-01,   1.46771037e-03,
          4.65264188e-01],
       [  1.01163379e-03,   6.80829540e-01,   2.88315630e-01,
          2.98431968e-02],
       [  1.23885035e-02,   8.57284440e-02,   4.95540139e-04,
          9.01387512e-01],
       [  5.10725230e-03,   1.00612870e-01,   4.08580184e-03,
          8.90194076e-01],
       [  1.23329908e-02,   1.23843782e-01,   0.00000000e+00,
          8.63823227e-01],
       [  2.63296472e

In [61]:
model2.score(X_test_transformed, y_test)

0.84285714285714286