### 1. First things first: importing pandas, numpy, etc

In [34]:
# Preliminary - importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

### 2. Reading the CSV file with the data
This csv file below (jennings.csv) contains a subset of the data - it is the tasting notes of one user (userid=70, username="Richard Jennings") who has written approximately 30,000 notes/reviews

In [35]:
# Reading in the data from the CSV file
bigdf = pd.read_csv('Rated_305k.csv', usecols=[1,2,3,4,5,6,7,8,9])

In [36]:
# Examine the data
bigdf.count()

w.name        304827
w.id          304827
w.type        304827
w.year        304827
r.rating      304827
r.time        304827
r.userid      304827
r.username    304827
r.text        304827
dtype: int64

In [37]:
bigdf.head()

Unnamed: 0,w.name,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text
0,1981 Chateau de Beaucastel Chateauneuf-du-Pape,18856,Red Rhone Blend,1981.0,96.0,1160179200,1,Eric,"Olive, horse sweat, dirty saddle, and smoke. T..."
1,1995 Chateau Pichon-Longueville Baron,3495,Red Bordeaux Blend,1995.0,93.0,1063929600,1,Eric,A remarkably floral nose with violet and chamb...
2,2001 Thierry Allemand Cornas Reynard,40451,Syrah,2001.0,92.0,1195948800,1,Eric,"Fantastic wine! Blackberry, smoke, olive, stem..."
3,1990 Krug Champagne Brut,16892,Champagne Blend,1990.0,96.0,1134172800,1,Eric,"Much more yeasty, very Krug-like (guessed blin..."
4,2002 Weingut Hirsch Gruner Veltliner Kammern,17478,Gruner Veltliner,2002.0,92.0,1082851200,1,Eric,This shows a very ripe nose with chalky notes....


In [38]:
# We can see that the average rating given is 89.8 points
bigdf.describe()

Unnamed: 0,w.id,w.year,r.rating,r.time,r.userid
count,304827.0,304827.0,304827.0,304827.0,304827.0
mean,464593.5,2003.259701,88.878137,1251482000.0,54832.62231
std,370020.4,6.576554,4.149561,72362890.0,55374.242633
min,1.0,1795.0,50.0,464054400.0,1.0
25%,131636.0,2002.0,87.0,1208650000.0,12625.0
50%,408966.0,2005.0,89.0,1263600000.0,36708.0
75%,709911.0,2007.0,91.0,1306541000.0,81558.0
max,1466790.0,2012.0,100.0,1349482000.0,256379.0


In [39]:
#Getting rid of empty data
bigdf = bigdf.dropna()
bigdf.count()

w.name        304827
w.id          304827
w.type        304827
w.year        304827
r.rating      304827
r.time        304827
r.userid      304827
r.username    304827
r.text        304827
dtype: int64

In [40]:
bigdf['r.rating'].describe()

count    304827.000000
mean         88.878137
std           4.149561
min          50.000000
25%          87.000000
50%          89.000000
75%          91.000000
max         100.000000
Name: r.rating, dtype: float64

In [41]:
bigdf = bigdf[bigdf["w.year"] > 1950]
len(bigdf)

304487

In [42]:
# Make a copy
df = bigdf

In [43]:
df.head()

Unnamed: 0,w.name,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text
0,1981 Chateau de Beaucastel Chateauneuf-du-Pape,18856,Red Rhone Blend,1981.0,96.0,1160179200,1,Eric,"Olive, horse sweat, dirty saddle, and smoke. T..."
1,1995 Chateau Pichon-Longueville Baron,3495,Red Bordeaux Blend,1995.0,93.0,1063929600,1,Eric,A remarkably floral nose with violet and chamb...
2,2001 Thierry Allemand Cornas Reynard,40451,Syrah,2001.0,92.0,1195948800,1,Eric,"Fantastic wine! Blackberry, smoke, olive, stem..."
3,1990 Krug Champagne Brut,16892,Champagne Blend,1990.0,96.0,1134172800,1,Eric,"Much more yeasty, very Krug-like (guessed blin..."
4,2002 Weingut Hirsch Gruner Veltliner Kammern,17478,Gruner Veltliner,2002.0,92.0,1082851200,1,Eric,This shows a very ripe nose with chalky notes....


In [44]:
df = df.reset_index(drop=True)

In [45]:
df['r.rating'] = pd.to_numeric(df['r.rating'])

In [46]:
df['w.year'] = pd.to_numeric(df['w.year'])

In [47]:
df['cat'] = np.where(df['r.rating'] < 80, "OK", np.where(df['r.rating'] < 90, "Good", np.where(df['r.rating'] < 96, "Very Good", "Collectible")))

In [48]:
df.head(3)

Unnamed: 0,w.name,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
0,1981 Chateau de Beaucastel Chateauneuf-du-Pape,18856,Red Rhone Blend,1981.0,96.0,1160179200,1,Eric,"Olive, horse sweat, dirty saddle, and smoke. T...",Collectible
1,1995 Chateau Pichon-Longueville Baron,3495,Red Bordeaux Blend,1995.0,93.0,1063929600,1,Eric,A remarkably floral nose with violet and chamb...,Very Good
2,2001 Thierry Allemand Cornas Reynard,40451,Syrah,2001.0,92.0,1195948800,1,Eric,"Fantastic wine! Blackberry, smoke, olive, stem...",Very Good


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304487 entries, 0 to 304486
Data columns (total 10 columns):
w.name        304487 non-null object
w.id          304487 non-null int64
w.type        304487 non-null object
w.year        304487 non-null float64
r.rating      304487 non-null float64
r.time        304487 non-null int64
r.userid      304487 non-null int64
r.username    304487 non-null object
r.text        304487 non-null object
cat           304487 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 23.2+ MB


In [50]:
df_cabs = df[df['w.type'] == " Cabernet Sauvignon"]

In [51]:
df_cabs.head()

Unnamed: 0,w.name,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
1231,1999 Ross Andrew Winery Cabernet Sauvignon,2006,Cabernet Sauvignon,1999.0,92.0,1099958400,1,Eric,"Wow, this wine continues to deliver. Upon fir...",Very Good
1234,1997 Quilceda Creek Cabernet Sauvignon,379,Cabernet Sauvignon,1997.0,91.0,1117584000,1,Eric,A sweet nose of graham cracker. The palate is...,Very Good
1237,2001 Kiona Cabernet Sauvignon Reserve,4363,Cabernet Sauvignon,2001.0,88.0,1069372800,1,Eric,This has a stunningly Bordeaux-like nose with...,Good
1247,1991 Robert Mondavi Winery Cabernet Sauvignon...,4065,Cabernet Sauvignon,1991.0,97.0,1146268800,1,Eric,Wine of the night. Big notes of cola on the n...,Collectible
1253,1990 Leonetti Cellar Cabernet Sauvignon Washi...,2067,Cabernet Sauvignon,1990.0,91.0,1039305600,1,Eric,"Tasted with Patrick, Walt & Chuck. I REALLY l...",Very Good


In [52]:
df_cabs.reset_index(drop=True)

Unnamed: 0,w.name,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
0,1999 Ross Andrew Winery Cabernet Sauvignon,2006,Cabernet Sauvignon,1999.0,92.0,1099958400,1,Eric,"Wow, this wine continues to deliver. Upon fir...",Very Good
1,1997 Quilceda Creek Cabernet Sauvignon,379,Cabernet Sauvignon,1997.0,91.0,1117584000,1,Eric,A sweet nose of graham cracker. The palate is...,Very Good
2,2001 Kiona Cabernet Sauvignon Reserve,4363,Cabernet Sauvignon,2001.0,88.0,1069372800,1,Eric,This has a stunningly Bordeaux-like nose with...,Good
3,1991 Robert Mondavi Winery Cabernet Sauvignon...,4065,Cabernet Sauvignon,1991.0,97.0,1146268800,1,Eric,Wine of the night. Big notes of cola on the n...,Collectible
4,1990 Leonetti Cellar Cabernet Sauvignon Washi...,2067,Cabernet Sauvignon,1990.0,91.0,1039305600,1,Eric,"Tasted with Patrick, Walt & Chuck. I REALLY l...",Very Good
5,1976 Robert Mondavi Winery Cabernet Sauvignon...,159797,Cabernet Sauvignon,1976.0,91.0,1146268800,1,Eric,"Big, plummy, and minty on the fresh nose. Jus...",Very Good
6,1999 Andrew Will Cabernet Sauvignon Pepper Br...,32,Cabernet Sauvignon,1999.0,87.0,1124668800,1,Eric,Consumed across two days. On day 1 this was a...,Good
7,2003 Ross Andrew Winery Cabernet Sauvignon,150807,Cabernet Sauvignon,2003.0,91.0,1179964800,1,Eric,A rather alluring nose with lots of licorice ...,Very Good
8,1994 Quilceda Creek Cabernet Sauvignon,383,Cabernet Sauvignon,1994.0,96.0,1067731200,1,Eric,"Dhiren brought this one, bless his soul. This...",Collectible
9,1997 Gan Eden Cabernet Sauvignon Kosher Limit...,456,Cabernet Sauvignon,1997.0,85.0,1050451200,1,Eric,"This has a somewhat cooked nose, yet on the p...",Good


In [53]:
df_cabs.head()

Unnamed: 0,w.name,w.id,w.type,w.year,r.rating,r.time,r.userid,r.username,r.text,cat
1231,1999 Ross Andrew Winery Cabernet Sauvignon,2006,Cabernet Sauvignon,1999.0,92.0,1099958400,1,Eric,"Wow, this wine continues to deliver. Upon fir...",Very Good
1234,1997 Quilceda Creek Cabernet Sauvignon,379,Cabernet Sauvignon,1997.0,91.0,1117584000,1,Eric,A sweet nose of graham cracker. The palate is...,Very Good
1237,2001 Kiona Cabernet Sauvignon Reserve,4363,Cabernet Sauvignon,2001.0,88.0,1069372800,1,Eric,This has a stunningly Bordeaux-like nose with...,Good
1247,1991 Robert Mondavi Winery Cabernet Sauvignon...,4065,Cabernet Sauvignon,1991.0,97.0,1146268800,1,Eric,Wine of the night. Big notes of cola on the n...,Collectible
1253,1990 Leonetti Cellar Cabernet Sauvignon Washi...,2067,Cabernet Sauvignon,1990.0,91.0,1039305600,1,Eric,"Tasted with Patrick, Walt & Chuck. I REALLY l...",Very Good


In [54]:
X = df_cabs[['r.text']].fillna("")

In [55]:
y = df_cabs['cat']

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

In [57]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [58]:
type(X_train)

pandas.core.frame.DataFrame

In [59]:
type(X_test)

pandas.core.frame.DataFrame

In [60]:
type(y_train)

pandas.core.series.Series

In [61]:
type(y_test)

pandas.core.series.Series

In [62]:
y_test.head()

0    Very Good
1    Very Good
2    Very Good
3         Good
4         Good
Name: cat, dtype: object

### Next steps
Apply NLP model to see if we can predict the rating based on the description

# CountVectorizer

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 2000, 
                             ngram_range=(1, 4), 
                             stop_words='english',
                             binary=True,
                             decode_error="ignore")

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(X_train['r.text'])

# Use `tranform` to generate the sample X_train's word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(X_train['r.text']).toarray()


In [66]:
X1 = X
X1.shape

(5240L, 2000L)

In [67]:
type(y_train)

pandas.core.series.Series

# RandomForest

In [68]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 1000, n_jobs=2)

In [69]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X1, y_train, scoring='accuracy')

print('CV Accuracy {}, Average Accuracy {}'.format(scores, scores.mean()))

CV Accuracy [ 0.70611778  0.72394044  0.73868195], Average Accuracy 0.722913388431


# OneVsRest

In [70]:
from sklearn.multiclass import OneVsRestClassifier

model2 = OneVsRestClassifier(model)

In [71]:
model2.fit(X1,y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
          n_jobs=1)

In [72]:
scores2 = cross_val_score(model2, X1, y_train, scoring='accuracy')
print('CV Accuracy {}, Average Accuracy {}'.format(scores2, scores2.mean()))

CV Accuracy [ 0.70726129  0.7233677   0.74154728], Average Accuracy 0.724058755899


# Fit & Transform

In [73]:
model.fit(X1, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [74]:
X_test_transformed = vectorizer.transform(X_test['r.text']).toarray()

In [76]:
model.classes_

array(['Collectible', 'Good', 'OK', 'Very Good'], dtype=object)

In [77]:
model.get_params

<bound method RandomForestClassifier.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False)>

In [78]:
model.score

<bound method RandomForestClassifier.score of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False)>

In [79]:
model.predict_proba(X_test_transformed)

array([[ 0.036    ,  0.207    ,  0.007    ,  0.75     ],
       [ 0.02     ,  0.263625 ,  0.003    ,  0.713375 ],
       [ 0.06     ,  0.1497619,  0.0032381,  0.787    ],
       ..., 
       [ 0.       ,  0.522    ,  0.       ,  0.478    ],
       [ 0.003    ,  0.703    ,  0.021    ,  0.273    ],
       [ 0.072    ,  0.151    ,  0.001    ,  0.776    ]])

In [80]:
model.score(X_test_transformed, y_test)

0.7101200686106347

In [81]:
model2.fit(X1, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=2, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
          n_jobs=1)

In [83]:
model2.predict_proba(X_test_transformed)

array([[ 0.01737781,  0.20961986,  0.00403413,  0.76896819],
       [ 0.00726895,  0.25441329,  0.02076843,  0.71754933],
       [ 0.07561258,  0.13901314,  0.00108018,  0.7842941 ],
       ..., 
       [ 0.        ,  0.53405405,  0.        ,  0.46594595],
       [ 0.00096805,  0.70280736,  0.01161665,  0.28460794],
       [ 0.09661836,  0.14879227,  0.00772947,  0.7468599 ]])

In [84]:
model2.score(X_test_transformed, y_test)

0.69125214408233271