# sommelier.ai
#### Practical Machine Learning Workshop

### Agenda:
- Data Exploration with pandas
- Modeling with scikit-learn

### Tools and Documentation
- [pandas](https://pandas.pydata.org/pandas-docs/stable/api.html)
- [scikit-learn](http://scikit-learn.org/stable/index.html)
- [matplotlib](https://matplotlib.org/api/api_overview.html)


## Data Exploration

In [None]:
# These 'magics' alter the behavior of the Jupyter notebook
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
# Load IPython's autoreload extension and select mode 2, which (per the
# autoreload documentation linked above) reloads all imported modules before
# each cell executes, so edits to local .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from workshop import boxplot_sorted

# Global plot styling: seaborn's "darkgrid" theme is set first, then
# matplotlib's "fivethirtyeight" style sheet is applied after it.
sns.set(style="darkgrid")
plt.style.use("fivethirtyeight")

In [None]:
# Load the winemag data into a DataFrame. pandas infers the compression from
# the ".zip" extension and reads the CSV inside the archive.
df = pd.read_csv("data/winemag-data.zip")
# Preview the first rows to see which columns are available.
df.head()

### Loc example

In [None]:
# Boolean-mask row selection with .loc: Washington wines scored above 98.
df.loc[df["province"].eq("Washington") & df["points"].gt(98)]

In [None]:
# .iloc selects by integer position: the row at position 45.
df.iloc[45]

## How are wines scored?

In [None]:
# Summary statistics of the scores.
df["points"].describe()

In [None]:
# Distribution of scores; the trailing ';' suppresses the cell's text output.
df["points"].plot(kind="hist", title="Points");

## Data Challenges
- what are the worst wines in the US?
- how many tasters are there?
- how long are the descriptions?
- what are the top 20 wineries by number of wines? how do their points compare?
- what is the most produced variety?
- what is the highest rated variety?
- what are the most controversial wine varieties?
- are some tasters pickier than others?
- what are the top 10 best value wines?
- given a taster, what are their favorite varieties?

In [None]:
# Names of the 20 wineries with the most rows (a missing winery is counted
# as its own entry because dropna=False).
top20 = df["winery"].value_counts(dropna=False).head(20).index
top20

In [None]:
# Points per winery, restricted to the 20 most-reviewed wineries.
boxplot_sorted(df.loc[df["winery"].isin(top20)], by="winery", column="points");

In [None]:
# Per-winery score statistics for the 20 wineries with the most scored wines.
(df.groupby("winery")["points"]
   .describe()
   .sort_values(by="count", ascending=False)
   .head(20))

In [None]:
# Pull the four-digit vintage (19xx or 20xx) out of the title.
# The century alternation must be grouped: an ungrouped `19|20\d{2}` means
# "19" OR "20dd", so a 1998 vintage would be captured as just "19".
# expand=False returns a Series; to_numeric stores the year as a number
# (NaN when the title has no year) so it can be median-imputed and scaled
# by the modelling pipeline.
df["year"] = pd.to_numeric(
    df.title.str.extract(r"((?:19|20)\d{2})", expand=False))

In [None]:
# Number of reviews per taster (rows without a taster name are not counted).
df["taster_name"].value_counts()

In [None]:
# Rows per country (missing included), smallest first, drawn as horizontal
# bars on a log-scaled x axis.
(df["country"]
   .value_counts(ascending=True, dropna=False)
   .plot(kind="barh", figsize=(10, 12), logx=True));

In [None]:
import re

# Known country names, used to build a single-capture-group alternation.
# read_csv loads empty fields as NaN rather than '', so both forms of
# "missing" are excluded: a NaN left in the list would make str.join raise
# TypeError.
countries = (df.loc[df.country.notna() & (df.country != ''), 'country']
               .unique()
               .tolist())

# Escape every name so any regex metacharacter in it is matched literally.
countries_regex = '(' + '|'.join(re.escape(c) for c in countries) + ')'
countries_regex

In [None]:
# For rows with no country, look for a known country name in the review text.
# Missing values come out of read_csv as NaN, so test for NaN as well as the
# empty string; comparing with '' alone selects nothing. Rows whose
# description mentions no known country are dropped.
found_countries = (df.loc[df.country.isna() | (df.country == ''), 'description']
                     .str.extract(countries_regex)
                     .dropna())
found_countries

In [None]:
# Write the country names recovered from the descriptions back into the
# matching rows (aligned by index label), then display the updated values.
df.loc[found_countries.index, 'country'] = found_countries.values
df.loc[found_countries.index, 'country']

In [None]:
# Histogram of description lengths in characters.
(df["description"]
   .str.len()
   .plot(kind="hist", title='Description length')
   .set(xlabel="Length"));

In [None]:
# Score distribution again, for reference next to the per-taster box plots.
df["points"].plot(kind="hist");

In [None]:
# Box plot of points grouped by taster. boxplot_sorted is a helper imported
# from the workshop module; presumably it orders the boxes — confirm the
# sort key there.
boxplot_sorted(df, by="taster_name", column="points");

In [None]:
def get_favs(name, min_count=10, data=None):
    """Rank the varieties a taster scores highest.

    Groups one taster's reviews by variety, summarises the points of each
    variety, and orders the varieties by their 95th-percentile score, best
    first. Varieties with fewer than ``min_count`` scored reviews are left
    out.

    Parameters
    ----------
    name : str
        Value matched exactly against the ``taster_name`` column.
    min_count : int, default 10
        Minimum number of scored reviews a variety needs to be listed.
    data : pandas.DataFrame, optional
        Reviews with ``taster_name``, ``variety`` and ``points`` columns.
        Defaults to the notebook-level ``df``, looked up at call time.

    Returns
    -------
    pandas.DataFrame
        One row per variety holding the ``describe`` statistics, including
        a ``95%`` column, sorted by ``95%`` in descending order.
    """
    if data is None:
        data = df  # fall back to the notebook's global frame
    stats = (data[data.taster_name == name]
             .groupby('variety')['points']
             .describe(percentiles=[.95])
             .sort_values('95%', ascending=False))
    return stats[stats['count'] >= min_count]

# First 30 rows of one taster's variety ranking (default min_count of 10).
get_favs("Virginie Boone").head(30)

In [None]:
# The 15 varieties whose points vary the most; varieties with an undefined
# variance (NaN) are dropped before ranking.
(df.groupby(['variety'])['points']
   .var()
   .dropna()
   .sort_values(ascending=False)
   .head(15)
   .plot(kind="barh"));

In [None]:
# Score summary, consulted when choosing the cut-off for the label below.
df["points"].describe()

In [None]:
# Binary target: True for wines scored strictly above 88 points.
df['is_good'] = df['points'].gt(88)

In [None]:
from sklearn import metrics
from sklearn.pipeline import *
from sklearn.feature_extraction.text import *
from sklearn.linear_model import *
from sklearn.naive_bayes import *
from sklearn.model_selection import *
from sklearn.compose import *
from sklearn.impute import *
from sklearn.preprocessing import *

from workshop import show_most_informative_features

def evaluate(model, X, y):
    """Print how well a fitted model predicts ``y`` from ``X``.

    Runs ``model.predict(X)`` and prints the accuracy (three decimals)
    followed by scikit-learn's classification report. Returns nothing.
    """
    y_pred = model.predict(X)
    print('\nAccuracy: %0.3f' % metrics.accuracy_score(y, y_pred))

    print(metrics.classification_report(y, y_pred))

In [None]:
# Split into train and test sets with a fixed seed. The label itself and the
# 'price' and 'points' columns are removed from the feature frame.
train_df, test_df, train_labels, test_labels = train_test_split(
    df.drop(columns=['is_good', 'price', 'points']),
    df['is_good'],
    random_state=3,
)

In [None]:
%%time

# Baseline: token counts of the description fed into multinomial naive Bayes.
count_model = make_pipeline(CountVectorizer(), MultinomialNB())
count_model.fit(train_df["description"], train_labels)

evaluate(count_model, test_df["description"], test_labels)

In [None]:
%%time

# TF-IDF over 1- to 3-grams of the description column, followed by a linear
# classifier trained with SGD. The column transformer picks the text column,
# so the pipeline is fitted on and predicts from the whole DataFrame.
tf_idf_model = make_pipeline(
            make_column_transformer(
                (TfidfVectorizer(ngram_range=(1,3)), "description")),
            SGDClassifier(n_jobs=-1, max_iter=1000))

tf_idf_model.fit(train_df, train_labels)

evaluate(tf_idf_model, test_df, test_labels)

In [None]:
# Inspect the fitted TF-IDF model with the helper imported from the workshop
# module (what it prints is defined there).
show_most_informative_features(tf_idf_model)

In [None]:
%%time

# Features beyond the review text: two categorical columns and the year.
categorical_features = ['country', 'winery']
# Missing categories become the literal label 'missing' before one-hot
# encoding; categories not seen during fit are ignored at predict time.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'))

model = make_pipeline(
            make_column_transformer(
                (TfidfVectorizer(), "description"),
                (categorical_transformer, categorical_features),
                # year: fill gaps with the median, then standardize
                (make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler()), ["year"])),
            SGDClassifier(n_jobs=-1, max_iter=1000))

model.fit(train_df, train_labels)

# Report accuracy and the classification report through the shared helper,
# the same way the other model cells do, instead of repeating its body here.
evaluate(model, test_df, test_labels)

In [None]:
# Inspect the combined text + categorical + year model with the helper
# imported from the workshop module (what it prints is defined there).
show_most_informative_features(model)

In [None]:
def get_failures(model, X, y, cv=3):
    """Collect the rows a model misclassifies under cross-validation.

    Obtains out-of-fold predictions with ``cross_val_predict`` (``cv`` folds,
    all cores), prints the confusion matrix, and returns a pair
    ``(false_negatives, false_positives)``: the rows of ``X`` whose label is
    True but predicted False, and those whose label is False but predicted
    True.
    """
    y_hat = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

    print("Confusion matrix (actual x prediction):")
    print(metrics.confusion_matrix(y, y_hat))

    missed = (y == True) & (y_hat == False)
    false_alarms = (y == False) & (y_hat == True)
    return X[missed], X[false_alarms]

In [None]:
# get_failures returns (false negatives, false positives); name the second
# value accordingly rather than `tp`, which would read as true positives.
fn, fp = get_failures(model, train_df, train_labels)