# sommelier.ai
#### Practical Machine Learning Workshop

### Agenda:
- Data exploration with pandas
- Modeling with scikit-learn

### Documentation
- [pandas](https://pandas.pydata.org/pandas-docs/stable/reference/index.html) docs
- [Modern Pandas](https://tomaugspurger.github.io/modern-1-intro.html) blog series
- [scikit-learn](http://scikit-learn.org/stable/index.html)

## Data Exploration

In [None]:
# These 'magics' alter the behavior of the Jupyter notebook
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# This cell is all for styling graphs

import matplotlib.pyplot as plt
import seaborn as sns

from workshop import boxplot_sorted

# Base look: seaborn's dark grid first, then the FiveThirtyEight stylesheet.
sns.set(style="darkgrid")
plt.style.use("fivethirtyeight")

# A single font size shared by every text element configured below.
FONT_SIZE = 16

# https://matplotlib.org/tutorials/introductory/customizing.html
# Options belonging to the same rc group are set in one call.
plt.rc("figure", figsize=(11, 8), titlesize=FONT_SIZE)    # default figure size, figure title font size
plt.rc("font", size=FONT_SIZE)                            # default text size
plt.rc("axes", titlesize=FONT_SIZE, labelsize=FONT_SIZE)  # axes title, x and y labels
plt.rc("xtick", labelsize=FONT_SIZE)                      # x tick labels
plt.rc("ytick", labelsize=FONT_SIZE)                      # y tick labels
plt.rc("legend", fontsize=FONT_SIZE)                      # legend text

In [None]:
wine_reviews: Path = Path.cwd() / "data" / "winemag-data.zip"

df: pd.DataFrame = pd.read_csv(wine_reviews)

df.head()

In [None]:
df.shape

In [None]:
df.duplicated().value_counts()

In [None]:
df = df.drop_duplicates()
df.shape

## Indexing

- Use `.loc` for label-based indexing
- Use `.iloc` for positional indexing

You can omit these, but the [Zen of Python](https://www.python.org/dev/peps/pep-0020/) says __explicit is better than implicit__, and there is a performance penalty when pandas has to infer which indexing method to use.

In [None]:
df.loc[(df.province == "Washington") & (df.points > 98)]

In [None]:
df.iloc[15:18]

In [None]:
# Selecting columns
df[["country", "winery"]]

## How are wines scored?

In [None]:
df.points.describe()

In [None]:
df.points.plot.hist(title="Points")

### How long are descriptions?

This example shows creating a new column and using the "str" accessor for [working with text data](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html).

In [None]:
df["word_count"] = df["description"].str.split().apply(len)
df

In [None]:
df["word_count"].plot.hist(title="Word Count");

In [None]:
# Gets the longest description by word count.
# idxmax() returns an index *label*, so the lookup goes through .loc;
# labels and positions no longer line up once duplicates have been dropped.
df.loc[df.word_count.idxmax(), "description"]

In [None]:
# Gets the shortest description by word count.
# idxmin() returns an index *label*, so the lookup goes through .loc.
df.loc[df.word_count.idxmin(), "description"]

In [None]:
df["word_count"].corr(df["points"])

## What are the most common words used to describe wines?

In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer

# Count how often each vocabulary term occurs in each description.
vectorizer = CountVectorizer()
document_term_matrix = vectorizer.fit_transform(df["description"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    vocabulary = vectorizer.get_feature_names_out()
else:
    vocabulary = vectorizer.get_feature_names()

# Keep the matrix sparse: one column per term, one row per description.
term_df = pd.DataFrame.sparse.from_spmatrix(document_term_matrix, columns=vocabulary)
term_df

In [None]:
term_df.sum().sort_values(ascending=False).head(80)

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html
with pd.option_context("display.max_rows", 100):
    # display() is what Jupyter uses to show an item when it is the last expression in a cell.
    display(term_df.sum().sort_values(ascending=False).head(80))

In [None]:
# Another common option is setting max_colwidth to see the complete contents.
# None means "no limit"; the old -1 sentinel was deprecated in pandas 1.0.
with pd.option_context("display.max_colwidth", None):
    display(df["description"].tail(10))

## How many tasters are there?

In [None]:
len(df.taster_name.unique())

In [None]:
df.taster_name.value_counts()

In [None]:
# Need to be careful of dropna across several different methods, like value_counts.
df.taster_name.value_counts(dropna=False)

In [None]:
# Setting a value with a row and column indexer.
df.loc[df.taster_name.isna(), "taster_name"] = "Unknown"

In [None]:
df.taster_name.value_counts()

## What are the top 20 wineries by number of wines? How do their points compare?

In [None]:
df.winery.isna().any()

In [None]:
# Get an index made up from the twenty most productive wineries (discard the value counts).
top20 = df.winery.value_counts()[:20].index
top20

In [None]:
# Method chaining is considered "good style" in pandas.
(df.loc[df.winery.isin(top20)]
   .groupby("winery")
   .points
   .quantile(0.95)
   .sort_values(ascending=False))

In [None]:
boxplot_sorted(df[df.winery.isin(top20)], by="winery", column="points");

## Are some tasters pickier than others?

In [None]:
boxplot_sorted(df, by="taster_name", column="points");

In [None]:
def get_favorite_wines(name: str) -> pd.Series:
    """Return the five varieties that taster ``name`` scores highest.

    Varieties are ranked by the 95th percentile of the points the taster
    gave them, highest first. Reads the notebook-level ``df``.
    """
    reviews_by_taster = df.loc[df.taster_name == name]
    points_by_variety = reviews_by_taster.groupby("variety").points
    ranked = points_by_variety.quantile(0.95).sort_values(ascending=False)
    return ranked.head()

get_favorite_wines("Virginie Boone")

In [None]:
get_favorite_wines("Alexander Peartree")

In [None]:
get_favorite_wines("Fiona Adams")

## What are the ten best value wines?

In [None]:
# Can't compute value for wines without a price
df.price.isna().value_counts()

In [None]:
value_df = (df.dropna(subset=["price"])
               .assign(value=lambda x: x.points / x.price)
               .sort_values("value", ascending=False)
               [["title", "points", "price", "value"]])
value_df.head(15)

In [None]:
# Best-value wines among those scoring at least 90 points
# (value_df is already sorted by value, best first).
value_df.loc[value_df.points >= 90]

In [None]:
value_df.tail()

## What varieties were the most controversial among tasters?

In [None]:
# Varieties with the most variance in points
(df.groupby("variety")
   .points
   .var()
   .dropna()
   .sort_values()
   .tail(15)
   .plot
   .barh());

## Extract Year from Title

Year may be a predictive feature for points, i.e. a "good year" for wine.

In [None]:
def get_year_from_title(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with an integer ``year`` column parsed from ``title``.

    The year is the first standalone four-digit number starting with 19 or 20
    found in each title. Titles without such a number get the (rounded)
    median of the years that were found.

    Raises:
        ValueError: if rows exist but no title contains a year, so there is
            no median to fill the gaps with.
    """
    # It is polite not to alter the input DataFrame
    result: pd.DataFrame = df.copy()

    # expand=False yields a Series for the single capture group instead of a
    # one-column DataFrame; to_numeric leaves NaN where no year was found.
    year: pd.Series = pd.to_numeric(
        result["title"].str.extract(r"\b((?:19|20)\d{2})\b", expand=False))

    missing = year.isna()
    if missing.any():
        if missing.all():
            raise ValueError("no year could be extracted from any title")
        # Replace NaNs with the median year (median() skips NaN by default)
        year = year.fillna(round(year.median()))

    result["year"] = year.astype(int)

    return result

In [None]:
df = df.pipe(get_year_from_title)
df

In [None]:
df.points.corr(df.year)

## Aside: `inplace=True`

Pandas maintainers say don't use it. They are [trying to deprecate it](https://github.com/pandas-dev/pandas/issues/16529).

![caption](images/inplace.png)

## Data Challenges
- what are the worst wines in France?
- what is the most reviewed variety during the 2010s?
- what is the highest rated variety?
- what region gets the most consistent reviews?



## Putting it all together

In [None]:
%%time

def read_wine_reviews(path: Path) -> pd.DataFrame:
    """Load the wine reviews at ``path`` and apply the notebook's clean-up steps.

    Drops duplicate rows, adds a ``word_count`` column computed from
    ``description``, fills missing ``taster_name`` values with "Unknown",
    and finally adds the ``year`` column via ``get_year_from_title``.
    """
    reviews = pd.read_csv(path).drop_duplicates()

    # Both derived columns are computed from the de-duplicated frame.
    reviews = reviews.assign(
        word_count=reviews.description.str.split().apply(len),
        taster_name=reviews.taster_name.fillna("Unknown"),
    )

    return get_year_from_title(reviews)

df = read_wine_reviews(wine_reviews)
df

# A very simple sentiment analysis model

We will create a binary classifier that predicts whether a wine is good or not based on the text of the review.

In [None]:
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.compose import make_column_transformer

from workshop import show_most_informative_features

def evaluate(model, X, y):
    """Print the accuracy and classification report of ``model`` on ``X``.

    ``model`` must already be fitted; its predictions for ``X`` are compared
    against the true labels ``y``.
    """
    y_pred = model.predict(X)

    print(f"\nAccuracy: {metrics.accuracy_score(y, y_pred):0.4f}\n")

    report = metrics.classification_report(y, y_pred)
    print(report)

In [None]:
from sklearn.model_selection import train_test_split

# Keep the demo simple by using a threshold for quality.
threshold = df.points.quantile(0.75)

train_df, test_df, train_labels, test_labels = train_test_split(
    df.drop(columns=["price", "points"]),  # Drop columns from the input data that would be "cheating"
    df.points >= threshold,
    random_state=3) # Fix the random split for reproducibility.

In [None]:
%%time

from sklearn.naive_bayes import MultinomialNB

count_model = make_pipeline(CountVectorizer(), MultinomialNB())

count_model.fit(train_df.description, train_labels)

evaluate(count_model, test_df.description, test_labels)

In [None]:
%%time

tf_idf_model = make_pipeline(
            make_column_transformer(
                (TfidfVectorizer(ngram_range=(1,3), max_df=0.98), "description")),
            SGDClassifier(n_jobs=-1, loss="modified_huber"))

tf_idf_model.fit(train_df, train_labels)

evaluate(tf_idf_model, test_df, test_labels)

In [None]:
show_most_informative_features(tf_idf_model)

In [None]:
%%time

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"))

model = make_pipeline(
            make_column_transformer(
                (TfidfVectorizer(ngram_range=(1,3), max_df=0.98), "description"),
                (categorical_transformer, ["country", "winery"]),
                (make_pipeline(
                    SimpleImputer(strategy="median"),
                    StandardScaler()), ["year"])),
            SGDClassifier(n_jobs=-1, loss="modified_huber"))

model.fit(train_df, train_labels)

predicted = model.predict(test_df)

evaluate(model, test_df, test_labels)

In [None]:
show_most_informative_features(model)

In [None]:
def get_failures(model, X, y, cv=3):
    """Cross-validate ``model`` and return the rows of ``X`` it got wrong.

    Prints the confusion matrix of the cross-validated predictions, then
    returns a ``(false_negatives, false_positives)`` pair of subsets of ``X``.
    """
    cv_predictions = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

    print("Confusion matrix (actual x prediction):")
    print(metrics.confusion_matrix(y, cv_predictions))

    # Actually positive but predicted negative.
    missed = (y == True) & (cv_predictions == False)
    # Actually negative but predicted positive.
    false_alarms = (y == False) & (cv_predictions == True)

    return X[missed], X[false_alarms]

In [None]:
# get_failures returns (false negatives, false positives), in that order.
fn, fp = get_failures(model, train_df, train_labels)