In [None]:
!pip3 install lxml

In [1]:
import json
import pandas as pd
import re
import numpy as np
from bs4 import BeautifulSoup

import logging

import time
import random


from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from collections import defaultdict

import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import itertools

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
with open('dataset.json') as f:
    data = json.load(f)
    
df = pd.DataFrame.from_dict(data, orient='columns')

In [3]:
df.head()

Unnamed: 0,title,date,body,categories,tags
0,"<h2 class=""entry-title"" itemprop=""headline""><a...","<span class=""posted-on""><a href=""http://quebec...","<div class=""entry-content"" itemprop=""text"">\n\...","<span class=""cat-links""><span class=""screen-re...","<span class=""tags-links""><span class=""screen-r..."
1,"<h2 class=""entry-title"" itemprop=""headline""><a...","<span class=""posted-on""><a href=""http://quebec...","<div class=""entry-content"" itemprop=""text"">\n\...","<span class=""cat-links""><span class=""screen-re...","<span class=""tags-links""><span class=""screen-r..."
2,"<h2 class=""entry-title"" itemprop=""headline""><a...","<span class=""posted-on""><a href=""http://quebec...","<div class=""entry-content"" itemprop=""text"">\n\...","<span class=""cat-links""><span class=""screen-re...","<span class=""tags-links""><span class=""screen-r..."
3,"<h2 class=""entry-title"" itemprop=""headline""><a...","<span class=""posted-on""><a href=""http://quebec...","<div class=""entry-content"" itemprop=""text"">\n\...","<span class=""cat-links""><span class=""screen-re...","<span class=""tags-links""><span class=""screen-r..."
4,"<h2 class=""entry-title"" itemprop=""headline""><a...","<span class=""posted-on""><a href=""http://quebec...","<div class=""entry-content"" itemprop=""text"">\n\...","<span class=""cat-links""><span class=""screen-re...","<span class=""tags-links""><span class=""screen-r..."


In [4]:
experts = ['André', 'Patrick', 'RV'] 

In [5]:
def isNan(x): return x != x

def get_expert_rating(x): 
    if isNan(x): return x
    
    name_rating = (x[0].split(' '))
    return np.nan if len(name_rating) < 2 else name_rating[1].split('%')[0]

def remove_empty(x):
    if isNan(x): return x
    return x[1:] if x[0] == "\xa0" else x

def get_expert_name(x):
    if isNan(x): return x
    if len(x) == 0: return np.nan
    
    lenght = len(x[0])
    return x[0].split(' ')[0] if lenght > 4 and lenght < 12 else np.nan

def remove_unnecessary_text(text):
    return np.nan if len(text) == 1 or len(text[0]) > 13 else text

def get_cleaned_body(x):
    body_list = x[0].split(' ')
    
    cleaned_body_list = []
    
    for i, y in enumerate(body_list):        
        if y in experts:
            cleaned_body_list.append(x)
        else:
            cleaned_body_list.append(x[1:])
    
    return remove_unnecessary_text(cleaned_body_list[0])

def get_body(raw_html):
    clean_text = BeautifulSoup(raw_html, "lxml").text.split('\n')
    return list(filter(None, clean_text))

def get_degree(x):
    degree = x[0].split('%')[0]
    try:
        degree = float(degree)
    except ValueError:
        degree = np.nan
        
    return degree

def clean_title(x):
    return BeautifulSoup(x, "lxml").a.text if x else np.nan

def clean_tags(x):
    return BeautifulSoup(x, "lxml").a.text if x else np.nan

def clean_date(x):
    return BeautifulSoup(x, "lxml").time.text

def clean_categories(x):
    x = cleanhtml(x)
    return x[0].replace('Categories ', '')

def cleanhtml(raw_html):
    clean_text = BeautifulSoup(raw_html, "lxml").text.split('\n')
    return list(filter(None, clean_text))

In [6]:
def clean_dataframe(df):
    df['categories'] = df['categories'].apply(lambda x: clean_categories(x))
    df['date'] = df['date'].apply(lambda x: clean_date(x))
    df['tags'] = df['tags'].apply(lambda x: clean_tags(x))
    df['title'] = df['title'].apply(lambda x: clean_title(x))
    df['body'] = df['body'].apply(lambda x: get_body(x))
    df['alcohol_degree'] = df['body'].apply(lambda x: get_degree(x))
    df['body'] = df['body'].apply(lambda x: x[1:])
    df['body'] = df['body'].apply(lambda x: get_cleaned_body(x))
    df['body'] = df['body'].apply(lambda i: remove_empty(i))

    df['expert_name'] = df['body'].apply(lambda x: get_expert_name(x))
    df['rating'] = df['body'].apply(lambda x: get_expert_rating(x))
    df['body'] = df['body'].apply(lambda x: x[1:] if not isNan(x) else x)
    df['description'] = df['body'].apply(lambda x: x[0] if not isNan(x) else x)
    df['body'] = df['body'].apply(lambda x: x[1:] if not isNan(x) else x)
    
    return df

In [7]:
df = clean_dataframe(df)

In [8]:
df_2 = df.copy()

In [9]:
df.drop('body', inplace=True, axis=1)

In [10]:
df = df[pd.notna(df.categories)]
df = df[pd.notna(df.description)]
df = df[df['categories'].isin([
    'Blends',
    'Highlands',
    'Islay',
    'Speyside',
    'Whiskey Américain',
    'Whiskey Irlandais',
    'Whisky Canadien',
    'Whisky Japonais'
])]
cats, descs = zip(*df[['categories', 'description']].values)

In [11]:
text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(
            tokenizer=lambda x: word_tokenize(x, language='french'),
            lowercase=True,
            # ngram_range=(1, 3),
            stop_words=stopwords.words('french')
        )),
        ('clf', LogisticRegression(solver='liblinear')),
    ])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(descs, cats, test_size=0.33, random_state=42)

In [13]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(stop_words=['au', 'aux', 'avec', 'ce', 'ces',
                                             'dans', 'de', 'des', 'du', 'elle',
                                             'en', 'et', 'eux', 'il', 'ils',
                                             'je', 'la', 'le', 'les', 'leur',
                                             'lui', 'ma', 'mais', 'me', 'même',
                                             'mes', 'moi', 'mon', 'ne', 'nos', ...],
                                 tokenizer=<function <lambda> at 0x7fb07a1218b0>)),
                ('clf', LogisticRegression(solver='liblinear'))])

In [16]:
# def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
#     """
#     This function prints and plots the confusion matrix.
#     Normalization can be applied by setting `normalize=True`.
#     """
#     if normalize:
#         cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#         print("Normalized confusion matrix")
#     else:
#         print('Confusion matrix, without normalization')

#     plt.figure(figsize=(20, 20))
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(classes))
#     plt.xticks(tick_marks, classes, rotation=90)
#     plt.yticks(tick_marks, classes)

#     fmt = '.2f' if normalize else 'd'
#     thresh = cm.max() / 2.
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#         plt.text(j, i, format(cm[i, j], fmt),
#                  horizontalalignment="center",
#                  color="white" if cm[i, j] > thresh else "black")

#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     plt.show()
    
def evaluate_baseline(model, X_test, y_test):
    logging.info("Predicting on test...")
    predicted = model.predict(X_test)
    acc = np.mean(predicted == y_test)
    logging.info("Accuracy on test: {}".format(acc))
    print(classification_report(y_test, predicted))
#     plot_confusion_matrix(confusion_matrix(y_test, predicted), sorted(set(y_test)))


    # Analyzing results here
    clf = model.steps[1][1]
    tf_idf_vectorizer = model.steps[0][1]
    feature_names = tf_idf_vectorizer.get_feature_names()
    for cls, coefs in zip(clf.classes_, clf.coef_):
        print("="*20)
        print(cls)
        sorted_coefs = coefs.argsort()

        topk_good_words = sorted_coefs[-10:][::-1]
        good_words = {feature_names[i] for i in topk_good_words}
        print("Top good words: {}".format(good_words))

        topk_bad_words = sorted_coefs[:10][::-1]
        bad_words = {feature_names[i] for i in topk_bad_words}
        print("Top bad words: {}".format(bad_words))
        print("=" * 20)


In [17]:
evaluate_baseline(text_clf, X_test, y_test)

                   precision    recall  f1-score   support

           Blends       0.00      0.00      0.00        66
        Highlands       0.67      0.02      0.04       102
            Islay       0.72      0.90      0.80       146
         Speyside       0.43      0.88      0.58       195
Whiskey Américain       0.66      0.78      0.72       144
Whiskey Irlandais       0.00      0.00      0.00        32
  Whisky Canadien       0.95      0.24      0.38        76
  Whisky Japonais       0.00      0.00      0.00        14

         accuracy                           0.56       775
        macro avg       0.43      0.35      0.31       775
     weighted avg       0.55      0.56      0.47       775

Blends
Top good words: {'fumée', 'blended', 'rapidement', 'grains', 'céréales', 'complexe', 'grain', 'miel', 'équilibre', 'blend'}
Top bad words: {'’', 'rouge', 'alcool', 'cerises', 'pomme', 'bourbon', 'rye', 'distillerie', 'bois', 'malt'}
Highlands
Top good words: {'rien', 'raisins', 'di

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
tf_idf_vectorizer = text_clf.steps[0][1]
tf_idf_vectorizer.get_feature_names()

['!',
 '#',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 ',',
 '-même',
 '.',
 '..',
 '/',
 '//',
 '///',
 '/volume',
 '0-3/4',
 '1',
 '1.',
 '1.14',
 '1.50',
 '10',
 '10,000',
 '10-12',
 '10-15',
 '10.76',
 '10/2013',
 '100',
 '10000',
 '101',
 '105',
 '106',
 '10eme',
 '10ml',
 '10yo',
 '11',
 '11.99',
 '12',
 '12,15',
 '12.99',
 '120',
 '125',
 '125th',
 '129.65',
 '12yo',
 '13',
 '130',
 '1300',
 '131',
 '1316e',
 '132',
 '135',
 '14',
 '1400',
 '14yo',
 '15',
 '15-20',
 '150',
 '150-200',
 '157',
 '15eme',
 '16',
 '160',
 '1600',
 '1608…',
 '16yo',
 '17',
 '17/0121',
 '170',
 '18',
 '180',
 '1824',
 '1824.',
 '18yo',
 '1904',
 '1968',
 '1975/1983/1988',
 '1977',
 '1979',
 '1981',
 '1984',
 '1985',
 '1990',
 '1991',
 '1992',
 '1997.',
 '1998',
 '1ere',
 '1h30',
 '1st',
 '2',
 '2-3',
 '20',
 '20-25',
 '200',
 '2000',
 '2000.',
 '2001',
 '2003',
 '2004.',
 '2005',
 '2007',
 '2009',
 '2009.',
 '200eme',
 '200th',
 '2010',
 '2011',
 '2011.',
 '2013',
 '2014.',
 '2015',
 '2015.',
 '2016',
 '201

In [22]:
clf = text_clf.steps[1][1]

In [23]:
clf

LogisticRegression(solver='liblinear')