In [25]:
import numpy as np
import pandas as pd
import re, string

from contractions_re import * #for expandContractions(text)
from nltk.tokenize import word_tokenize, MWETokenizer
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

from langdetect import detect
from textblob import TextBlob

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.cluster import KMeans

from joblib import dump, load
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
%matplotlib inline

# --- Stopwords
# Every single printable ASCII character (digits, letters, punctuation,
# whitespace) as its own one-character stopword.
PRINTABLE = list(string.printable)
# Generic English stopwords: apostrophe contractions first ("ain't" ... "you've"),
# then common function words ("a" ... "yourselves").
# NOTE(review): the name looks like a typo of GENERIC; it is kept as-is because
# COOKBOOK_STOPWORDS below is built from it.
# NOTE(review): "above" and "the" each appear twice, and "fify" / "amoungst"
# look like misspellings (presumably of "fifty" / "amongst") -- confirm before editing.
GENREIC = ["ain't", "aren't", "can't", "can't've", "'cause", "could've", "couldn't", "couldn't've", "didn't", "doesn't", "don't", "hadn't", "hadn't've", "hasn't", "haven't", "he'd", "he'd've", "he'll", "he'll've", "he's", "how'd", "how'd'y", "how'll", "how's", "i'd", "i'd've", "i'll", "i'll've", "i'm", "i've", "isn't", "it'd", "it'd've", "it'll", "it'll've", "it's", "let's", "ma'am", "mayn't", "might've", "mightn't", "mightn't've", "must've", "mustn't", "mustn't've", "needn't", "needn't've", "o'clock", "oughtn't", "oughtn't've", "shan't", "sha'n't", "shan't've", "she'd", "she'd've", "she'll", "she'll've", "she's", "should've", "shouldn't", "shouldn't've", "so've", "so's", "that'd", "that'd've", "that's", "there'd", "there'd've", "there's", "they'd", "they'd've", "they'll", "they'll've", "they're", "they've", "to've", "wasn't", "we'd", "we'd've", "we'll", "we'll've", "we're", "we've", "weren't", "what'll", "what'll've", "what're", "what's", "what've", "when's", "when've", "where'd", "where's", "where've", "who'll", "who'll've", "who's", "who've", "why's", "why've", "will've", "won't", "won't've", "would've", "wouldn't", "wouldn't've", "y'all", "y'all'd", "y'all'd've", "y'all're", "y'all've", "you'd", "you'd've", "you'll", "you'll've", "you're", "you've", "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", 
"empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", 
"whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the"]
# Words that are too common in this cookbook corpus to be informative.
SUBJECT_RELATED = [
    'cookbook', 'cookbooks', 'book', 'books', 'recipes', 'like',
    'copyright', 'information', 'cook', 'home', 'best', 'food',
]

# Leftover fragments treated as noise.
# NOTE(review): presumably URL / encoding residue ('.com', '\xa0') -- confirm.
MISC_STOPWORDS = ['com', 'xa']

# Truncated / suffix-stripped variants of the generic stopwords plus contraction
# fragments ("'ll", "n't", ...).
# NOTE(review): these look like stemmer and tokenizer outputs, presumably so the
# list still matches after stemming -- confirm against where the list is consumed.
ADD_STOPWORDS = [
    "'caus", "'d", "'ll", "'m", "'re", "'s", "'ve", 'abov', 'afterward', 'ai',
    'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 'anywher', 'becam', 'becaus',
    'becom', 'befor', 'besid', 'ca', 'cri', 'describ', 'did', 'doe', 'dure', 'els',
    'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 'fifi', 'formerli', 'forti', 'ha',
    'henc', 'hereaft', 'herebi', 'hi', 'howev', 'hundr', 'inde', 'inform', 'latterli', 'let',
    'mani', 'meanwhil', 'moreov', 'mostli', "n't", 'need', 'nobodi', 'noon', 'noth', 'nowher',
    'onc', 'onli', 'otherwis', 'ought', 'ourselv', 'perhap', 'pleas', 'recip', 'seriou', 'sever',
    'sha', 'sinc', 'sincer', 'sixti', 'someon', 'someth', 'sometim', 'somewher', 'themselv', 'thenc',
    'thereaft', 'therebi', 'therefor', 'thi', 'thu', 'togeth', 'twelv', 'twenti', 'veri', 'wa',
    'whatev', 'whenc', 'whenev', 'wherea', 'whereaft', 'wherebi', 'wherev', 'whi', 'wo', 'yourselv',
]
# Master stopword list: the concatenation, in this order, of every group above.
# Kept as a plain list (duplicates included) so it can be passed anywhere a
# list of stopwords is expected.
COOKBOOK_STOPWORDS = [
    *PRINTABLE,
    *GENREIC,
    *SUBJECT_RELATED,
    *MISC_STOPWORDS,
    *ADD_STOPWORDS,
]

# Multi-word expressions that must survive tokenization as a single token.
# Each phrase is split on whitespace into the tuple of words MWETokenizer expects,
# e.g. "cast iron" -> ('cast', 'iron').
cookbook_mwe = MWETokenizer([
    tuple(phrase.split())
    for phrase in (
        'low carb',
        'fat loss',
        'fat free',
        'gluten free',
        'sugar free',
        'low fat',
        'meat less',
        'instant pot',
        'slow cooker',
        'mini bar',
        'dairy free',
        'for one',
        'cast iron',
        'new york',
        'san francisco',
        'los angeles',
    )
])

In [26]:
# Load the raw texts: the file is read without a header row, the first CSV
# column is used as the index and the data column is labelled "text".
pd_test = pd.read_csv(
    "../model/tempDF.csv",
    header=None,
    names=["text"],
    index_col=0,
)
pd_test

Unnamed: 0,text
0,France: The Beautiful Cookbook- Authentic Reci...
1,"Italy, The Beautiful Cookbook: Authentic Recip..."
2,"California, The Beautiful Cookbook: Authentic ..."
3,America: The Beautiful Cookbook
4,Mexico The Beautiful Cookbook: Authentic Recip...
...,...
44831,"Joy Fit Club: Cookbook, Diet Plan & Inspiration"
44832,Drinking in America: Our Secret History
44833,Entertaining with Vegetables: A Recipe Collect...
44834,Grit Guide to Cast Iron Cooking Magazine 2016


In [32]:
def get_wordnet_pos(tag):
    """
    Map a part-of-speech tag (as produced by ``pos_tag``, e.g. 'NN', 'JJ',
    'VBG', 'RB') to the WordNet POS constant expected by the lemmatizer.

    Only the leading letter of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if tag.startswith(prefix):
            return wn_pos
    # Unknown tag family: treat the word as a noun.
    return wordnet.NOUN

def tokenize_col(pd_series):
    """
    Normalize, tokenize and lemmatize every entry of a text Series.

    Each entry goes through, in order:
      1. lower-casing and contraction expansion (``expandContractions``),
      2. replacement of every run of digits with a space,
      3. replacement of every ASCII punctuation character with a space
         (and lower-casing again),
      4. word tokenization, then merging of the multi-word expressions
         registered in ``cookbook_mwe`` (e.g. ``cast_iron``),
      5. removal of tokens listed in ``COOKBOOK_STOPWORDS``,
      6. part-of-speech tagging with ``pos_tag``,
      7. conversion of each tag with ``get_wordnet_pos``,
      8. lemmatization with ``WordNetLemmatizer``.

    Parameters
    ----------
    pd_series : pandas.Series
        Raw text, one document per row. The series may be unnamed.
        Missing values (None / NaN) are treated as empty text and other
        non-string scalars are converted with ``str``.

    Returns
    -------
    pandas.Series
        Named ``'text_step8'`` and indexed like ``pd_series``; each value is
        the list of lemmatized tokens for that row (possibly empty).
    """
    # --- Helpers
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    digit_runs = re.compile(r'\d+')
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    # Set membership is O(1); the stopword list has several hundred entries and
    # is consulted once per token.
    stopword_set = frozenset(COOKBOOK_STOPWORDS)
    wnl = WordNetLemmatizer()

    def lemmatize_text(text):
        # Tolerate missing / non-string cells instead of failing on .lower().
        if not isinstance(text, str):
            missing = pd.api.types.is_scalar(text) and pd.isna(text)
            text = '' if missing else str(text)

        text = expandContractions(text.lower())
        text = digit_runs.sub(' ', text)
        # expandContractions is defined elsewhere; lower-case again in case it
        # reintroduces capital letters.
        text = punctuation.sub(' ', text.lower())

        tokens = cookbook_mwe.tokenize(word_tokenize(text))
        kept = " ".join(tok for tok in tokens if tok not in stopword_set)

        # Re-tokenize the filtered text before tagging, as the tagger is fed
        # the stopword-free sequence.
        tagged = pos_tag(word_tokenize(kept))
        return [wnl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]

    return pd_series.apply(lemmatize_text).rename('text_step8')
    
    
# Run the preprocessing pipeline on the raw "text" column and preview the result.
tokenized_cookbook = pd_test["text"].pipe(tokenize_col)
tokenized_cookbook

Unnamed: 0,text,text_step1,text_step2,text_step3,text_step4,text_step5,text_step6,text_step7,text_step8
0,France: The Beautiful Cookbook- Authentic Reci...,france: the beautiful cookbook- authentic reci...,france: the beautiful cookbook- authentic reci...,france the beautiful cookbook authentic reci...,france the beautiful cookbook authentic recipe...,france beautiful authentic regions france,"[(france, NN), (beautiful, JJ), (authentic, JJ...","[(france, n), (beautiful, a), (authentic, a), ...","[france, beautiful, authentic, region, france]"
1,"Italy, The Beautiful Cookbook: Authentic Recip...","italy, the beautiful cookbook: authentic recip...","italy, the beautiful cookbook: authentic recip...",italy the beautiful cookbook authentic recip...,italy the beautiful cookbook authentic recipes...,italy beautiful authentic regions italy,"[(italy, JJ), (beautiful, JJ), (authentic, JJ)...","[(italy, a), (beautiful, a), (authentic, a), (...","[italy, beautiful, authentic, region, italy]"
2,"California, The Beautiful Cookbook: Authentic ...","california, the beautiful cookbook: authentic ...","california, the beautiful cookbook: authentic ...",california the beautiful cookbook authentic ...,california the beautiful cookbook authentic re...,california beautiful authentic california,"[(california, NN), (beautiful, JJ), (authentic...","[(california, n), (beautiful, a), (authentic, ...","[california, beautiful, authentic, california]"
3,America: The Beautiful Cookbook,america: the beautiful cookbook,america: the beautiful cookbook,america the beautiful cookbook,america the beautiful cookbook,america beautiful,"[(america, RB), (beautiful, JJ)]","[(america, r), (beautiful, a)]","[america, beautiful]"
4,Mexico The Beautiful Cookbook: Authentic Recip...,mexico the beautiful cookbook: authentic recip...,mexico the beautiful cookbook: authentic recip...,mexico the beautiful cookbook authentic recip...,mexico the beautiful cookbook authentic recipe...,mexico beautiful authentic regions mexico,"[(mexico, NN), (beautiful, JJ), (authentic, JJ...","[(mexico, n), (beautiful, a), (authentic, a), ...","[mexico, beautiful, authentic, region, mexico]"
...,...,...,...,...,...,...,...,...,...
44831,"Joy Fit Club: Cookbook, Diet Plan & Inspiration","joy fit club: cookbook, diet plan & inspiration","joy fit club: cookbook, diet plan & inspiration",joy fit club cookbook diet plan inspiration,joy fit club cookbook diet plan inspiration,joy fit club diet plan inspiration,"[(joy, NN), (fit, NN), (club, NN), (diet, JJ),...","[(joy, n), (fit, n), (club, n), (diet, a), (pl...","[joy, fit, club, diet, plan, inspiration]"
44832,Drinking in America: Our Secret History,drinking in america: our secret history,drinking in america: our secret history,drinking in america our secret history,drinking in america our secret history,drinking america secret history,"[(drinking, NN), (america, NN), (secret, JJ), ...","[(drinking, n), (america, n), (secret, a), (hi...","[drinking, america, secret, history]"
44833,Entertaining with Vegetables: A Recipe Collect...,entertaining with vegetables: a recipe collect...,entertaining with vegetables: a recipe collect...,entertaining with vegetables a recipe collect...,entertaining with vegetables a recipe collecti...,entertaining vegetables recipe collection mode...,"[(entertaining, VBG), (vegetables, NNS), (reci...","[(entertaining, v), (vegetables, n), (recipe, ...","[entertain, vegetable, recipe, collection, mod..."
44834,Grit Guide to Cast Iron Cooking Magazine 2016,grit guide to cast iron cooking magazine 2016,grit guide to cast iron cooking magazine,grit guide to cast iron cooking magazine,grit guide to cast_iron cooking magazine,grit guide cast_iron cooking magazine,"[(grit, NN), (guide, NN), (cast_iron, NN), (co...","[(grit, n), (guide, n), (cast_iron, n), (cooki...","[grit, guide, cast_iron, cook, magazine]"


In [18]:
# Spot-check 20 random rows of the processed text. The seed is fixed so that
# re-running the notebook shows the same rows.
tokenized_cookbook.sample(20, random_state=42)

37405                                          onion lover
11         wine journal wine lover album cellaring tasting
21999    wafer paper cakes modern cake designs techniqu...
25265                 delicious dump cakes quick easy make
14863                                     taste loved pies
25523    super tuscan heritage simple pleasures kitchen...
11792                              vegetarian table france
44821                                                 life
21234    eat fish happy salmon halibut celebrate taste ...
23041    cooking krishna gluten_free vegan indian cuisi...
5289     andrew favorite soups healthy veggies bundle m...
17067                         brothers volume iii brothers
15404    fast natural cuisine complete guide easy veget...
10754                               cheesecakes appetizers
24960                                       slim man cooks
44408    saveur magazine march number morocco cuisine m...
4980                                         dorie cooki