In [24]:
from nltk.corpus import brown
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from string import punctuation
from rake_nltk import Rake
import json
import nltk

nltk.download('brown')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

with open("en.json") as json_data:
    en_json = json.load(json_data)

stopwords_json_en = set(en_json)
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)
# Combine the stopwords. 
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct)

porter = PorterStemmer()
wnl = WordNetLemmatizer()

import numpy as np
from tqdm import tqdm
import pandas as pd
import os, json
import pandas.io.json as pd_json
pd.set_option('display.max_columns', None)

[nltk_data] Error loading brown: <urlopen error [Errno 61] Connection
[nltk_data]     refused>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 61] Connection refused>
[nltk_data] Error loading wordnet: <urlopen error [Errno 61]
[nltk_data]     Connection refused>


In [25]:
def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n'

def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(word_tokenize(text))]

def lemmatize_sent_with_rake(text):
    rake = Rake(stopwords=set.union(stopwords_json_en, stopwords_nltk_en),
                punctuations = stopwords_punct,language = 'English')
    rake.extract_keywords_from_text(text)
    key_words = rake.get_ranked_phrases()
    # pos_tag(key_words)
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(key_words)]

def to_count_vec(string):
    analysis = [word for word in lemmatize_sent(string) 
       if word not in stoplist_combined
       and not word.isdigit() ]
    return analysis

def to_count_vec_with_rake(string):
    analysis = [word for word in lemmatize_sent_with_rake(string) 
       if word not in stoplist_combined
       and not word.isdigit() ]
    return analysis

In [26]:
recipes = pd.read_csv('recipes.csv', sep=",", header=0)
recipes.drop('url',axis = 1, inplace = True)

In [27]:
def processing(file_name):    
    columns_list = ['name_1','name_2','name_3','name_4', 'alcohol', 'origins', 'description','allergens','ingredients','nutrition','salePrice','savePrice']
    file = open(file_name, 'r', encoding='utf-8')
    df = pd.DataFrame()
    for line in tqdm(file.readlines()):
        dic = json.loads(line)
        tmp = pd_json.json_normalize(dic)
        tmp.columns = tmp.columns.map(lambda x: x.split(".")[-1])

        cols = []
        count = 1
        for column in tmp.columns:
            if column == 'name':
                cols.append(f'name_{count}')
                count+=1
                continue
            cols.append(column)
        tmp.columns = cols
        tmp = tmp.loc[:,tmp.columns.isin(columns_list)]
        if tmp.shape[1] == len(columns_list):
            df = df.append(tmp, ignore_index = True,sort=False)
    return df


In [28]:
# browse = pd.read_json('browse.json', lines = True)
# df = df.loc[:,df.columns.isin(['name1', 'origins', 'description'])]

In [6]:
# browse_df = processing('browse.json')
# special_df = processing('specials.json')
# browse_df.to_csv('browse_df.csv', sep=',', header=True, index=True)
# special_df.to_csv('special_df.csv', sep=',', header=True, index=True)

In [7]:
browse_df = pd.read_csv('browse_df.csv',index_col = 0)
special_df = pd.read_csv('special_df.csv',index_col = 0)
browse_df.fillna(0, inplace = True)
special_df.fillna(0, inplace = True)

In [8]:
# df.iloc[0, df.columns.get_loc('COL_NAME')] = x


In [9]:
def nltk_processing(dataframe, columns):
    for column in columns:
        for index in range(dataframe.shape[0]):
            content = dataframe.iloc[index, dataframe.columns.get_loc(column)]
            if type(content) is not str:
                continue
            if (content != '' or content != None or content != 0):
                content = to_count_vec_with_rake(content)
            dataframe.iloc[index, dataframe.columns.get_loc(column)] = ''.join(content)
    return dataframe

In [10]:
df1 = nltk_processing(browse_df,["description"])
df2 = nltk_processing(special_df,["description"])
df3 = nltk_processing(recipes, ["ingredients","methods"])

In [11]:
def refine(df, columns, df_name):
    array = []
    for i in range(df.shape[0]):
        dic = {}
        for column in columns:
            if column == "name_1" or column == "description":
                if type(df.iloc[i][column]) is str:
                    dic[column] = df.iloc[i][column].split()
            else:
                dic[column] = df.iloc[i][column]
        array.append(dic)
    with open(f'{df_name}.json', 'w',encoding='utf-8') as f:
        json.dump(array, f)
    json_df = pd.read_json(f'{df_name}.json')
    return json_df

In [12]:
s_df = refine(special_df, ["name_1","description","name_2","name_3","name_4"],"special_df")
b_df = refine(browse_df, ["name_1","description","name_2","name_3","name_4"],"browse_df")

In [14]:
# pd.set_option('display.max_rows', None)

In [23]:
special_df.head(150)

Unnamed: 0,name_1,alcohol,origins,description,allergens,ingredients,nutrition,name_2,name_3,name_4,salePrice,savePrice
0,dine wet cat food cuts in gravy with turkey,0.0,0,premium quality ingredientsfeline friend pawin...,0,0,0,Pet,Cats,Wet Cat Food,1.39,0.0
1,whiskas kitten cat food with chicken in gravy,0.0,0,whiskas kitten wet cat food pouches12 months w...,0,0,0,Pet,Cats,Wet Cat Food,7.0,1.79
2,whiskas adult dry cat food meaty selection,0.0,[],delicious pockets make whiskas ® foodsoft meat...,[],"['Chicken By-product Meal', 'Ground Yellow Cor...","[{'servings': 'Serving/pack: Serving size: ',...",Pet,Cats,Dry Cat Food,23.0,3.99
3,dine wet cat food saucy morsels with ocean fish,0.0,0,premium quality ingredientsfeline friend pawin...,0,0,0,Pet,Cats,Wet Cat Food,1.39,0.0
4,fancy feast wet cat food tender beef & chicken...,0.0,0,zealand based pet care advisor todaydelectable...,0,0,0,Pet,Cats,Wet Cat Food,1.2,0.19
5,instore bakery croissants,0.0,0,0,0,0,0,Bakery,Baked In Store,Pastries & Scones,1.8,0.2
6,countdown lamb leg bone in,0.0,0,0,0,0,0,Meat & Seafood,Lamb,Roasting Joints,12.9,8.1
7,countdown prosciutto,0.0,['Packed in New Zealand from imported ingredie...,0,['Contains Milk.'],"['Pork', 'Salt', 'Dried Glucose Syrup', 'Dextr...",[{'servings': 'Serving/pack: 4 Serving size: 2...,Fridge & Deli,Deli Meats & Seafood,"Salami, Cured & Dried Meats",7.99,1.0
8,naturli vegan chick free plant based nibbles p...,0.0,0,0,0,0,0,Fridge & Deli,Vegan & Vegetarian,Sausages & Burgers,10.0,1.0
9,first light beef frying wagyu rib eye steak,0.0,['Made in New Zealand'],0,[],['Wagyu Beef'],0,Meat & Seafood,Beef,Steak,19.0,0.99


In [29]:
special_df = pd.read_csv('special_df.csv',index_col = 0)


In [45]:
special_df.iloc[30]['description']

'Classic combination of chunky prawns with ginger encased in a silky dumpling pastry.'