In [42]:
import firebase_kapp_api as f
import pandas as pd
import datetime
import time
import json
import math
from hashlib import md5

# JSON file holding the current data version; read by get_updated_minor_version()
# and rewritten by meta_data().
metadata_file = 'metadata.json'
# JavaScript file that create_local_db() writes the generated data to.
data_file = 'public/data.js'
# strftime pattern used by meta_data() for the 'updated_ts' value,
# e.g. 2018.12.30T23:34:38+0100.
DATETIME_FORMAT='%Y.%m.%dT%H:%M:%S%z'

In [91]:
# Maps a short category key to the name of its CSV file.
# The functions in this notebook prepend 'data/' to the file name when reading,
# so every file listed here is expected below the 'data' directory.
csv_files = {
    'babymat': 'babymat.csv',
    'bakeartikler': 'bakeartikler.csv',
    'bakevarer': 'bakevarer.csv',
    'div_matprodukter': 'div_matprodukter.csv',
    'drikke': 'drikke.csv',
    'fisk': 'fisk.csv',
    'frukt_gront': 'frukt_gront.csv',
    'helsekost': 'helsekost.csv',
    'is_dessert': 'is_dessert.csv',
    'meieri_ost': 'meieri_ost.csv',
    'palegg_frokost': 'palegg_frokost.csv',
    'snacks_godteri': 'snacks_godteri.csv',
}

In [73]:
def hashit(str):
    """
    Returns: The hexadecimal MD5 digest of the given text.
    The text is encoded with the default codec of str.encode() (UTF-8).
    """
    # NOTE: the parameter name shadows the built-in `str` inside this function;
    # it is kept because it is part of the function's interface.
    encoded = str.encode()
    digest = md5(encoded)
    return digest.hexdigest()

In [89]:
def enrich_records(df, timestamp):
    """
    Adds an extra column so search will be easier.
    You can use regex over a set of keys.

    Three columns are added to df (the frame is modified in place):
      updated_ts  - the given timestamp, same value on every row
      search      - the text columns listed below, concatenated without separator
      fingerprint - hashit() of the 'search' value

    Empty CSV cells (NaN) count as '' and non-string values are converted
    with str(). Without this a single missing cell turns the whole 'search'
    value into NaN, and hashit() then fails because a float has no encode().

    Input: df - DataFrame containing every column in search_columns.
           timestamp - value stored in 'updated_ts'.
    Returns: The same DataFrame.
    Raises: KeyError if one of the search columns is missing.
    """
    search_columns = [
        'category',
        'sub_category',
        'producer',
        'product_type',
        'product',
        'kosher_type',
        'kosher_stamp',
        'comment',
    ]

    def as_text(value):
        # Strings pass through unchanged; missing values become ''.
        return '' if pd.isna(value) else str(value)

    df['updated_ts'] = timestamp

    # Same column order as before, so fingerprints of complete rows do not change.
    search = df[search_columns[0]].map(as_text)
    for column in search_columns[1:]:
        search = search + df[column].map(as_text)
    df['search'] = search
    df['fingerprint'] = df['search'].apply(hashit)

    return df

In [53]:
# Scratch cell: builds an empty Series and references its to_string method
# without calling it, so the cell only echoes the bound method object.
s = pd.Series()
s.to_string

In [20]:
def read_json(filename):
    """
    Parses the JSON document stored in the given file.
    Returns: The decoded value (a dict when the file holds a JSON object).
    """
    with open(filename, "r") as handle:
        return json.load(handle)

In [95]:
def save_file(filename, data):
    """
    Writes data, followed by a newline, to the given file.
    Any existing content of the file is replaced.

    Input: filename - path of the file to write.
           data - value to write; converted to text by print().
    """
    print('Saving', filename)
    # The context manager closes (and flushes) the file even if print() fails;
    # previously the handle was left for the garbage collector to close.
    # UTF-8 is stated explicitly so the output does not depend on the platform locale.
    with open(filename, 'w', encoding='utf-8') as out_file:
        print(data, file=out_file)

In [22]:
def get_updated_minor_version():
    """
    Reads the 'version' entry ("major.minor") from the metadata file and
    bumps the minor part by one.
    Returns: The new version as a string, e.g. "1.5" for a stored "1.4".
    """
    parts = read_json(metadata_file)['version'].split('.')
    bumped_minor = int(parts[1]) + 1
    return "{}.{}".format(parts[0], bumped_minor)

In [96]:
def meta_data():
    """
    Builds the metadata for a new data release and stores it in the metadata file.
    The version is the stored version with its minor part incremented and
    the timestamp is the current local time formatted with DATETIME_FORMAT.
    Returns: A dict with the keys 'version' and 'updated_ts'.
    """
    now = time.localtime()
    updated_ts = time.strftime(DATETIME_FORMAT, now)
    print('Updated', updated_ts)

    # The old version is read before the file is overwritten below.
    version = get_updated_minor_version()
    metadata = dict(version=version, updated_ts=updated_ts)
    save_file(metadata_file, json.dumps(metadata))
    return metadata

In [99]:
def create_local_db(csv_files):
    """
    Creates a JavaScript array of objects to be copied to the application.
    This is to use a local array instead of using the Firestore.

    Calling this also runs meta_data(), which bumps the version and rewrites
    the metadata file, and it overwrites data_file with the generated source.

    Input: csv_files - dict whose values are CSV file names below 'data/'.
    Returns: The generated JavaScript source as a string.
    """
    metadata = meta_data()

    # Read and enrich every file first and concatenate once; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows
    # on every iteration.
    frames = [
        enrich_records(read_data('data/' + csv_name), metadata['updated_ts'])
        for csv_name in csv_files.values()
    ]
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame so an empty csv_files still yields "records":[] as before.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    records_json = df.to_json(orient='records')
    data = (
        'const data = {'
        + '"metadata":' + json.dumps(metadata)
        + ', "records":' + records_json
        + '}'
    )
    save_file(data_file, data)
    return data
    
# Regenerate the metadata file and the data file; the trailing semicolon keeps
# the notebook from echoing the returned JavaScript string.
create_local_db(csv_files);

Updated 2018.12.30T23:34:38+0100
Saving metadata.json
------ Reading data/babymat.csv
------ Reading data/bakeartikler.csv
------ Reading data/bakevarer.csv
------ Reading data/div_matprodukter.csv
------ Reading data/drikke.csv
------ Reading data/fisk.csv
------ Reading data/frukt_gront.csv
------ Reading data/helsekost.csv
------ Reading data/is_dessert.csv
------ Reading data/meieri_ost.csv
------ Reading data/palegg_frokost.csv
------ Reading data/snacks_godteri.csv
Saving public/data.js


In [93]:
# Sanity checks on the CSV data: list rows that share producer+product,
# then count the rows per file.
# NOTE: both functions are defined in cells further down in this notebook;
# on a fresh kernel those cells have to be run before this one.
find_duplicates_all(csv_files)
count_products()

------ Reading data/babymat.csv
------ Reading data/bakeartikler.csv
------ Reading data/bakevarer.csv
------ Reading data/div_matprodukter.csv
------ Reading data/drikke.csv
------ Reading data/fisk.csv
------ Reading data/frukt_gront.csv
------ Reading data/helsekost.csv
------ Reading data/is_dessert.csv
------ Reading data/meieri_ost.csv
------ Reading data/palegg_frokost.csv
------ Reading data/snacks_godteri.csv
------ Finding Duplicates


Unnamed: 0,category,sub_category,producer,product_type,product,kosher_type,kosher_stamp,comment
1017,is & dessert,iskrem,Diplom-is,småis,Fruitero mango,p,-,-
1067,is & dessert,iskrem,Diplom-is,literis i butikk,Fruitero mango,m,-,-
1030,is & dessert,iskrem,Diplom-is,småis,Mini- pia jordbær,m,-,-
1072,is & dessert,iskrem,Diplom-is,literis i butikk,Mini- pia jordbær,m,-,-
1038,is & dessert,iskrem,Diplom-is,småis,Royal pistasj,m,-,-
1075,is & dessert,iskrem,Diplom-is,literis i butikk,Royal pistasj,m,-,-
995,is & dessert,dessert,Freia,-,Dessertsjokolade,m,-,-
1552,snacks & godteri,godteri og sjokolade,Freia,-,Dessertsjokolade,m,-,-
47,bakeartikler,-,Freia,-,Flerfarget strøssel,p,-,-
1555,snacks & godteri,godteri og sjokolade,Freia,-,Flerfarget strøssel,p,-,-


------ Count Products
babymat.csv: 20
bakeartikler.csv: 129
bakevarer.csv: 258
div_matprodukter.csv: 209
drikke.csv: 128
fisk.csv: 103
frukt_gront.csv: 23
helsekost.csv: 105
is_dessert.csv: 312
meieri_ost.csv: 193
palegg_frokost.csv: 58
snacks_godteri.csv: 281
---
All files: 1819


In [29]:
def read_data(csv_file):
    """
    Local
    Loads one CSV file and logs which file is being read.
    Returns: A Panda DataFrame.
    """
    print("------ Reading", csv_file)
    return pd.read_csv(csv_file)

In [28]:
def count_products(localOnly = True):
    """
    Firestore & Local
    Prints the number of rows in each CSV file listed in csv_files and
    their total. When localOnly is False the product count reported by
    the database is printed as well.
    """
    print("------ Count Products")
    total_rows = 0
    for file_name in csv_files.values():
        row_count = len(pd.read_csv('data/' + file_name))
        total_rows += row_count
        print("{}: {}".format(file_name, row_count))
    print("---")
    print("All files: {}".format(total_rows))
    if localOnly:
        return
    print("Database: {}".format(f.nofProducts()))

In [26]:
def find_duplicates_df(df):
    """
    Local
    Displays every row of the dataframe (local data from CSV files) whose
    producer+product combination occurs more than once, sorted by
    producer and product. All occurrences are shown, not only the repeats.
    """
    print('------ Finding Duplicates')
    key_columns = ['producer', 'product']
    is_duplicate = df.duplicated(subset=key_columns, keep=False)
    display(df.loc[is_duplicate].sort_values(by=key_columns))

In [27]:
def find_duplicates_all(csv_files):
    """
    Local
    Reads every CSV file in csv_files, combines them into one DataFrame
    and reports the duplicates across all files.
    See find_duplicates_df()

    Input: csv_files - dict whose values are CSV file names below 'data/'.
    """
    # Concatenate once instead of growing the frame inside the loop, which
    # re-copies all earlier rows on every iteration.
    frames = [read_data('data/' + csv_name) for csv_name in csv_files.values()]
    # pd.concat raises ValueError on an empty list; an empty frame keeps the
    # behaviour an empty csv_files had before.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    find_duplicates_df(df)

In [None]:
def insert_from_files(csv_files):
    """
    Firestore & local
    Reads every CSV file listed in csv_files.
    The insert step is commented out below, so nothing is written to Firestore.
    """
    for csv_name in csv_files.values():
        frame = read_data('data/' + csv_name)
        # insert_data(frame)

# insert_from_files(csv_files)
# count_products()

In [24]:
def insert_from_single_file(csv_file):
    """
    Firestore & local
    Reads a single CSV file, reports its duplicates and prints the product counts.
    The insert step is commented out below, so nothing is written to Firestore.

    Input: csv_file - CSV file name below 'data/'.
    """
    df = read_data('data/' + csv_file)
    # The duplicate check in this notebook is named find_duplicates_df;
    # the previous call to find_duplicates() raised a NameError.
    find_duplicates_df(df)
    # insert_data(df)
    count_products()
    
# insert_from_single_file(csv_files['frukt_gront'])

In [None]:
def insert_data(df):
    """
    Firestore
    Hands the given DataFrame to the Firebase helper's batch_create.
    Input: A DataFrame
    """
    print("------ Batch Insert Data")
    # Creating the products is delegated entirely to the Firebase helper module.
    f.batch_create(df)

In [None]:
# f.list_products(False)

In [None]:
# %%time
def get_docs():
    """
    Firestore
    Fetches the documents behind the products reference.
    Returns: A list with the to_dict() result of each document.
    """
    snapshots = f.products_ref().get()
    return [snapshot.to_dict() for snapshot in snapshots]

# p = get_docs()
# p

### Sandbox

In [None]:
# f.create({
#     u'category': u'REMOVE',
#     u'comment': u'',
#     u'kosher_stamp': u'',
#     u'kosher_type': u'p',
#     u'producer': u'Toro',
#     u'product': u'suppe',
#     u'sub_category': u'',
# })

In [None]:
# doc_ref1 = f.get_doc_ref(u'-_\n')
# f.pp(doc_ref1)

In [None]:
# doc_ref1 = f.get_doc_ref(u'J7Rh7IcdEZ3Sxnb0N1yO')
# doc_ref2 = f.delete(f.get_doc_ref(u'J7Rh7IcdEZ3Sxnb0N1yO'))
# doc_ref3 = f.get_doc_ref(u'213132')
# f.update(doc_ref3, {u'comment':'Bye'})

In [None]:
# doc_ref1 = f.get_doc_ref(u'J7Rh7IcdEZ3Sxnb0N1yO')
# doc_ref2 = f.delete(f.get_doc_ref(u'nTk2VlJoA9MOXxZuMrBs'))

# f.pp(doc_ref1)
# display(f.isDeleted(doc_ref1))
# display(f.exists(doc_ref1))