In [39]:
import os
import pandas as pd
import numpy as np
from tabulate import tabulate

### Defining functions for meta data summary stats

In [40]:
def _replace_empty_strings_with_nan(meta_data_df):
    """
    Given a dataframe, replace all empty strings with np.nan
    """
    return meta_data_df.replace(r'^\s*$', np.NaN, regex=True)


def _main_cat_uniques(meta_data_df):
    """
    Given the meta data dataframe, return count(unique main categories), unique values, count(null)
    """
    _value_counts = meta_data_df['main_cat'].value_counts()
    _nulls = np.count_nonzero(pd.isna(meta_data['main_cat']))
    return len(_value_counts), _value_counts.index.values, _nulls


def _brands_uniques(mata_data_df):
    """
    Given the meta data dataframe, return count(unique brands), count(null)
    """
    _value_counts = meta_data_df['brand'].nunique()
    _nulls = np.count_nonzero(pd.isna(meta_data_df['brand']))
    return _value_counts, _nulls


def _category_enumerate_stats(meta_data_df):
    """
    Given the meta data dataframe, make individual columns based on the heirarchy in the 
    category feild, and report stats on that.
    Returns level0_uniques, level0_non_null_count, level1_uniques, level1_non_null_count, level2_uniques, level2_non_null_count
    """

def get_meta_data_stats(meta_folder_path, output_path=None, file_names=None):
        try:
            if not file_names:
                file_names = os.listdir(meta_folder_path)

            _result = list()
            for file_name in file_names:
                _temp_dict = dict()
                _file_path = '/'.join([meta_folder_path, file_name])
                data = list()
                with open(_file_path) as fp:
                    for l in fp:
                        data.append(json.loads(l.strip()))
                
                meta_data_df = pd.DataFrame(data)
                meta_data_df = _replace_empty_strings_with_nan(meta_data_df)
                
                _temp_dict['main_cat_unique'], _temp_dict['main_cat_unique_values'], _temp_dict['main_cat_nulls'] = _main_cat_uniques(meta_data_df)
                _temp_dict['brands_unique'], _temp_dict['brands_nulls'] = _brands_uniques(meta_data_df)
                _temp_dict['total_products'] = len(meta_data_df)
                _temp_dict['unique_products'] = meta_data_df['asin'].nunique()
                _temp_dict['products_with_title'] = np.count_nonzero(~pd.isna(meta_data_df['title']))
                _temp_dict['dataset'] = file_name
                _temp_dict['category_nulls_when_main_cat_present'] = np.count_nonzero(~pd)
                _result.append(_temp_dict)
            result_df = pd.DataFrame(_result)
            if output_path:
                result_df.to_csv(output_path)
            else:
                print(tabulate(result_df, headers='keys', tablefmt='psql'))
        except Exception as e:
            print(Exception)
        

In [46]:
k = pd.DataFrame([[1,np.nan,' s'],[1,1,1]], columns=['a','b','c'])
k

Unnamed: 0,a,b,c
0,1,,s
1,1,1.0,1


In [50]:
replace_empty_strings_with_nan(k).iloc[0,0]

1

In [51]:
import json
import pandas as pd
import numpy as np

meta_file_path = "/home/bigdata/Desktop/projects/Rec_Sys/file_server/dataset/_magazine_dataset/meta_Magazine_Subscriptions.json"
ratings_file_path = "/home/bigdata/Desktop/projects/Rec_Sys/file_server/dataset/_magazine_dataset/Magazine_Subscriptions.csv"

data = list()

with open(meta_file_path) as fp:
    for l in fp:
        data.append(json.loads(l.strip()))

meta_data = pd.DataFrame(data)
ratings = pd.read_csv(ratings_file_path, names = ['asin','u_id','rating'])

In [10]:
ratings.head()

Unnamed: 0,asin,u_id,rating
0,B00005N7P0,AH2IFH762VY5U,5.0
1,B00005N7P0,AOSFI0JEYU4XM,5.0
2,B00005N7OJ,A3JPFWKS83R49V,3.0
3,B00005N7OJ,A19FKU6JZQ2ECJ,5.0
4,B00005N7P0,A25MDGOMZ2GALN,5.0


In [28]:
ratings.head().to_csv("papa.csv")

In [76]:
meta_data.tail()

Unnamed: 0,category,description,also_buy,image,brand,also_view,details,main_cat,asin,rank,title
3488,"[Magazine Subscriptions, Sports, Recreation & ...","[Built for the true classic car enthusiast, ea...","[B002PXVYO6, B01FIR6AJ4, B00006KGT1, B01DPP7H5...",[https://images-na.ssl-images-amazon.com/image...,"Motorsport Marketing, Inc.","[B01FV4YCNA, B00006KGT1, B002PXVYO6, B0047VIAI...","\n\n\n\n\n <div class=""disclaim"">Subscripti...",Magazine Subscriptions,B01HI8V1I6,,
3489,"[Magazine Subscriptions, Fashion & Style]",[InStyle celebrates the private side of public...,,,Meredith Corporation,,"\n\n\n\n\n <div class=""disclaim"">Subscripti...",Magazine Subscriptions,B01HI8V0ZK,,
3490,,[Sports Illustrated brings you spectacular act...,,,Meredith Corporation,,"\n\n\n\n\n <div class=""disclaim"">Subscripti...",Magazine Subscriptions,B01HI8V1C2,,
3491,,[Get fresh ideas and inspiration on how to mak...,,,Meredith Corporation,,"\n\n\n\n\n <div class=""disclaim"">Subscripti...",Magazine Subscriptions,B01HI8V1MC,,
3492,"[Magazine Subscriptions, Home & Garden, Antiqu...",,"[B075HJBL8L, B01N184C5W, B01912KIV8, B0098LC1L...",,Greybird Publishers LLC,"[B00HRF3A0S, B01912KIV8, B075HJBL8L, B01N184C5...","\n <div class=""content"">\n\n\n\n\n\n\n\n<ul>\...",Buy a Kindle,B01HIZSSQM,"4,696,642PaidinKindleStore(",var aPageStart = (new Date()).getTime();\nvar ...


### Check how many main category values are there

In [52]:
meta_data['main_cat'].value_counts()

Magazine Subscriptions    3386
Buy a Kindle               107
Name: main_cat, dtype: int64

### Check how many 'category' values are nan

In [53]:
np.count_nonzero(~pd.isna(meta_data['category']))

2991

### Takeaway, top level category is captured in 'main_cat' and 'category' captures other sub categories. So we separate out cases where 'category' field is non empty and enumerate the subcategories as columns and finally concat with cases where category field was empty (ie they only have a top level category given in 'main_cat') 
* preserve index for concat
* rename sub_cats according to level

In [54]:
meta_data_with_sub_cat = meta_data[~pd.isna(meta_data['category'])]
meta_data_without_sub_cat = meta_data[pd.isna(meta_data['category'])]

meta_data_with_sub_cat.info(), meta_data_without_sub_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2991 entries, 0 to 3492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     2991 non-null   object
 1   description  2545 non-null   object
 2   also_buy     1630 non-null   object
 3   image        1402 non-null   object
 4   brand        2825 non-null   object
 5   also_view    2100 non-null   object
 6   details      2991 non-null   object
 7   main_cat     2991 non-null   object
 8   asin         2991 non-null   object
 9   rank         1898 non-null   object
 10  title        262 non-null    object
dtypes: object(11)
memory usage: 280.4+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 502 entries, 83 to 3491
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     0 non-null      object
 1   description  366 non-null    object
 2   also_buy     78 non-null     object
 3   

(None, None)

In [55]:
additional_columns = pd.DataFrame(meta_data_with_sub_cat['category'].to_list())
additional_columns[~pd.isna(additional_columns[5])].sample(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
2465,Magazine Subscriptions,Religion & Spirituality,Christianity,Kindle (5th Generation),Kindle Keyboard,Kindle DX,Kindle (2nd Generation),Kindle (1st Generation),Kindle Paperwhite,Kindle Paperwhite (5th Generation),...,Fire HD 8,Fire,Fire Phone,(Version 4.1 or later),(Version 4.1 or later),(Version 3.0 or later),(Version 3.0 or later),(Version 2.9 or later),,
2679,Magazine Subscriptions,Lifestyle & Cultures,,,,,,,,,...,,,,,,,,,,
2780,Magazine Subscriptions,Lifestyle & Cultures,,,,,,,,,...,,,,,,,,,,
2903,Magazine Subscriptions,Fashion & Style,,,,,,,,,...,,,,,,,,,,
2522,Magazine Subscriptions,"Arts, Music & Photography",Graphic Design,Kindle (5th Generation),Kindle Keyboard,Kindle DX,Kindle (2nd Generation),Kindle (1st Generation),Kindle Paperwhite,Kindle Paperwhite (5th Generation),...,Fire HD 8,Fire,Fire Phone,(Version 4.1 or later),(Version 4.1 or later),(Version 3.0 or later),(Version 3.0 or later),(Version 2.9 or later),,
2775,Magazine Subscriptions,Entertainment & Pop Culture,,,,,,,,,...,,,,,,,,,,
2752,Magazine Subscriptions,Lifestyle & Cultures,,,,,,,,,...,,,,,,,,,,
2759,Magazine Subscriptions,Automotive & Motorcycles,,,,,,,,,...,,,,,,,,,,
2801,Magazine Subscriptions,"Arts, Music & Photography",,,,,,,,,...,,,,,,,,,,
2948,Magazine Subscriptions,Fashion & Style,Men,,,,,,,,...,,,,,,,,,,


In [69]:
np.isnan(np.array(additional_columns.iloc[2775,0]))

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [72]:
np.isnan(23)

False

In [83]:
np.count_nonzero(~pd.isna(meta_data['title']))

329