In [1]:
import pandas as pd
import json
# Base name of the raw export; the text before the first underscore is reused
# as the prefix of both output files.
input_name = "warmingdrawers_raw"
output_name = input_name.split('_')[0] + "_extracted.csv"
output_name2 = input_name.split('_')[0] + "_clean.csv"

# Load the raw CSV (path is relative to the current working directory).
df_unclean = pd.read_csv(input_name + ".csv")

In [2]:
# Size of the raw frame: .shape is a (rows, columns) pair.
total_rows = df_unclean.shape[0]
total_columns = df_unclean.shape[1]
print("Total number of rows:", total_rows)
print("Total number of columns:", total_columns)

# Number of distinct values in the 'id' column (missing values are not counted).
total_unique_ids = df_unclean['id'].nunique()
print("Total number of unique 'id's:", total_unique_ids)


Total number of rows: 14
Total number of columns: 67
Total number of unique 'id's: 14


In [3]:
# Length of the 'ean' column, counting missing entries as well.
total_rows_ean = df_unclean['ean'].shape[0]
print("Total number of rows in 'ean' column:", total_rows_ean)


# Entries in 'ean' that pandas treats as missing.
missing_values_ean = df_unclean['ean'].isnull().sum()
print("Total number of missing values in 'ean' column:", missing_values_ean)

# Number of distinct non-missing 'ean' values. Stored under its own name so the
# unique-'id' count held in total_unique_ids is not overwritten by this cell.
total_unique_eans = df_unclean['ean'].nunique()
print("Total number of unique 'ean's:", total_unique_eans)

Total number of rows in 'ean' column: 14
Total number of missing values in 'ean' column: 10
Total number of unique 'ean's: 4


In [4]:
# Preview the first five rows of the raw data (head() defaults to n=5).
df_unclean.head()

Unnamed: 0,id,apiURLs,asins,brand,canonicalBrand,categories,colors,count,dateAdded,dateUpdated,...,taxonomyLevel6,taxonomyLevel7,taxonomyLevel8,taxonomyLevel9,upc,upce,upca,vin,websiteIDs,weight
0,BCBhtncBUbef_7uT2Ypd,,,discoverzone,,"Home Living,Home & Tools,Vacuum Accessories,Va...",,,2021-02-18T18:23:21Z,2022-06-08T17:59:13Z,...,,,,,,,,,,
1,yfW0w3YBPkqI1fUQeUxD,,,xfactory,,"Major Appliances,Washer & Dryer Accessories,El...",,,2021-01-02T15:25:53Z,2021-10-11T09:10:38Z,...,,,,,,,,,,
2,yFIZcHYBo6SyyXo09mtg,,,xfactory,,"Major Appliances,Washer & Dryer Accessories,El...",,,2020-12-17T09:48:38Z,2021-08-03T07:46:43Z,...,,,,,,,,,,
3,AalsO3UBUbef_7uT-NTg,,,,,"Cooktops,Appliances,Ranges, Cooktops & Ovens",,,2020-10-18T11:16:38Z,2021-06-26T00:14:16Z,...,,,,,,,,,,
4,3bqDYHUBUbef_7uTQ4cD,,,,,"Wall Ovens,Double Wall Ovens,Appliances,Ranges...",,,2020-10-25T16:06:56Z,2021-05-11T19:43:11Z,...,,,,,883049100000.0,,883049100000.0,,,


In [5]:
# List every column label present in the raw data.
df_unclean.columns

Index(['id', 'apiURLs', 'asins', 'brand', 'canonicalBrand', 'categories',
       'colors', 'count', 'dateAdded', 'dateUpdated', 'descriptions',
       'dimension', 'domains', 'ean', 'ean8', 'ean13', 'features',
       'financingAndLeasing', 'flavors', 'gtins', 'imageURLs', 'isbn', 'keys',
       'manufacturer', 'manufacturerNumber', 'merchants',
       'mostRecentPriceAmount', 'mostRecentPriceNonSalesAmount',
       'mostRecentPriceAvailability', 'mostRecentPriceCurrency',
       'mostRecentPriceColor', 'mostRecentPriceSize',
       'mostRecentPriceCondition', 'mostRecentPriceIsSale',
       'mostRecentPriceDomain', 'mostRecentPriceSourceURL',
       'mostRecentPriceDate', 'mostRecentPriceFirstDateSeen',
       'mostRecentPriceByDomain', 'name', 'prices', 'primaryCategories',
       'primaryImageURLs', 'quantities', 'reviews', 'sdsURLs',
       'secondaryCategories', 'sizes', 'skus', 'sourceURLs', 'stockNum',
       'taxonomy', 'taxonomyLevel1', 'taxonomyLevel2', 'taxonomyLevel3',
    

In [6]:
# Keep only the identifier columns and the JSON-encoded 'features' column.
data = df_unclean[['id' , 'ean' , 'manufacturerNumber' , 'upc' , 'features']]
# Take an explicit copy: pd.DataFrame(data) does not copy DataFrame input by
# default, so it gives no guarantee that df is independent of `data`. A later
# cell adds a column to df, which should only ever affect df itself.
df = data.copy()


In [7]:
# Preview the first five rows of the selected subset.
df.head()

Unnamed: 0,id,ean,manufacturerNumber,upc,features
0,BCBhtncBUbef_7uT2Ypd,,TX381833136802,,
1,yfW0w3YBPkqI1fUQeUxD,,FX381833136802,,
2,yFIZcHYBo6SyyXo09mtg,,AT381833136802,,
3,AalsO3UBUbef_7uT-NTg,,,,
4,3bqDYHUBUbef_7uTQ4cD,883049100000.0,,883049100000.0,"[{""key"":""Cutout Depth (in.)"",""value"":[""24""]},{..."


In [8]:
import json
import numpy as np
# Function to extract keys from 'features' column
def extract_keys(row):
    """Return the 'key' of every entry in a row's JSON-encoded 'features' cell.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide a ``'features'`` entry. When that entry is a string it is
        parsed as JSON and is expected to hold a list of objects that each
        carry a ``'key'`` field.

    Returns
    -------
    list
        The ``'key'`` values in their original order, or an empty list when
        the cell is not a string, is not valid JSON, or does not have the
        expected list-of-objects shape.

    Raises
    ------
    KeyError
        If ``row`` itself has no ``'features'`` entry. That is a schema problem
        with the input rather than a malformed cell, so it is not swallowed.
    """
    # Looked up outside the try block so a missing column still surfaces.
    features = row['features']

    # Non-string cells (such as the NaN pandas uses for an empty cell) hold
    # nothing to parse.
    if not isinstance(features, str):
        return []

    try:
        return [item['key'] for item in json.loads(features)]
    except (json.JSONDecodeError, TypeError, KeyError):
        # JSONDecodeError: the text is not valid JSON.
        # TypeError: the parsed value is not iterable, or an entry is not
        #            subscriptable by a string.
        # KeyError: an entry has no 'key' field. Treated like any other
        #           malformed cell so one bad row cannot abort the whole apply.
        return []

# Run extract_keys once per row (axis='columns' hands each row to the function)
# and store the resulting lists in a new column.
df['list_of_keys_from_features'] = df.apply(extract_keys, axis='columns')

df.head()


Unnamed: 0,id,ean,manufacturerNumber,upc,features,list_of_keys_from_features
0,BCBhtncBUbef_7uT2Ypd,,TX381833136802,,,[]
1,yfW0w3YBPkqI1fUQeUxD,,FX381833136802,,,[]
2,yFIZcHYBo6SyyXo09mtg,,AT381833136802,,,[]
3,AalsO3UBUbef_7uT-NTg,,,,,[]
4,3bqDYHUBUbef_7uTQ4cD,883049100000.0,,883049100000.0,"[{""key"":""Cutout Depth (in.)"",""value"":[""24""]},{...",[]


In [9]:
# Show the dtype pandas inferred for each column of the subset.
# NOTE(review): the output below reports 'ean' and 'upc' as float64, so these
# identifier codes are being handled as numbers — confirm whether they should
# be read as strings instead.
df.dtypes

id                             object
ean                           float64
manufacturerNumber             object
upc                           float64
features                       object
list_of_keys_from_features     object
dtype: object

In [10]:
# Write the subset, still including rows whose key list is empty, to the
# "<prefix>_extracted.csv" path held in output_name; the row index is omitted.
df.to_csv(output_name , index=False)

In [11]:
# Dimensions of the subset before any rows are dropped.
total_rows = len(df.index)
total_columns = len(df.columns)
print("Total number of rows:", total_rows)
print("Total number of columns:", total_columns)

Total number of rows: 14
Total number of columns: 6


In [12]:
# Keep only the rows whose extracted key list holds at least one key.
df = df.loc[df['list_of_keys_from_features'].apply(len).gt(0)]

# Report the frame's dimensions again now that the empty rows are gone.
total_rows_after_removal = df.shape[0]
total_columns_after_removal = df.shape[1]
print("Total number of rows after removal:", total_rows_after_removal)
print("Total number of columns after removal:", total_columns_after_removal)

Total number of rows after removal: 3
Total number of columns after removal: 6


In [13]:
# Write the filtered frame (only rows with at least one extracted key) to the
# "<prefix>_clean.csv" path held in output_name2; the row index is omitted.
df.to_csv(output_name2, index=False)