In [1]:
import boto3
import pandas as pd
import json
import re
from io import StringIO
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# AWS Credentials (though not needed for this step, we'll keep them for future uploads)
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_REGION = os.getenv('AWS_REGION')
S3_BUCKET_NAME = os.getenv('S3_BUCKET_NAME')

# Function to read dataset from S3
def read_dataset_from_s3(bucket_name, file_key):
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=AWS_REGION
    )
    
    csv_obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    body = csv_obj['Body'].read().decode('utf-8')
    df = pd.read_csv(StringIO(body))
    return df

# Load dataset from S3
file_key = 'recipes_data/recipes_data.csv'  
df = read_dataset_from_s3(S3_BUCKET_NAME, file_key)

# Function to handle missing values
def handle_missing(value):
    if pd.isnull(value) or value == '' or value == 'NaN' or value == 'character(0)' or value == 'nan':
        return None
    return value

# Function to parse c("...") formatted lists and convert them to arrays
def parse_list_field(field_value):
    if isinstance(field_value, str):
        if field_value.startswith('c('):
            # Extract values within c("...") using a regular expression
            values = re.findall(r'"([^"]+)"', field_value)
            return [v.replace('\\"', '').strip() for v in values]  # Remove extra backslashes and quotes
        else:
            # Single value, wrap it in an array, removing any surrounding backslashes/quotes
            single_value = field_value.replace('\\"', '').replace('"', '').strip()  # Remove unnecessary slashes/quotes
            return [single_value]  # Ensure it's wrapped in quotes as a single item in the array
    return []

# Function to clean all columns
def clean_all_columns(df):
    # Apply cleaning for all columns
    for column in df.columns:
        # First, handle missing values for each column
        df[column] = df[column].apply(lambda x: handle_missing(x))
        
        # Then, for columns that might need list handling (like Images), we use the parse_list_field function
        df[column] = df[column].apply(lambda x: json.dumps(parse_list_field(x)) if isinstance(x, str) and ('c(' in x or 'http' in x) else x)
    
    return df

# Clean all columns
df_cleaned = clean_all_columns(df)

# Let's view the cleaned data (print the first few rows)
print(df_cleaned.head(50))

# Alternatively, display entire DataFrame to inspect
#pd.set_option('display.max_columns', None)  # Uncomment this to view all columns in the print statement
#pd.set_option('display.max_rows', None)     # Uncomment this to view all rows in the print statement

# Display the first 50 records of Name and Images columns
#for index, row in df_cleaned[['Name', 'Images']].head(50).iterrows():
    #print(f"Recipe: {row['Name']}")
    #print(f"Images: {row['Images']}")


  from pandas.core import (
  df = pd.read_csv(StringIO(body))


    RecipeId Barcode                                            Name  \
0       38.0    *38*               Low-Fat Berry Blue Frozen Dessert   
1       39.0    *39*                                         Biryani   
2       40.0    *40*                                   Best Lemonade   
3       41.0    *41*                  Carina's Tofu-Vegetable Kebabs   
4       42.0    *42*                                    Cabbage Soup   
5       43.0    *43*                            Best Blackbottom Pie   
6       44.0    *44*                          Warm Chicken A La King   
7       45.0    *45*      Buttermilk Pie With Gingersnap Crumb Crust   
8       46.0    *46*                         A Jad - Cucumber Pickle   
9       47.0    *47*                            Butter Pecan Cookies   
10      48.0    *48*                                Boston Cream Pie   
11      49.0    *49*                        Chicken Breasts Lombardi   
12      50.0    *50*                               Biscotti Di P

In [2]:
from IPython.display import Image, display

# Display the first 5 recipes with their images
for index, row in df_cleaned.head(20).iterrows():
    print(f"Recipe: {row['Name']}")
    
    # Try to display images if available
    images = json.loads(row['Images']) if row['Images'] else []
    if images:
        for image_url in images:
            if image_url.startswith('http'):
                display(Image(url=image_url, width=300))
    else:
        print("No image available")

Recipe: Low-Fat Berry Blue Frozen Dessert


Recipe: Biryani


Recipe: Best Lemonade


Recipe: Carina's Tofu-Vegetable Kebabs


Recipe: Cabbage Soup


Recipe: Best Blackbottom Pie
No image available
Recipe: Warm Chicken A La King


Recipe: Buttermilk Pie With Gingersnap Crumb Crust


Recipe: A Jad - Cucumber Pickle
No image available
Recipe: Butter Pecan Cookies


Recipe: Boston Cream Pie
No image available
Recipe: Chicken Breasts Lombardi


Recipe: Biscotti Di Prato


Recipe: Chai Tea
No image available
Recipe: Cafe Cappuccino


Recipe: Jimmy G's Carrot Cake
No image available
Recipe: Carrot Cake


Recipe: Betty Crocker's Southwestern Guacamole Dip


Recipe: Buttermilk Pie


Recipe: Black Bean Salsa
No image available


In [3]:
# Set display options to show all columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 50)       # Show first 50 rows

# Display the first 50 rows for all columns
df_cleaned.head(50)

Unnamed: 0,RecipeId,Barcode,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,ReviewCount,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38.0,*38*,Low-Fat Berry Blue Frozen Dessert,1533.0,Dancer,PT24H,PT45M,PT24H45M,1999-08-09T21:46:00Z,Make and share this Low-Fat Berry Blue Frozen ...,"[""https://img.sndimg.com/food/image/upload/w_5...",Frozen Desserts,"[""Dessert"", ""Low Protein"", ""Low Cholesterol"", ...","[""4"", ""1/4"", ""1"", ""1""]","[""blueberries"", ""granulated sugar"", ""vanilla y...",4.5,4.0,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"[""Toss 2 cups berries with sugar."", ""Let stand..."
1,39.0,*39*,Biryani,1567.0,elly9812,PT25M,PT4H,PT4H25M,1999-08-29T13:12:00Z,Make and share this Biryani recipe from Food.com.,"[""https://img.sndimg.com/food/image/upload/w_5...",Chicken Breast,"[""Chicken Thigh & Leg"", ""Chicken"", ""Poultry"", ...","[""1"", ""4"", ""2"", ""2"", ""8"", ""1/4"", ""8"", ""1/2"", ""...","[""saffron"", ""milk"", ""hot green chili peppers"",...",3.0,1.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,"[""Soak saffron in warm milk for 5 minutes and ..."
2,40.0,*40*,Best Lemonade,1566.0,Stephen Little,PT5M,PT30M,PT35M,1999-09-05T19:52:00Z,This is from one of my first Good House Keepi...,"[""https://img.sndimg.com/food/image/upload/w_5...",Beverages,"[""Low Protein"", ""Low Cholesterol"", ""Healthy"", ...","[""1 1/2"", ""1"", ""1 1/2"", ""3/4""]","[""sugar"", ""lemons, rind of"", ""lemon, zest of"",...",4.5,10.0,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"[""Into a 1 quart Jar with tight fitting lid, p..."
3,41.0,*41*,Carina's Tofu-Vegetable Kebabs,1586.0,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03T14:54:00Z,This dish is best prepared a day in advance to...,"[""https://img.sndimg.com/food/image/upload/w_5...",Soy/Tofu,"[""Beans"", ""Vegetable"", ""Low Cholesterol"", ""Wee...","[""12"", ""1"", ""2"", ""1"", ""10"", ""1"", ""3"", ""2"", ""2""...","[""extra firm tofu"", ""eggplant"", ""zucchini"", ""m...",4.5,2.0,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"[""Drain the tofu, carefully squeezing out exce..."
4,42.0,*42*,Cabbage Soup,1538.0,Duckie067,PT30M,PT20M,PT50M,1999-09-19T06:19:00Z,Make and share this Cabbage Soup recipe from F...,"[""https://img.sndimg.com/food/image/upload/w_5...",Vegetable,"[""Low Protein"", ""Vegan"", ""Low Cholesterol"", ""H...","[""46"", ""4"", ""1"", ""2"", ""1""]","[""plain tomato juice"", ""cabbage"", ""onion"", ""ca...",4.5,11.0,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"[""Mix everything together and bring to a boil...."
5,43.0,*43*,Best Blackbottom Pie,34879.0,Barefoot Beachcomber,PT2H,PT20M,PT2H20M,1999-08-21T10:35:00Z,Make and share this Best Blackbottom Pie recip...,,Pie,"[""Dessert"", ""Weeknight"", ""Stove Top"", ""< 4 Hou...","[""1 1/4"", ""1/4"", ""6"", ""1/3"", ""1/4"", ""1/4"", ""2""...","[""graham cracker crumbs"", ""sugar"", ""butter"", ""...",1.0,1.0,437.9,19.3,10.9,94.3,267.6,58.0,1.8,42.5,7.0,8.0,1 9-inch pie,"[""Graham Cracker Crust: In small bowl, combine..."
6,44.0,*44*,Warm Chicken A La King,1596.0,Joan Edington,PT3M,PT35M,PT38M,1999-09-17T04:47:00Z,I copied this one out of a friend's book so ma...,"[""https://img.sndimg.com/food/image/upload/w_5...",Chicken,"[""Poultry"", ""Meat"", ""< 60 Mins""]","[""12"", ""2"", ""3"", ""450"", ""1"", ""2"", ""1/4"", ""1"", ...","[""chicken"", ""butter"", ""flour"", ""milk"", ""celery...",5.0,23.0,895.5,66.8,31.9,405.8,557.2,29.1,3.1,5.0,45.3,2.0,,"[""Melt 1 1/2 ozs butter, add the flour and coo..."
7,45.0,*45*,Buttermilk Pie With Gingersnap Crumb Crust,1580.0,tristitia,PT50M,PT30M,PT1H20M,1999-08-06T00:40:00Z,Make and share this Buttermilk Pie With Ginger...,"[""https://img.sndimg.com/food/image/upload/w_5...",Pie,"[""Dessert"", ""Healthy"", ""Weeknight"", ""Oven"", ""<...","[""3/4"", ""1"", ""1"", ""2"", ""3"", ""1/4"", ""1"", ""1/2"",...","[""sugar"", ""margarine"", ""egg"", ""flour"", ""salt"",...",4.0,3.0,228.0,7.1,1.7,24.5,281.8,37.5,0.5,24.7,4.2,8.0,,"[""Preheat oven to 350\u00b0F."", ""Make pie crus..."
8,46.0,*46*,A Jad - Cucumber Pickle,1533.0,Dancer,,PT25M,PT25M,1999-08-11T19:48:00Z,Make and share this A Jad - Cucumber Pickle re...,,Vegetable,"[""Thai"", ""Asian"", ""Free Of..."", ""< 30 Mins""]","[""1/2"", ""5"", ""2"", ""1"", ""1"", ""1""]","[""rice vinegar"", ""haeo""]",5.0,2.0,4.3,0.0,0.0,0.0,0.7,1.1,0.2,0.2,0.1,,1 cup,"[""Slice the cucumber in four lengthwise, then ..."
9,47.0,*47*,Butter Pecan Cookies,1573.0,benluc,PT9M,PT55M,PT1H4M,1999-09-07T09:01:00Z,Make and share this Butter Pecan Cookies recip...,"[""https://img.sndimg.com/food/image/upload/w_5...",Dessert,"[""Cookie & Brownie"", ""Fruit"", ""Nuts"", ""Weeknig...","[""3/4"", ""1/2"", ""1"", ""1"", ""1"", ""2"", ""1""]","[""butter"", ""brown sugar"", ""granulated sugar"", ...",4.0,2.0,69.0,5.6,1.4,6.3,15.0,4.5,0.6,1.6,0.8,,84 cookies,"[""Preheat oven to 350 degrees."", ""Cream butter..."


In [4]:
# Function to store cleaned dataset in S3
def store_cleaned_dataset_to_s3(df, bucket_name, file_key):
    # Convert DataFrame to CSV string
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    
    # Create S3 client
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=AWS_REGION
    )
    
    # Upload CSV to S3
    s3.put_object(Bucket=bucket_name, Key=file_key, Body=csv_buffer.getvalue())
    print(f"File successfully uploaded to s3://{bucket_name}/{file_key}")

# Define the S3 bucket and file key
bucket_name = 'recipe-dataset-sri'
file_key = 'recipes_data/cleaned_recipes_data.csv'

# Store the cleaned DataFrame to S3
store_cleaned_dataset_to_s3(df_cleaned, bucket_name, file_key)


File uploaded successfully to recipe-dataset-sri/recipes_data/cleaned_recipes_data.csv


In [5]:
import boto3
import pandas as pd
import json
import re
from io import StringIO
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# AWS Credentials (though not needed for this step, we'll keep them for future uploads)
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_REGION = os.getenv('AWS_REGION')
S3_BUCKET_NAME = os.getenv('S3_BUCKET_NAME')

# Function to read dataset from S3
def read_dataset_from_s3(bucket_name, file_key):
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=AWS_REGION
    )
    
    csv_obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    body = csv_obj['Body'].read().decode('utf-8')
    df = pd.read_csv(StringIO(body))
    return df

# Load dataset from S3
file_key = 'recipes_data/recipes_data.csv'  # Assuming this is the key for your uploaded dataset
df = read_dataset_from_s3(S3_BUCKET_NAME, file_key)

# Function to handle missing values
def handle_missing(value):
    if pd.isnull(value) or value == '' or value == 'NaN' or value == 'character(0)' or value == 'nan':
        return None
    return value

# Function to parse c("...") formatted lists and convert them to arrays
def parse_list_field(field_value):
    if isinstance(field_value, str):
        if field_value.startswith('c('):
            # Extract values within c("...") using a regular expression
            values = re.findall(r'"([^"]+)"', field_value)
            return [v.replace('\\"', '').strip() for v in values]  # Remove extra backslashes and quotes
        else:
            # Single value, wrap it in an array
            single_value = field_value.replace('\\"', '').replace('"', '').strip()  # Clean quotes
            return [single_value]  # Ensure it's wrapped in an array
    return []

# Function to clean all columns
def clean_all_columns(df):
    # Apply cleaning for all columns
    for column in df.columns:
        # First, handle missing values for each column
        df[column] = df[column].apply(lambda x: handle_missing(x))
        
        # Then, for columns that might need list handling (like arrays), we use the parse_list_field function
        df[column] = df[column].apply(lambda x: json.dumps(parse_list_field(x)) if isinstance(x, str) else x)
    
    return df

# Clean all columns
df_cleaned = clean_all_columns(df)

# Let's view the cleaned data (print the first few rows)
print(df_cleaned.head(50))

# Alternatively, display entire DataFrame to inspect
#pd.set_option('display.max_columns', None)  # Uncomment this to view all columns in the print statement
#pd.set_option('display.max_rows', None)     # Uncomment this to view all rows in the print statement


  df = pd.read_csv(StringIO(body))


    RecipeId   Barcode                                               Name  \
0       38.0  ["*38*"]              ["Low-Fat Berry Blue Frozen Dessert"]   
1       39.0  ["*39*"]                                        ["Biryani"]   
2       40.0  ["*40*"]                                  ["Best Lemonade"]   
3       41.0  ["*41*"]                 ["Carina's Tofu-Vegetable Kebabs"]   
4       42.0  ["*42*"]                                   ["Cabbage Soup"]   
5       43.0  ["*43*"]                           ["Best Blackbottom Pie"]   
6       44.0  ["*44*"]                         ["Warm Chicken A La King"]   
7       45.0  ["*45*"]     ["Buttermilk Pie With Gingersnap Crumb Crust"]   
8       46.0  ["*46*"]                        ["A Jad - Cucumber Pickle"]   
9       47.0  ["*47*"]                           ["Butter Pecan Cookies"]   
10      48.0  ["*48*"]                               ["Boston Cream Pie"]   
11      49.0  ["*49*"]                       ["Chicken Breasts Lombardi"]   