In [1]:
import ast
import json
from io import StringIO

import boto3
import pandas as pd

import s3_file_operations as s3_ops

In [68]:
bucket = "de-masterclass"  # S3 bucket name

print("Starting data transformation...")

# Read the raw character export from S3.
print("Reading Character data from S3...")
characters_df = s3_ops.read_csv_from_s3(bucket, 'Rick&Morty/Untransformed/Character.csv')

# Validate the load before touching the frame. The helper presumably returns
# None on failure (TODO confirm in s3_file_operations); using `.shape` on None
# would raise an unrelated AttributeError, and continuing would print a
# misleading success message.
if characters_df is None:
    print("Error in loading data from S3")
    raise RuntimeError(
        f"Could not load 'Rick&Morty/Untransformed/Character.csv' from bucket '{bucket}'"
    )

print(f"Characters DataFrame shape: {characters_df.shape}")
print("Data loaded successfully from S3")

Starting data transformation...
Reading Character data from S3...
Characters DataFrame shape: (826, 12)
Data loaded successfully from S3


In [69]:
# Preview the first rows of the loaded character data.
characters_df.head()

Unnamed: 0,id,name,status,species,type,gender,origin,location,image,episode,url,created
0,1,Rick Sanchez,Alive,Human,,Male,"{'name': 'Earth (C-137)', 'url': 'https://rick...","{'name': 'Citadel of Ricks', 'url': 'https://r...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/1', ...",https://rickandmortyapi.com/api/character/1,2017-11-04T18:48:46.250Z
1,2,Morty Smith,Alive,Human,,Male,"{'name': 'unknown', 'url': ''}","{'name': 'Citadel of Ricks', 'url': 'https://r...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/1', ...",https://rickandmortyapi.com/api/character/2,2017-11-04T18:50:21.651Z
2,3,Summer Smith,Alive,Human,,Female,"{'name': 'Earth (Replacement Dimension)', 'url...","{'name': 'Earth (Replacement Dimension)', 'url...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/6', ...",https://rickandmortyapi.com/api/character/3,2017-11-04T19:09:56.428Z
3,4,Beth Smith,Alive,Human,,Female,"{'name': 'Earth (Replacement Dimension)', 'url...","{'name': 'Earth (Replacement Dimension)', 'url...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/6', ...",https://rickandmortyapi.com/api/character/4,2017-11-04T19:22:43.665Z
4,5,Jerry Smith,Alive,Human,,Male,"{'name': 'Earth (Replacement Dimension)', 'url...","{'name': 'Earth (Replacement Dimension)', 'url...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/6', ...",https://rickandmortyapi.com/api/character/5,2017-11-04T19:26:56.301Z


In [70]:
def _extract_url_id(record):
    """Return the last path segment of ``record['url']``, or None.

    Parameters
    ----------
    record : str | dict | Any
        Either a dict with a ``'url'`` key or the string repr of such a dict
        (as read back from CSV). Anything else (e.g. NaN) yields None.

    Returns
    -------
    str | None
        The text after the final ``/`` of the URL, or None when the URL is
        empty or missing or the record is not a dict.

    Raises
    ------
    ValueError, SyntaxError
        If ``record`` is a string that ``ast.literal_eval`` cannot parse.
    """
    # Parse the string into a dictionary if it's not already one.
    if isinstance(record, str):
        record = ast.literal_eval(record)

    # Missing values (NaN, None, ...) cannot be indexed by key.
    if not isinstance(record, dict):
        return None

    url = record.get('url')
    if not url:
        return None
    return url.split('/')[-1]


# Extracting origin_id
origin_id_list = [_extract_url_id(record) for record in characters_df['origin']]

# Extracting location_id
location_id_list = [_extract_url_id(record) for record in characters_df['location']]

characters_df['origin_id'] = origin_id_list
characters_df['location_id'] = location_id_list


Alternatively, we can use list comprehensions to achieve the same result.

In [42]:
def extract_id(x):
    """Return the text after the final ``/`` in URL ``x``, or None if ``x`` is empty/falsy."""
    return x.split('/')[-1] if x else None


def _record_url(record):
    """Return the ``'url'`` value of an origin/location record, or None.

    ``record`` may be a dict or the string repr of one; string input is parsed
    with ``ast.literal_eval``. Non-dict values (e.g. NaN) and records without
    a ``'url'`` key yield None.
    """
    if isinstance(record, str):
        record = ast.literal_eval(record)
    return record.get('url') if isinstance(record, dict) else None


# Using list comprehensions to extract origin_id and location_id
characters_df['origin_id'] = [
    extract_id(_record_url(record)) for record in characters_df['origin']
]

characters_df['location_id'] = [
    extract_id(_record_url(record)) for record in characters_df['location']
]


In [45]:
# Count the number of episodes each character appears in.


def episode_count(x):
    """Return ``len(x)``, or None when ``x`` is empty or otherwise falsy."""
    if x:
        return len(x)
    return None


def _count_from_cell(cell):
    """Parse one 'episode' cell and count its entries; non-string cells give None."""
    if not isinstance(cell, str):
        return None
    return episode_count(ast.literal_eval(cell))


# One count per row, in the same order as the 'episode' column.
characters_df['episode_count'] = list(map(_count_from_cell, characters_df['episode']))


In [47]:
# Remove the origin/location/episode columns and rename image -> image_url
# in a single chained expression.
print("Dropping and renaming columns...")
characters_df = (
    characters_df
    .drop(columns=['origin', 'location', 'episode'])
    .rename(columns={'image': 'image_url'})
)

Dropping and renaming columns...


In [71]:
# Summarize the character frame: columns, non-null counts and dtypes.
characters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 826 entries, 0 to 825
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           826 non-null    int64 
 1   name         826 non-null    object
 2   status       826 non-null    object
 3   species      826 non-null    object
 4   type         425 non-null    object
 5   gender       826 non-null    object
 6   origin       826 non-null    object
 7   location     826 non-null    object
 8   image        826 non-null    object
 9   episode      826 non-null    object
 10  url          826 non-null    object
 11  created      826 non-null    object
 12  origin_id    526 non-null    object
 13  location_id  805 non-null    object
dtypes: int64(1), object(13)
memory usage: 90.5+ KB


In [72]:
# Preview the first rows again to inspect the added origin_id / location_id columns.
characters_df.head()

Unnamed: 0,id,name,status,species,type,gender,origin,location,image,episode,url,created,origin_id,location_id
0,1,Rick Sanchez,Alive,Human,,Male,"{'name': 'Earth (C-137)', 'url': 'https://rick...","{'name': 'Citadel of Ricks', 'url': 'https://r...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/1', ...",https://rickandmortyapi.com/api/character/1,2017-11-04T18:48:46.250Z,1.0,3
1,2,Morty Smith,Alive,Human,,Male,"{'name': 'unknown', 'url': ''}","{'name': 'Citadel of Ricks', 'url': 'https://r...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/1', ...",https://rickandmortyapi.com/api/character/2,2017-11-04T18:50:21.651Z,,3
2,3,Summer Smith,Alive,Human,,Female,"{'name': 'Earth (Replacement Dimension)', 'url...","{'name': 'Earth (Replacement Dimension)', 'url...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/6', ...",https://rickandmortyapi.com/api/character/3,2017-11-04T19:09:56.428Z,20.0,20
3,4,Beth Smith,Alive,Human,,Female,"{'name': 'Earth (Replacement Dimension)', 'url...","{'name': 'Earth (Replacement Dimension)', 'url...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/6', ...",https://rickandmortyapi.com/api/character/4,2017-11-04T19:22:43.665Z,20.0,20
4,5,Jerry Smith,Alive,Human,,Male,"{'name': 'Earth (Replacement Dimension)', 'url...","{'name': 'Earth (Replacement Dimension)', 'url...",https://rickandmortyapi.com/api/character/avat...,"['https://rickandmortyapi.com/api/episode/6', ...",https://rickandmortyapi.com/api/character/5,2017-11-04T19:26:56.301Z,20.0,20


In [50]:
# Read the raw episode export from S3.
print("Reading Episode data from S3...")
episodes_df = s3_ops.read_csv_from_s3(bucket, 'Rick&Morty/Untransformed/Episode.csv')

# Validate the load before touching the frame. The helper presumably returns
# None on failure (TODO confirm in s3_file_operations); using `.shape` on None
# would raise an unrelated AttributeError, and continuing would print a
# misleading success message.
if episodes_df is None:
    print("Error in loading data from S3")
    raise RuntimeError(
        f"Could not load 'Rick&Morty/Untransformed/Episode.csv' from bucket '{bucket}'"
    )

print(f"Episodes DataFrame shape: {episodes_df.shape}")
print("Data loaded successfully from S3")

Reading Episode data from S3...
Episodes DataFrame shape: (51, 7)
Data loaded successfully from S3


In [56]:
# Preview the first rows of the loaded episode data.
episodes_df.head()

Unnamed: 0,id,name,air_date,episode,characters,url,created
0,1,Pilot,"December 2, 2013",S01E01,['https://rickandmortyapi.com/api/character/1'...,https://rickandmortyapi.com/api/episode/1,2017-11-10T12:56:33.798Z
1,2,Lawnmower Dog,"December 9, 2013",S01E02,['https://rickandmortyapi.com/api/character/1'...,https://rickandmortyapi.com/api/episode/2,2017-11-10T12:56:33.916Z
2,3,Anatomy Park,"December 16, 2013",S01E03,['https://rickandmortyapi.com/api/character/1'...,https://rickandmortyapi.com/api/episode/3,2017-11-10T12:56:34.022Z
3,4,M. Night Shaym-Aliens!,"January 13, 2014",S01E04,['https://rickandmortyapi.com/api/character/1'...,https://rickandmortyapi.com/api/episode/4,2017-11-10T12:56:34.129Z
4,5,Meeseeks and Destroy,"January 20, 2014",S01E05,['https://rickandmortyapi.com/api/character/1'...,https://rickandmortyapi.com/api/episode/5,2017-11-10T12:56:34.236Z


In [53]:
# Work on a copy so episodes_df itself is left unchanged by this cell.
appearance_df = episodes_df.copy()


def character_func(x):
    """Return the trailing ID segment of every character URL in ``x``.

    Parameters
    ----------
    x : str | list | tuple | Any
        A sequence of character URLs, or the string repr of such a list (which
        is how the column arrives after a CSV round-trip). A string is parsed
        with ``ast.literal_eval`` first; iterating it directly would walk it
        character by character rather than URL by URL.

    Returns
    -------
    list[str] | None
        One ID per URL (the text after the final ``/``), or None when ``x`` is
        empty, blank, or not a list/tuple (e.g. NaN).

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is a non-blank string that ``ast.literal_eval`` cannot parse.
    """
    if isinstance(x, str):
        x = ast.literal_eval(x) if x.strip() else None
    if not isinstance(x, (list, tuple)) or not x:
        return None
    return [url.split('/')[-1] for url in x]


# Using a list comprehension to extract the character IDs for each episode
appearance_df['character_ids'] = [
    character_func(record) for record in appearance_df['characters']
]