# In [None]:
import boto3
import pandas as pd
from io import StringIO
from io import BytesIO

def read_csv_from_s3(bucket_name, object_key):
    """Download a CSV object from S3 and parse it into a DataFrame.

    The object is streamed into an in-memory bytes buffer (nothing is
    written to disk) and then handed to ``pd.read_csv`` with default
    parsing options.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        object_key: Key of the CSV object inside that bucket.

    Returns:
        The DataFrame produced by ``pd.read_csv`` from the object's bytes.
    """
    payload = BytesIO()

    # A fresh S3 client is created on every call.
    boto3.client('s3').download_fileobj(bucket_name, object_key, payload)

    # download_fileobj leaves the cursor at the end of the written data;
    # rewind so read_csv sees the content from the first byte.
    payload.seek(0)

    return pd.read_csv(payload)

# Matches a four-digit year in parentheses at the end of a title, e.g.
# "Heat (1995)". Whitespace before and after the parentheses is tolerated so
# that "Heat (1995) " or "Heat(1995)" are parsed the same way, and the same
# pattern is used for both extraction and removal so the two cannot disagree.
_TITLE_YEAR_SUFFIX = r'\s*\((\d{4})\)\s*$'

# Columns kept in the final dataset. 'genres_x' is presumably the left-hand
# (CSV) frame's 'genres' column after pandas applied its default merge
# suffix; it is renamed back to 'genres' before export.
_OUTPUT_COLUMNS = ['imdb_id', 'title', 'language', 'country', 'adult',
                   'genres_x', 'budget', 'revenue', 'release_date',
                   'runtime', 'popularity', 'avg_rating', 'people_rated']


def _split_title_and_year(movies_df):
    """Return a copy of *movies_df* with a trailing "(YYYY)" moved from 'title' into 'year'."""
    titles = movies_df['title']
    return movies_df.assign(
        # Rows whose title has no trailing "(YYYY)" get a missing year.
        year=titles.str.extract(_TITLE_YEAR_SUFFIX, expand=False),
        title=titles.str.replace(_TITLE_YEAR_SUFFIX, '', regex=True),
    )


def _build_final_dataset(csv_movie_df, api_movie_df):
    """Merge the CSV and API movie frames on 'title' and shape the output columns."""
    merged_df = pd.merge(
        _split_title_and_year(csv_movie_df),
        api_movie_df,
        on='title',
        how='inner',
    )
    return (
        merged_df[_OUTPUT_COLUMNS]
        # When several rows share an imdb_id, only the last one is kept.
        .drop_duplicates(subset='imdb_id', keep='last')
        .rename(columns={'genres_x': 'genres'})
    )


def lambda_handler(event, context):
    """Merge the raw movie CSVs from S3 and upload the combined dataset.

    Reads 'movies.csv' and 'api_movie_data.csv' from the raw-data bucket,
    strips the "(YYYY)" suffix from the CSV titles, inner-joins both frames
    on 'title', keeps the selected output columns, de-duplicates on
    'imdb_id', and writes the result as 'final_movie_dataset.csv' to the
    merged-data bucket.

    Args:
        event: Lambda invocation event. Not used by this handler.
        context: Lambda runtime context. Not used by this handler.

    Returns:
        A dict with 'statusCode' 200 and a success message in 'body'.

    Raises:
        KeyError: If an expected column is missing from the input data.
        Any error raised by the S3 download or upload calls propagates
        unchanged; nothing is caught here.
    """
    # Keys of the two raw input files.
    datasource1 = 'movies.csv'
    datasource2 = 'api_movie_data.csv'

    # Source and destination S3 buckets.
    source_bucket = 'final-project-rawdata-group-5-sec-2'
    destination_bucket = 'final-project-mergedata-group-5-sec-2'

    # Key under which the merged dataset is stored.
    destination_key = 'final_movie_dataset.csv'

    s3 = boto3.client('s3')

    # Load both inputs as DataFrames.
    csv_movie_df = read_csv_from_s3(source_bucket, datasource1)
    api_movie_df = read_csv_from_s3(source_bucket, datasource2)

    final_df = _build_final_dataset(csv_movie_df, api_movie_df)

    # Serialize without the index and encode explicitly so the upload body
    # is bytes rather than relying on implicit str handling.
    transformed_csv = final_df.to_csv(index=False).encode('utf-8')

    s3.put_object(Body=transformed_csv, Bucket=destination_bucket, Key=destination_key)

    return {
        'statusCode': 200,
        'body': 'Transformation completed successfully!'
    }