In [3]:
import boto3
import pandas as pd
from ydata_profiling import ProfileReport
import plotly.express as px

In [4]:
def read_and_concatenate_files_from_s3(bucket_name, prefix):
    """Load every CSV object under an S3 prefix into a single DataFrame.

    Objects are listed with the ``list_objects_v2`` paginator so that prefixes
    holding more keys than fit in one listing response are read completely.
    Only keys that end in ``.csv`` and do not contain ``_SUCCESS`` are loaded;
    each one is parsed as a tab-separated file.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix that selects the objects to consider.

    Returns:
        The row-wise concatenation of all loaded files with a fresh integer
        index, or ``None`` when no matching CSV object exists.
    """
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns only one page of keys; the
    # paginator follows the continuation tokens so no file is silently missed.
    paginator = s3.get_paginator('list_objects_v2')

    dataframes = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # 'Contents' is absent from a page when nothing matches the prefix.
        for obj in page.get('Contents', []):
            file_name = obj['Key']

            # Skip marker objects, "directory" keys and any non-CSV file.
            if not file_name.endswith('.csv') or "_SUCCESS" in file_name:
                continue

            # Log only the files that are actually loaded.
            print(f"Reading file: {file_name}")
            object_url = f"s3://{bucket_name}/{file_name}"

            # NOTE: on_bad_lines='skip' drops malformed rows without any
            # report, so the row count may be lower than the raw files'.
            dataframes.append(
                pd.read_csv(object_url, sep='\t', on_bad_lines='skip')
            )

    if not dataframes:
        print("No CSV files found.")
        return None

    full_df = pd.concat(dataframes, ignore_index=True)
    print("Concatenated DataFrame:")
    print(full_df.head())  # Preview of the first rows of the combined frame
    return full_df

# Load the review sample stored under the 'sample/' prefix of the
# 'amazon-reviews-eafit' bucket into one DataFrame.
full_dataframe = read_and_concatenate_files_from_s3(
    bucket_name='amazon-reviews-eafit',
    prefix='sample/',
)


Reading file: sample/_SUCCESS
Reading file: sample/part-00000-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00001-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00002-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00003-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00004-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00005-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00006-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00007-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00008-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00009-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00010-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00011-54e0d366-a37f-4ab3-b279-206f0c5a97bc-c000.csv
Reading file: sample/part-00012-54

In [5]:
full_dataframe.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,category
0,US,12006337,RVC5NDDNN93M8,B0145E526I,660424440,Superman Little Boys S/S Sublimation All Over ...,Apparel,5.0,0.0,0.0,N,N,Five Stars,Great fit super cute,2015-08-19,apparel
1,US,2396310,R30WDEU83V5AQO,B0144B8PSY,520474517,Amdirect Car Travel Seat Extended Mattress Inf...,Apparel,5.0,0.0,0.0,N,N,Five Stars,great for road trips!,2015-08-22,apparel
2,US,40793415,RARAEW7X41HYU,B013WQWPS8,746654558,M RACLE Women's Two Piece Crop Top + Midi Skir...,Apparel,5.0,0.0,0.0,N,N,A 5 Star,Beautiful and true to size but you could get i...,2015-08-27,apparel
3,US,466086,R1FWVBHBUSNMGF,B013W7ROD8,61634144,"\Funny Baby Onesies \""\""I Heart Mustaches\""\"" ...",Apparel,5.0,0.0,0.0,N,N,Wonderful!,This line is always coming out with great prod...,2015-08-14,apparel
4,US,5300783,RQYZGH7WLF73I,B013UCSCXG,278132176,Maggie Tang 50s 60s Vintage Short Sleeves Swin...,Apparel,5.0,0.0,0.0,N,N,I love it but i was very shocked that it didnt...,I love it but i was very shocked that it didnt...,2015-08-22,apparel


In [6]:
# Discard the rows that carry no star rating.
full_dataframe = full_dataframe.dropna(axis="index", subset=["star_rating"])

In [7]:
# Coerce first, drop second, downcast last: errors='coerce' turns every
# unparseable rating into NaN, so the NaN filter has to run after the
# conversion, and the integer downcast can only succeed once no NaN is left.
# Building a new frame with assign() also avoids writing into a column of a
# frame that was derived from another one.
full_dataframe = (
    full_dataframe
    .assign(star_rating=lambda frame: pd.to_numeric(frame['star_rating'], errors='coerce'))
    .dropna(subset=['star_rating'])
    .assign(star_rating=lambda frame: pd.to_numeric(frame['star_rating'], downcast='integer'))
)

## Exportamos el DataFrame a Parquet

In [8]:
# Persist the frame as a gzip-compressed Parquet file; index=None leaves the
# decision about storing the index to pandas.
full_dataframe.to_parquet(
    "../data/sample.gzip",
    compression='gzip',
    index=None,
)

## EDA

In [9]:
# Build the profiling report object for the whole frame.
profile = ProfileReport(
    full_dataframe,
    title="Profiling Report",
)

In [10]:
# Show the report as this cell's output. The progress bars recorded below
# suggest the report is computed when it is displayed, not when it is created.
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [22]:
# Work on a copy so the loaded frame stays untouched by this analysis.
df = full_dataframe.copy()

# Number of non-null review bodies per customer.
grouped = df.groupby("customer_id")[["review_body"]].count()

# Keep only customers with more than one review, most active first.
filtered = (
    grouped
    .loc[grouped["review_body"] > 1]
    .sort_values(by="review_body", ascending=False)
)


In [24]:
# Preview the first five rows of the per-customer review counts.
filtered.head(n=5)

Unnamed: 0_level_0,review_body
customer_id,Unnamed: 1_level_1
50122160,223
50732546,103
50736950,91
18116317,77
52496677,69


In [36]:
# Distribution of the number of reviews per customer.
# The histogram is taken over the review count itself: customer_id (the index
# of `filtered`) is an arbitrary identifier, so binning on it would only sum
# counts over meaningless id ranges.
fig = px.histogram(
    filtered,
    x="review_body",
    nbins=100,
    title="Reviews per customer (customers with more than one review)",
    labels={"review_body": "Reviews per customer"},
)
fig.update_layout(yaxis_title="Number of customers")
fig.show()