In [1]:
import pandas as pd

# Location of the raw calories dataset in the SageMaker project bucket.
s3_uri = "s3://amazon-sagemaker-730335348097-us-east-1-028ed1706034/data/calories.csv"

# Load the whole CSV into memory and preview the first five rows.
# NOTE(review): the recorded preview contains an "Unnamed: 0" column, which looks like
# a previously saved row index — confirm whether it should be dropped or read as the index.
df = pd.read_csv(filepath_or_buffer=s3_uri)
df.head(n=5)

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


In [6]:
# 1. Initialise a Git repository in the current working directory.
# DVC keeps its pointer files and configuration under Git, so this runs before `dvc init`.
# NOTE(review): the recorded output shows the repository rooted at
# .../mlops-calories/notebooks/ — confirm the notebooks folder (not the project root)
# is the intended repository root.
!git init

Reinitialized existing Git repository in /home/sagemaker-user/src/mlops-calories/notebooks/.git/


In [7]:
# 2. Install DVC with S3 support (ignore the fsspec warning for now)
!pip install 'dvc[s3]' --quiet

In [9]:
# 3. Initialise DVC inside the Git repository created above.
# -f forces the initialisation even when a .dvc/ directory already exists, which keeps
# this cell re-runnable.
# NOTE(review): per the DVC command reference a forced init replaces the existing .dvc/
# directory (including its config and local cache) — confirm that is acceptable before
# re-running this cell on a repository that already tracks data.
!dvc init -f

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

In [12]:
# 4. Set up S3 as DVC remote
!dvc remote add -f -d myremote s3://amazon-sagemaker-730353348097-us-east-1-028ed1706034/dvcstore
!dvc remote modify myremote endpointurl https://s3.amazonaws.com

Setting 'myremote' as a default remote.
[0m[0m

### Create Dataset

In [13]:
# Version 1 of the dataset: the leading 80% of the rows, taken positionally (no shuffling).
df_80 = df.head(int(len(df) * 0.8))

# Write both versions next to the notebook, without the pandas row index,
# so that DVC can track them as separate files.
df_80.to_csv(path_or_buf="calories_v1.csv", index=False)
df.to_csv(path_or_buf="calories_full.csv", index=False)

### Track with DVC

In [14]:
# Start tracking both CSV files with DVC. Each `dvc add` copies the file into the DVC
# cache and writes a small pointer file (calories_v1.csv.dvc / calories_full.csv.dvc);
# as the command output suggests, the pointer files and .gitignore are what get
# committed to Git, not the data itself.
!dvc add calories_v1.csv
!dvc add calories_full.csv

[?25l[32m⠋[0m Checking graph
Adding...                                                                       
![A
Collecting files and computing hashes in calories_v1.csv |0.00 [00:00,     ?file[A
                                                                                [A
![A
  0% Checking cache in '/home/sagemaker-user/src/mlops-calories/notebooks/.dvc/c[A
                                                                                [A
![A
  0%|          |Adding calories_v1.csv to cache       0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/sagemaker-user/src/0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 12.54file/s][A

To track the changes with git, run:

	git add calories_v1.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true
[?25l[32m⠋[0m Checking graph
Adding...             

### Push to Remote Storage

✅ Created a new, valid S3 bucket (dvc-store-bucket)

✅ Configured DVC with the correct remote URL (the cells below write it to the user-level `~/.config/dvc/config` via `--global`)

✅ Ran dvc push successfully — your data is now versioned in the cloud!

In [41]:
# Drop any earlier definition of 'myremote' from the user-level (--global) and the
# per-repository private (--local) DVC config so the remote can be re-created cleanly.
# `|| echo ...` keeps the cell from failing when the remote is absent in that scope.
# NOTE(review): neither command targets the project-level .dvc/config — confirm that no
# stale 'myremote' entry remains there.
!dvc remote remove myremote --global || echo "already removed"
!dvc remote remove myremote --local || echo "already removed"

[0m[0m

In [42]:
# Re-create 'myremote' as the default remote (-d), now pointing at the dvc-store-bucket
# bucket, and set its endpoint URL.
# --global stores both settings in the user-level DVC config instead of the project's
# .dvc/config.
# NOTE(review): user-level settings are not committed with the repository, so other
# clones would have to configure the remote themselves — confirm this is intended.
!dvc remote add -f -d myremote s3://dvc-store-bucket --global
!dvc remote modify myremote endpointurl https://s3.amazonaws.com --global

Setting 'myremote' as a default remote.
[0m[0m

In [43]:
# Print the user-level DVC config to verify the default remote, its URL and endpoint.
!cat ~/.config/dvc/config

[core]
    remote = myremote
['remote "myremote"']
    url = s3://dvc-store-bucket
    endpointurl = https://s3.amazonaws.com


In [44]:
# Upload the locally cached data files to the default DVC remote.
# -v turns on verbose (DEBUG) logging so the source cache path and target bucket are visible.
!dvc push -v

[34m2025-08-05 22:05:12,726[39m [34mDEBUG[39m: v3.61.0 (pip), CPython 3.11.11 on Linux-5.10.238-231.953.amzn2.x86_64-x86_64-with-glibc2.35
[34m2025-08-05 22:05:12,727[39m [34mDEBUG[39m: command: /opt/conda/bin/dvc push -v
Collecting                                            |2.00 [00:00,  171entry/s]
[34m2025-08-05 22:05:13,092[39m [34mDEBUG[39m: Preparing to transfer data from '/home/sagemaker-user/src/mlops-calories/notebooks/.dvc/cache/files/md5' to 's3://dvc-store-bucket/files/md5'
[34m2025-08-05 22:05:13,092[39m [34mDEBUG[39m: Preparing to collect status from 'dvc-store-bucket/files/md5'
[34m2025-08-05 22:05:13,092[39m [34mDEBUG[39m: Collecting status from 'dvc-store-bucket/files/md5'
Pushing
![A
       ecking cache in 'dvc-store-bucket/files/md5'| |0/? [00:00<?,    ?files/s][A
[A[34m2025-08-05 22:05:13,515[39m [34mDEBUG[39m: Estimated remote size: 4096 files
Pushing
       ecking cache in 'dvc-store-bucket/files/md5'| |0/? [00:00<?,    ?files/s][A
[A

In [45]:
# The DVC cache was pushed to the new S3 bucket (dvc-store-bucket). Objects in the bucket are stored as hashed files under files/md5/<two-character prefix>/, for example 3f43e706cf11f89c8b966cea1d0317 under files/md5/11/; these hashed files are what DVC uses to store and version data blobs.

### Download versioned data from remote

In [47]:
# Fetch the tracked data from the default remote into the workspace, then check that
# the workspace is in sync with the .dvc pointer files.
!dvc pull
!dvc status

Collecting                                            |2.00 [00:00,  224entry/s]
Fetching
![A
  0% Checking cache in '/home/sagemaker-user/src/mlops-calories/notebooks/.dvc/c[A
Fetching                                                                        [A
Building workspace index                              |2.00 [00:00,  168entry/s]
Comparing indexes                                     |3.00 [00:00,  621entry/s]
Applying changes                                      |0.00 [00:00,     ?file/s]
Everything is up to date.
Data and pipelines are up to date.                                              
[0m

### Commit .dvc and .gitignore files

In [48]:
!git add *.dvc .gitignore
!git commit -m "Track data with DVC and configure remote storage"
!git push

[master (root-commit) 574f2bd] Track data with DVC and configure remote storage
 6 files changed, 18 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 calories_full.csv.dvc
 create mode 100644 calories_v1.csv.dvc
fatal: No configured push destination.
Either specify the URL from the command-line or configure a remote repository using

    git remote add <name> <url>

and then push using the remote name

    git push <name>



In [1]:
# Reset the remote to SSH
!git remote set-url origin git@github.com:luwenkai1997/mlops-calories.git