# Ingest

In this notebook, we load a short snippet of the WikiText-2 dataset, write it to a local text file, and upload that file to an S3 bucket.

In [1]:
import os
import warnings

from dotenv import load_dotenv, find_dotenv
import boto3
from datasets import load_dataset

# Silence all warnings for the rest of the session to keep cell output short.
warnings.simplefilter("ignore")

# Locate the nearest .env file and load its entries into the process
# environment; the returned flag is displayed as the cell output.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [2]:
# S3 connection settings are read from environment variables. Define
# S3_ENDPOINT, S3_ACCESS_KEY, S3_SECRET_KEY and S3_BUCKET in a local .env file
# (loaded by the setup cell); any variable that is not set comes back as None.
s3_endpoint_url, s3_access_key, s3_secret_key, s3_bucket = (
    os.getenv(variable)
    for variable in ("S3_ENDPOINT", "S3_ACCESS_KEY", "S3_SECRET_KEY", "S3_BUCKET")
)

In [3]:
# Gather the connection settings read from the environment, then build the
# S3 client that is used for the upload at the end of the notebook.
s3_client_config = {
    "service_name": "s3",
    "endpoint_url": s3_endpoint_url,
    "aws_access_key_id": s3_access_key,
    "aws_secret_access_key": s3_secret_key,
}
s3 = boto3.client(**s3_client_config)

In [4]:
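# Optional one-time setup, left commented out so it does not run on every execution:
# installs the JupyterLab extension for ipywidgets.
# ! jupyter labextension install @jupyter-widgets/jupyterlab-manager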

In [5]:
# Load the 'wikitext-2-v1' configuration of the 'wikitext' dataset; the result
# is a DatasetDict holding the test, train and validation splits.
dataset = load_dataset(
    path='wikitext',
    name='wikitext-2-v1',
)

Reusing dataset wikitext (/Users/oindrillachatterjee/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)


In [6]:
# Display the DatasetDict summary: available splits, their features and row counts.
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [7]:
# Take the 'text' field of the first 50 rows of the training split.
# Slicing the rows first (dataset['train'][:50]) returns a dict of column lists
# for just those rows, so the full 36k-row 'text' column is never materialised
# only to be cut down to 50 entries afterwards.
text = dataset['train'][:50]['text']

In [8]:
# Drop falsy entries (i.e. empty strings) so only rows with content remain.
text = [line for line in text if line]

In [9]:
# Preview only the first few entries instead of dumping the whole list into
# the notebook output.
text[:5]

[' = Valkyria Chronicles III = \n',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcom

## Upload to S3

In [10]:
# Local destination for the extracted snippet.
file_path = 'data/raw/wiki.txt'

# Create the target directory if it is missing so the cell also works on a
# fresh checkout (open() does not create parent directories).
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write with an explicit UTF-8 encoding: the snippet contains non-ASCII
# characters (e.g. Japanese text), and the platform default encoding is not
# guaranteed to be able to represent them.
with open(file_path, 'w', encoding='utf-8') as file:
    for item in text:
        # NOTE(review): entries in the displayed sample already end with '\n',
        # so this yields a blank line between entries — confirm that is intended.
        file.write('%s\n' % item)

In [11]:
# Object key under which the file is stored in the bucket.
key = 'op1-pipelines/wiki.txt'

# Upload the local file to the configured bucket with the client created above.
s3.upload_file(
    Filename=str(file_path),
    Bucket=s3_bucket,
    Key=key,
)