# Processing

In this notebook, we read a wiki text snippet from an S3 bucket, process it, and save the result back to S3 so that it can be imported into a Trino database and visualized in Superset.

In [13]:
import os
import nltk
import warnings
from pathlib import Path

from dotenv import load_dotenv, find_dotenv
import boto3
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Blanket filter: ignore every warning raised from this point on.
warnings.filterwarnings(action="ignore")
# Load the variables from the discovered .env file; the call's return value
# is what this cell displays.
load_dotenv(dotenv_path=find_dotenv())

True

In [2]:
## S3 connection settings are taken from environment variables;
## create a local .env file holding the correct values.
## Any variable that is not set is read as None.
s3_endpoint_url = os.environ.get("S3_ENDPOINT")
s3_access_key = os.environ.get("S3_ACCESS_KEY")
s3_secret_key = os.environ.get("S3_SECRET_KEY")
s3_bucket = os.environ.get("S3_BUCKET")

In [3]:
# Build the S3 client against the configured endpoint, authenticating with
# the access/secret key pair read from the environment.
s3 = boto3.client(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
)

## Read from S3

In [16]:
# Local working directories inside the project's data directory:
# raw input fetched from S3, and the processed output produced here.
raw_destination_path = Path("data/raw")
processed_destination_path = Path("data/processed")

# Create both directories up front. The raw directory has to exist before the
# download step can write into it (a fresh checkout has neither directory).
# exist_ok=True makes re-running this cell a no-op, so no existence check is needed.
raw_destination_path.mkdir(parents=True, exist_ok=True)
processed_destination_path.mkdir(parents=True, exist_ok=True)

# Full local paths of the downloaded snippet and of the token file to be written.
raw_file_path = raw_destination_path.joinpath('wiki.txt')
processed_file_path = processed_destination_path.joinpath('token.txt')

In [18]:
# Fetch the raw wiki snippet from the bucket into the local raw-data file.
key = 'op1-pipelines/wiki.txt'
s3.download_file(
    Filename=os.fspath(raw_file_path),
    Bucket=s3_bucket,
    Key=key,
)

In [5]:
# Read the whole downloaded file into memory. The context manager closes the
# handle as soon as the read finishes instead of leaving it open in the kernel.
# NOTE(review): assumes the snippet is UTF-8 encoded — confirm against the source data.
with open(raw_file_path, "r", encoding="utf-8") as f:
    text_corpus = f.read()

In [6]:
# Display the loaded corpus as the cell output for a quick visual check.
text_corpus

'The music was composed by Hitoshi Sakimoto , who had also worked on the previous Valkyria Chronicles games . When he originally heard about the project , he thought it would be a light tone similar to other Valkyria Chronicles games , but found the themes much darker than expected . An early theme he designed around his original vision of the project was rejected . He <unk> the main theme about seven times through the music production due to this need to <unk> the game . The main theme was initially recorded using orchestra , then Sakimoto removed elements such as the guitar and bass , then adjusted the theme using a synthesizer before <unk> segments such as the guitar piece on their own before incorporating them into the theme . The rejected main theme was used as a hopeful tune that played during the game \'s ending . The battle themes were designed around the concept of a " modern battle " divorced from a fantasy scenario by using modern musical instruments , constructed to create 

## Process Text

In [7]:
# Download the NLTK packages used by this notebook, in order.
nltk_packages = ('stopwords', 'punkt')
download_results = [nltk.download(package) for package in nltk_packages]
# Keep the last download's result as the cell output.
download_results[-1]

[nltk_data] Downloading package stopwords to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Tokenize the corpus and drop English stop words. The stop-word check is done
# on the lower-cased token, while kept tokens retain their original casing.
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(text_corpus)

tokenized = [token for token in word_tokens if token.lower() not in stop_words]
# Preview the first ten kept tokens.
tokenized[:10]

['music',
 'composed',
 'Hitoshi',
 'Sakimoto',
 ',',
 'also',
 'worked',
 'previous',
 'Valkyria',
 'Chronicles']

In [9]:
# Write the kept tokens to the processed file, one token per line.
# The encoding is pinned to UTF-8 so the uploaded file does not depend on the
# locale of the machine that ran the notebook.
with open(processed_file_path, 'w', encoding='utf-8') as token_file:
    token_file.writelines(f'{item}\n' for item in tokenized)

## Upload to S3

In [10]:
# Push the local token file to the bucket under the processed prefix.
key = 'op1-pipelines/processed/token.txt'
s3.upload_file(
    Filename=os.fspath(processed_file_path),
    Bucket=s3_bucket,
    Key=key,
)