In [7]:
import time
import pandas as pd
import pymongo.collection
import pymongo.errors
import pymongo.typings
import requests
import datetime
import pymongo
from typing import List
from bs4 import BeautifulSoup


def scrape_articles(article_urls: list, delay=1):
    articles = []
    print(enumerate(article_urls))
    for index, url in enumerate(article_urls):
        print(f"Fetching article {index+1} of {len(article_urls)}: {url}")

        article = scrape_article(url)
        articles.append(article)
        time.sleep(delay)

    return articles


def scrape_article(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.find(class_="article-title").text.strip()
    text = soup.find(class_="text").text.strip()
    last_updated = list(soup.find(class_="article-title").find_next_sibling().children)[
        3
    ].text
    author = soup.find(
        class_="article-meta-author-details-profile-display-name"
    ).text.strip()
    last_updated = datetime.datetime.strptime(last_updated, "%d %b, %Y").date()

    return {
        "title": title,
        "text": text,
        "last_updated": last_updated,
        "author": author,
        "url": url,
    }


def add_articles_to_db(articles: List[str], collection: pymongo.collection.Collection):
    for article in articles:
        try:
            collection.insert_one(article)
        except pymongo.errors.DuplicateKeyError:
            print("duplicate key error")
        except Exception as e:
            raise (e)

In [3]:
# fetch urls
PAGE_URL = "https://www.geeksforgeeks.org/docker-tutorial/"

page_soup = BeautifulSoup(requests.get(PAGE_URL).content, "html.parser")
urls = []

text_section = page_soup.find(class_="text")
a_tags = text_section.find_all("a")

for elem in a_tags:
    if elem.get('href') and elem.get('href').startswith("https://www.geeksforgeeks.org/"):
        urls.append(elem['href'])

len(urls)

71

In [4]:
df = pd.DataFrame(columns=["title","text", "url", "author", "last_updated"])


In [9]:
DELAY = 2

print(enumerate(urls))
for index, url in enumerate(urls):
    if url in df["url"].unique():
        continue
    print(f"Fetching article {index+1} of {len(urls)}: {url}")
    article = scrape_article(url)

    df.loc[len(df)] = article
    time.sleep(DELAY)

<enumerate object at 0x7f87b9598840>
Fetching article 47 of 71: https://www.geeksforgeeks.org/docker-container-linking/
Fetching article 48 of 71: https://www.geeksforgeeks.org/tips-to-manage-docker-containers-using-cli/
Fetching article 49 of 71: https://www.geeksforgeeks.org/mounting-a-volume-inside-docker-container/
Fetching article 50 of 71: https://www.geeksforgeeks.org/difference-between-docker-image-and-container/
Fetching article 51 of 71: https://www.geeksforgeeks.org/difference-between-virtual-machines-and-containers/
Fetching article 52 of 71: https://www.geeksforgeeks.org/how-to-install-linux-packages-inside-a-docker-container/
Fetching article 53 of 71: https://www.geeksforgeeks.org/copying-files-to-and-from-docker-containers/
Fetching article 54 of 71: https://www.geeksforgeeks.org/how-to-run-mongodb-as-a-docker-container/
Fetching article 55 of 71: https://www.geeksforgeeks.org/docker-docker-container-for-node-js/
Fetching article 56 of 71: https://www.geeksforgeeks.org/

In [40]:
df_old = pd.read_csv("articles.csv")
df_old = df_old.drop(columns=["Unnamed: 0"])


def clean_text(text):
    new_sample = []
    for elem in text.split("\n"):
        if elem != "":
            new_sample.append(elem)

    result = "\n".join(new_sample)
    return result


df["clean_text"] = df["text"].apply(clean_text)
df = df.set_index(pd.Index(list(range(len(df_old), len(df) + len(df_old)))))
df

Unnamed: 0,title,text,url,author,last_updated,clean_text
110,What is Docker?,Docker is a set of Platforms as a service (Paa...,https://www.geeksforgeeks.org/introduction-to-...,aaaanchakure,2024-04-05,Docker is a set of Platforms as a service (Paa...
111,Features of Docker,Pre-requisite: Docker\nDocker is one of the mo...,https://www.geeksforgeeks.org/features-of-docker/,msharma043510,2023-03-30,Pre-requisite: Docker\nDocker is one of the mo...
112,Architecture of Docker,Pre-requisite: Docker\nDocker makes use of a c...,https://www.geeksforgeeks.org/architecture-of-...,kushalpareek,2023-04-25,Pre-requisite: Docker\nDocker makes use of a c...
113,What is Docker Hub?,Pre-requisites: Docker\nDocker Hub is a reposi...,https://www.geeksforgeeks.org/what-is-docker-h...,mohdahtisham9889,2023-04-25,Pre-requisites: Docker\nDocker Hub is a reposi...
114,What is Docker Cloud?,Pre-requisite:- Docker\nDocker is a software p...,https://www.geeksforgeeks.org/what-is-docker-c...,sourabhsahu33,2023-03-30,Pre-requisite:- Docker\nDocker is a software p...
...,...,...,...,...,...,...
173,Docker – Deploying WebApps on Docker,Docker applies abstraction at the software lay...,https://www.geeksforgeeks.org/docker-deploying...,sarithak5br5,2024-04-23,Docker applies abstraction at the software lay...
174,Docker – Continuous Integration,Continuous Integration ( CI ) with Docker impr...,https://www.geeksforgeeks.org/docker-continuou...,arunbang17,2024-02-14,Continuous Integration ( CI ) with Docker impr...
175,Difference Between Vagrant and Docker,1. Vagrant: It is a tool for constructing and ...,https://www.geeksforgeeks.org/difference-betwe...,dikshamulchandani1,2023-01-03,1. Vagrant: It is a tool for constructing and ...
176,How to Setup Jenkins in Docker Container?,"In this article, we will look into the process...",https://www.geeksforgeeks.org/how-to-setup-jen...,prakashpratham89,2023-03-30,"In this article, we will look into the process..."


In [6]:
df_new = pd.concat([df_old, df])
df_new.to_csv('articles.csv')
len(df_new)

NameError: name 'pd' is not defined

In [10]:
result = pd.read_csv('articles.csv').drop(columns='Unnamed: 0')

In [11]:
# Data preparation

system_message = "You are a technical content writer. Your job is to write technical articles based on the title provided."

def create_user_message(row):
    return f"TITLE: {row['title']} \n\nARTICLE: "

def prepare_example_conversation(row):
    messages = []
    messages.append({"role":"system", "content": system_message})

    user_message = create_user_message(row)
    messages.append({"role":"user", "content": user_message})

    messages.append({"role":"assistant", "content": row["clean_text"]})

    return {"messages": messages}


import pprint, tiktoken
pprint.pprint(prepare_example_conversation(result.loc[0]))



{'messages': [{'content': 'You are a technical content writer. Your job is to '
                          'write technical articles based on the title '
                          'provided.',
               'role': 'system'},
              {'content': 'TITLE: Introduction to Amazon Web Services \n'
                          '\n'
                          'ARTICLE: ',
               'role': 'user'},
              {'content': 'Amazon Web Services (AWS) is a leading top platform '
                          'in providing the web services of various domains. '
                          'AWS follows the trends of digital IT and comes up '
                          'needy services with optimized performances covering '
                          'a wide range of services from Compute to Storage. '
                          'It covers a wider range of customers of different '
                          'domains to expand their business operations. This '
                          'Article covers

In [12]:
training_df = pd.read_csv("articles.csv").drop(columns="Unnamed: 0")

training_df["training_data"] = training_df.apply(prepare_example_conversation, axis=1)
training_data = training_df.loc[0:135, "training_data"].tolist()

In [13]:
len(training_data)

136

In [14]:
validation_df = training_df.loc[135:]
validation_data = validation_df.apply(prepare_example_conversation, axis=1).tolist()
len(validation_data)

43

In [15]:
import json

def write_jsonl(data_list: list, filename:str) -> None:
    with open(filename, "w") as out:
        for ddict in data_list:
            jout = json.dumps(ddict)+"\n"
            out.write(jout)

In [16]:
training_filename = "training_dataset.jsonl"
write_jsonl(training_data, training_filename)

validation_filename = "validation_dataset.jsonl"
write_jsonl(validation_data, validation_filename)

In [4]:
# Upload dataset
from openai import OpenAI
client = OpenAI()

training_filename = "training_dataset.jsonl"

with open(training_filename, "rb") as training_fd:
    training_response = client.files.create(
        file=training_fd, purpose="fine-tune"
     )
    
training_file_id = training_response.id

with open(validation_filename, "rb") as validation_fd:
    validation_response = client.files.create(
        file=validation_fd, purpose="fine-tune"
    )
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

KeyboardInterrupt: 

In [18]:
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model="gpt-3.5-turbo",
    suffix="tech-article"
)

job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)

Job ID: ftjob-1ieXGa9FmANp05qqVpmuIFbk
Status: validating_files


In [28]:
job_response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID:", job_response.id)
print("Status:", job_response.status)
print("Trained Tokens:", job_response.trained_tokens)

Job ID: ftjob-1ieXGa9FmANp05qqVpmuIFbk
Status: succeeded
Trained Tokens: 488907


In [27]:
# Check fine tune progress
finetune_progress_response = client.fine_tuning.jobs.list_events(job_id)

events = finetune_progress_response.data
events.reverse()

for event in events:
    print(event.message)

Step 393/408: training loss=1.66
Step 394/408: training loss=2.22
Step 395/408: training loss=1.44
Step 396/408: training loss=1.75
Step 397/408: training loss=2.44
Step 398/408: training loss=1.73
Step 399/408: training loss=1.38
Step 400/408: training loss=1.62
Step 401/408: training loss=1.75
Step 402/408: training loss=1.37
Step 403/408: training loss=1.28
Step 404/408: training loss=0.83
Step 405/408: training loss=1.60
Step 406/408: training loss=1.44
Step 407/408: training loss=1.52
Step 408/408: training loss=1.60
Checkpoint created at step 136 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:tech-article:9IWC5Dgm:ckpt-step-136
Checkpoint created at step 272 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:tech-article:9IWC6TaK:ckpt-step-272
New fine-tuned model created: ft:gpt-3.5-turbo-0125:personal:tech-article:9IWC6JMu
The job has successfully completed


In [29]:
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

if fine_tuned_model_id is None: 
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

print("Fine-tuned model ID:", fine_tuned_model_id)

Fine-tuned model ID: ft:gpt-3.5-turbo-0125:personal:tech-article:9IWC6JMu


In [8]:
from openai import OpenAI
client = OpenAI()

fine_tuned_model_id = "ft:gpt-3.5-turbo-0125:personal:tech-article:9IWC6JMu"
instructions = """
1. Add the proper introduction of the main topic
2. Define all primary terminologies
3. Add the step-by-step process for better understanding [if required]
4. Add the proper diagrams and screenshots wherever it's required. Make sure examples/screenshots/theory should not be copied from any other resources as they may cause article rejection. Using Images from any external source is not allowed 
5. Explain the concept with proper examples.
6. Add at least 4 to 5 FAQs with proper answers. 
"""
context = """
Amazon S3 access points simplify data access for any AWS service or customer application that
stores data in S3. Access points are named network endpoints that are attached to buckets that
you can use to perform S3 object operations, such as GetObject and PutObject. Each access
point has distinct permissions and network controls that S3 applies for any request that is made
through that access point. Each access point enforces a customized access point policy that works
in conjunction with the bucket policy that is attached to the underlying bucket. You can conﬁgure
any access point to accept requests only from a virtual private cloud (VPC) to restrict Amazon S3
data access to a private network. You can also conﬁgure custom block public access settings for
each access point.

To create an access point
1.Sign in to the AWS Management Console and open the Amazon S3 console at https://
console.aws.amazon.com/s3/.
2.In the navigation bar on the top of the page, choose the name of the currently displayed AWS
Region. Next, choose the Region in which you want to create an access point.
3.In the left navigation pane, choose Access Points.
4.On the Access Points page, choose Create access point.
5.In the Access point name ﬁeld, enter the name for the access point. For more information
about naming access points, see Rules for naming Amazon S3 access points.
6.For Bucket name, specify the S3 bucket that you want to use with the access point.
To use a bucket in your account, choose Choose a bucket in this account, and enter or browse
for the bucket name.
To use a bucket in a diﬀerent AWS account, choose Specify a bucket in another account, and
enter the AWS account ID and name of the bucket.
7.Choose a Network origin. If you choose Virtual private cloud (VPC), enter the VPC ID that
you want to use with the access point.
For more information about network origins for access points, see Creating access points
restricted to a virtual private cloud.
8.Under Block Public Access settings for this Access Point, select the block public access
settings that you want to apply to the access point. All block public access settings are enabled
by default for new access points. We recommend that you keep all settings enabled unless you
know that you have a speciﬁc need to disable any of them.
For more information about using Amazon S3 Block Public Access with access points, see
Managing public access to access points.
9. (Optional) Under Access Point policy - optional, specify the access point policy. Before you
save your policy, make sure to resolve any security warnings, errors, general warnings, and
suggestions. For more information about specifying an access point policy, see Access point
policy examples.
10. Choose Create access point.
"""

article_title = "AWS S3 access point"
test_messages = [
    {
        "role": "system",
        "content": "You are a technical content writer. Your job is to write technical articles based on the title provided.",
    },
    {"role": "user", "content": f"TITLE: {article_title}\n\nINSTRUCTIONS: {instructions}\n\nCONTEXT: {context} \n\nARTICLE: "},
]

gpt_model = "gpt-3.5-turbo"
response = client.chat.completions.create(
    model=fine_tuned_model_id,messages=test_messages
)


In [10]:
from pprint import pprint
article = response.choices[0].message.content
print(article)
# len()

Amazon Simple Storage Service (Amazon S3) is one of the most popular services provided by Amazon AWS. It makes it easy for organizations to store large amounts of data or objects such as pictures, websites, or database backups. In large organizations or even for individual users, based on the nature of the data objects stored in S3 and the corresponding business rules, the access control that needs to be implemented on the objects can be very flexible.
AWS S3 access points simplify data access for any AWS service or customer application that stores data in S3.. Access points are named network endpoints that you can use to perform S3 object operations such as GetObject and PutObject. Each access point has unique permissions and network controls that S3 enforces each time you make a request through the access point. 
Use Case of S3 Access Points
Suppose you work for a global company working in various geographic regions. You can use S3 storage to store data in regional buckets. Let the s

In [45]:
print(result.loc[3].clean_text)

In a recent study by Verizon, 63% of the confirmed data breaches are due to either weak, stolen, or default passwords used. There is a saying in the cybersecurity world that goes like this “No matter how good your chain is it’s only as strong as your weakest link.” and exactly hackers use the weakest links in the organization to infiltrate. They usually use phishing attacks to infiltrate an organization and if they get at least one person to fall for it, it’s a serious turn of events from thereon. They use the stolen credentials to plant back doors, install malware or exfiltrate confidential data, all of which will cause serious losses for an organization.
How Identity and Access Management Works?
AWS(Amazon Web Services) will allows you to maintain the fine-grained permissions  to the AWS account and the services provided Amazon cloud. You can  manage the permissions to the individual users or you can manage the permissions to certain users as group and roles will helps you to manage 