In [25]:
# !pip install pandas
# %pip install -qU langchain_community beautifulsoup4
# %pip install tiktoken
# %pip install sentence-transformers
# %pip install langchain-experimental
#%pip install pinecone

In [26]:
import os
import pandas as pd 
import ast
from langchain_community.document_loaders import WebBaseLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec


In [27]:
# Pull variables from a local .env file into the process environment, then
# build the Pinecone client from the PINECONE_API_KEY variable.
load_dotenv()
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
)

## Load Sample Data

In [28]:
# Read the sample article metadata from the Resources folder and preview the
# first five rows (the default for DataFrame.head).
sample_data_df = pd.read_csv(
    filepath_or_buffer="../Resources/sample_data - itg_sports.csv.csv",
)
sample_data_df.head()

Unnamed: 0,pageURL,title,publishedDate,author,tags
0,https://www.indiatoday.in/sports/cricket/story...,"IPL 2025: Ruthless MI top table, knock Rajasth...",2025-05-01T23:12:07.000000Z,Jane Doe,"[""#IPL2025"", ""#MumbaiIndians"", ""#RajasthanRoya..."
1,https://www.indiatoday.in/sports/cricket/story...,Shubman Gill likely to play vs SRH despite bac...,2025-05-01T22:30:24.000000Z,Jane Doe,"[""#ShubmanGill"", ""#GujaratTitans"", ""#IPLInjur..."
2,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Vaibhav Suryavanshi out for 2-ball d...,2025-05-01T21:53:11.000000Z,Jane Doe,"[""#VaibhavSuryavanshi"", ""#IPL2025"", ""#CricketF..."
3,https://www.indiatoday.in/sports/cricket/story...,"IPL: Rohit Sharma completes 6,000 runs for MI,...",2025-05-01T21:13:15.000000Z,Jane Doe,"[""#RohitSharma"", ""#MumbaiIndians"", ""#IPLRecords""]"
4,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Don't praise Vaibhav Suryavanshi to ...,2025-05-01T20:36:03.000000Z,Jane Doe,"[""#VaibhavSuryavanshi"", ""#SunilGavaskar"", ""#IP..."


## Clean Sample Data

In [29]:
# Derive calendar parts from the publication timestamp and normalise the tags.


def _clean_tags(raw_tags):
    """Return the tags of one row as a list of normalised strings.

    Parameters
    ----------
    raw_tags : str or list
        Either the string form of a Python list as stored in the CSV
        (e.g. '["#IPL2025", "#MumbaiIndians"]') or an already-parsed list.
        Accepting a list keeps this cell safe to re-run after the column
        has been cleaned once.

    Returns
    -------
    list of str
        Each tag lower-cased, with surrounding whitespace, leading '#'
        characters and inner spaces removed.
    """
    parsed = ast.literal_eval(raw_tags.strip()) if isinstance(raw_tags, str) else raw_tags
    # Trim whitespace BEFORE removing '#', otherwise a tag such as " #IPL"
    # would keep its hash mark.
    return [tag.strip().lstrip("#").replace(" ", "").lower().strip() for tag in parsed]


sample_data_df['publishedDate'] = pd.to_datetime(sample_data_df['publishedDate'])
sample_data_df['publish_year'] = sample_data_df['publishedDate'].dt.year
sample_data_df['publish_month'] = sample_data_df['publishedDate'].dt.month
# NOTE: despite its name, publish_date holds the day of the month.
sample_data_df['publish_date'] = sample_data_df['publishedDate'].dt.day

# Clean tags
sample_data_df['tags'] = sample_data_df['tags'].apply(_clean_tags)
sample_data_df.head()

#Export unique tags



Unnamed: 0,pageURL,title,publishedDate,author,tags,publish_year,publish_month,publish_date
0,https://www.indiatoday.in/sports/cricket/story...,"IPL 2025: Ruthless MI top table, knock Rajasth...",2025-05-01 23:12:07+00:00,Jane Doe,"[ipl2025, mumbaiindians, rajasthanroyals]",2025,5,1
1,https://www.indiatoday.in/sports/cricket/story...,Shubman Gill likely to play vs SRH despite bac...,2025-05-01 22:30:24+00:00,Jane Doe,"[shubmangill, gujarattitans, iplinjuries]",2025,5,1
2,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Vaibhav Suryavanshi out for 2-ball d...,2025-05-01 21:53:11+00:00,Jane Doe,"[vaibhavsuryavanshi, ipl2025, cricketform]",2025,5,1
3,https://www.indiatoday.in/sports/cricket/story...,"IPL: Rohit Sharma completes 6,000 runs for MI,...",2025-05-01 21:13:15+00:00,Jane Doe,"[rohitsharma, mumbaiindians, iplrecords]",2025,5,1
4,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Don't praise Vaibhav Suryavanshi to ...,2025-05-01 20:36:03+00:00,Jane Doe,"[vaibhavsuryavanshi, sunilgavaskar, ipl2025]",2025,5,1


## Load Data From URL

In [30]:
# Fetch every distinct article URL once, then attach the scraped text to the
# rows by URL. Assigning the loader output positionally would raise a
# length-mismatch error as soon as the frame contains a duplicated pageURL.
sample_urls = sample_data_df['pageURL'].unique().tolist()
loader = WebBaseLoader(sample_urls)
loaded_docs = loader.load()

# Guard the URL/document pairing below: it is only valid when the loader
# returned exactly one document per requested URL.
if len(loaded_docs) != len(sample_urls):
    raise ValueError(
        f"Expected {len(sample_urls)} documents from the loader, got {len(loaded_docs)}"
    )

url_to_content = {
    url: doc.page_content for url, doc in zip(sample_urls, loaded_docs)
}
sample_data_df['pageContent'] = sample_data_df['pageURL'].map(url_to_content)
sample_data_df.head()

Unnamed: 0,pageURL,title,publishedDate,author,tags,publish_year,publish_month,publish_date,pageContent
0,https://www.indiatoday.in/sports/cricket/story...,"IPL 2025: Ruthless MI top table, knock Rajasth...",2025-05-01 23:12:07+00:00,Jane Doe,"[ipl2025, mumbaiindians, rajasthanroyals]",2025,5,1,"IPL 2025: Ruthless MI top table, knock Rajasth..."
1,https://www.indiatoday.in/sports/cricket/story...,Shubman Gill likely to play vs SRH despite bac...,2025-05-01 22:30:24+00:00,Jane Doe,"[shubmangill, gujarattitans, iplinjuries]",2025,5,1,Shubman Gill likely to play vs SRH despite bac...
2,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Vaibhav Suryavanshi out for 2-ball d...,2025-05-01 21:53:11+00:00,Jane Doe,"[vaibhavsuryavanshi, ipl2025, cricketform]",2025,5,1,IPL 2025 RR vs MI: Vaibhav Suryavanshi out for...
3,https://www.indiatoday.in/sports/cricket/story...,"IPL: Rohit Sharma completes 6,000 runs for MI,...",2025-05-01 21:13:15+00:00,Jane Doe,"[rohitsharma, mumbaiindians, iplrecords]",2025,5,1,"IPL: Rohit Sharma completes 6,000 runs for MI,..."
4,https://www.indiatoday.in/sports/cricket/story...,IPL 2025: Don't praise Vaibhav Suryavanshi to ...,2025-05-01 20:36:03+00:00,Jane Doe,"[vaibhavsuryavanshi, sunilgavaskar, ipl2025]",2025,5,1,IPL 2025: Don't praise Vaibhav Suryavanshi to ...


## Chunk Data and Prepare upsert_records for pinecone

In [31]:
# Semantic chunking setup: a HuggingFace embedding model feeds a
# SemanticChunker that uses the "gradient" breakpoint threshold type.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
)
chunker = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="gradient",
)

In [32]:

# Build one record per semantic chunk of every article. Each record carries
# the chunk text plus the article-level metadata (tags, publication date
# parts, author, URL) and an id of the form "rec<row index>_chunk<n>".
recordes = []
for i, row in sample_data_df.iterrows():
    # Metadata shared by every chunk of this article; the date parts are cast
    # to plain int so the records hold built-in types rather than numpy ones.
    article_metadata = {
        'tags': row['tags'],
        'published_year': int(row['publish_year']),
        'published_month': int(row['publish_month']),
        'published_day': int(row['publish_date']),
        'author': row['author'],
        'page_url': row['pageURL'].strip(),
    }

    # Replace non-breaking spaces with regular spaces; deleting them outright
    # would fuse the words they separate.
    document_content = row['pageContent'].replace('\xa0', ' ')

    for chunk_num, chunk in enumerate(chunker.split_text(document_content)):
        recordes.append({
            '_id': f"rec{i}_chunk{chunk_num}",
            'text': chunk,
            **article_metadata,
        })

# Preview only the first records; displaying the full list floods the output
# with scraped page text.
recordes[:2]

[{'_id': 'rec0_chunk0',
  'text': "IPL 2025: Ruthless MI top table, knock Rajasthan Royals out of playoffs race - India Today India TodayAaj TakGNTTVLallantopBusiness TodayBanglaMalayalamNortheastBT BazaarHarper's BazaarSports TakCrime TakAstro TakGamingBrides TodayCosmopolitanKisan TakIshq FMIndia Today HindiReader’s DigestIndia TodayAaj TakGNTTVLallantopBusiness TodayBanglaMalayalamNortheastBT BazaarHarper's BazaarSports TakMagazineLive TVSearchSEARCHSIGN INEdition ININUSHome TVLive TVPrimetimeMagazineLatest EditionInsightBest CollegesLife+StyleIndiaSouthGlobalAll World NewsUS NewsCanada NewsUK NewsChina NewsIndians AbroadBusinessAll SportsIPL 2025TennisCricketFootballSports TodayTechnologyShowbuzzEntertainmentBollywoodHollywoodTelevisionOTTLatest ReviewsNewspresso SpecialsPodcastsFirst Things FastSunday SpecialHistory of ItNewsMoDIUInteractivesOpinionGamesVideosShort VideosFact Check Other NewsEducationIt's ViralScienceHealthAutoLaw TodayEnvironmentCitiesWeatherWeb StoriesHoroscopes

## Upsert Data into Pinecone index

In [None]:
# Create the integrated-embedding index if it does not exist yet, then upsert
# the prepared records into it.
index_name = "bridged-demo"
namespace = "sample-data"

# Pinecone limits text upserts on integrated-embedding indexes to 96 records
# per request, so the records are sent in batches of at most this size.
UPSERT_BATCH_SIZE = 96

if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model": "llama-text-embed-v2",
            # Maps the model's "text" input to the "text" field of each record.
            "field_map": {"text": "text"}
        }
    )

desc = pc.describe_index(index_name)
host_url = desc['host']  # or desc.host depending on the SDK version
index = pc.Index(host=host_url)

# NOTE: this writes to the live index. Comment out the loop below to run the
# cell without upserting.
for batch_start in range(0, len(recordes), UPSERT_BATCH_SIZE):
    index.upsert_records(
        namespace,
        recordes[batch_start:batch_start + UPSERT_BATCH_SIZE],
    )

