In [1]:
from langchain.document_loaders import TextLoader

In [3]:
loader = TextLoader("nvda_news_1.txt")
data = loader.load()

In [4]:
data[0].page_content



In [5]:
data[0].metadata

{'source': 'nvda_news_1.txt'}

In [6]:
from langchain.document_loaders import CSVLoader

In [7]:
loader = CSVLoader("movies.csv")
data = loader.load()
len(data)

9

In [8]:
data[0]

Document(page_content='movie_id: 101\ntitle: K.G.F: Chapter 2\nindustry: Bollywood\nrelease_year: 2022\nimdb_rating: 8.4\nstudio: Hombale Films\nlanguage_id: 3\nbudget: 1\nrevenue: 12.5\nunit: Billions\ncurrency: INR', metadata={'source': 'movies.csv', 'row': 0})

In [9]:
type(data[0])

langchain.schema.document.Document

In [10]:
data[0].page_content

'movie_id: 101\ntitle: K.G.F: Chapter 2\nindustry: Bollywood\nrelease_year: 2022\nimdb_rating: 8.4\nstudio: Hombale Films\nlanguage_id: 3\nbudget: 1\nrevenue: 12.5\nunit: Billions\ncurrency: INR'

In [11]:
data[0].metadata

{'source': 'movies.csv', 'row': 0}

In [12]:
# Specify the source column

loader = CSVLoader("movies.csv", source_column="title")
data = loader.load()
len(data)

9

In [13]:
data[0].metadata

{'source': 'K.G.F: Chapter 2', 'row': 0}

In [14]:
data

[Document(page_content='movie_id: 101\ntitle: K.G.F: Chapter 2\nindustry: Bollywood\nrelease_year: 2022\nimdb_rating: 8.4\nstudio: Hombale Films\nlanguage_id: 3\nbudget: 1\nrevenue: 12.5\nunit: Billions\ncurrency: INR', metadata={'source': 'K.G.F: Chapter 2', 'row': 0}),
 Document(page_content='movie_id: 102\ntitle: Doctor Strange in the Multiverse of Madness\nindustry: Hollywood\nrelease_year: 2022\nimdb_rating: 7\nstudio: Marvel Studios\nlanguage_id: 5\nbudget: 200\nrevenue: 954.8\nunit: Millions\ncurrency: USD', metadata={'source': 'Doctor Strange in the Multiverse of Madness', 'row': 1}),
 Document(page_content='movie_id: 103\ntitle: Thor: The Dark World\nindustry: Hollywood\nrelease_year: 2013\nimdb_rating: 6.8\nstudio: Marvel Studios\nlanguage_id: 5\nbudget: 165\nrevenue: 644.8\nunit: Millions\ncurrency: USD', metadata={'source': 'Thor: The Dark World', 'row': 2}),
 Document(page_content='movie_id: 104\ntitle: Thor: Ragnarok\nindustry: Hollywood\nrelease_year: 2017\nimdb_rating: 

### Unstructured URL Loader
#### Take a news article and load the text content from the URL

In [22]:
from langchain.document_loaders import UnstructuredURLLoader

In [23]:
loader = UnstructuredURLLoader(
    urls=[
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html"
])

In [24]:
data = loader.load()
len(data)

libmagic is unavailable but assists in filetype detection on file-like objects. Please consider installing libmagic for better results.
Error fetching or processing https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html, exception: Invalid file. The FileType.UNK file type is not supported in partition.
libmagic is unavailable but assists in filetype detection on file-like objects. Please consider installing libmagic for better results.
Error fetching or processing https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html, exception: Invalid file. The FileType.UNK file type is not supported in partition.


0

In [4]:
data[0].metadata

IndexError: list index out of range

### Test Splitters
#### Reduce text to smaller chunks
LLM's have token limits. Hence we need to split the text which can be large into small chunks so that each chunk size is under the token limit. There are various text splitter classes in langchain that allows us to do this.

In [13]:
sample_text = "Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. \nIt stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. \nSet in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.\n\nBrothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg. \nKip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. \nCinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. \nInterstellar uses extensive practical and miniature effects, and the company Double Negative created additional digital effects.\n\nInterstellar premiered in Los Angeles on October 26, 2014. In the United States, it was first released on film stock, expanding to venues using digital projectors. The film received generally positive reviews from critics and grossed over $677 million worldwide ($715 million after subsequent re-releases), making it the tenth-highest-grossing film of 2014. \nIt has been praised by astronomers for its scientific accuracy and portrayal of theoretical astrophysics.[5][6][7] Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades."

In [9]:
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=200,
    chunk_overlap=0
)

In [14]:
chunks = splitter.split_text(sample_text)
len(chunks)

Created a chunk of size 210, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 358, which is longer than the specified 200


9

In [15]:
chunks

['Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.',
 'It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.',
 'Set in a dystopian future where humanity is embroiled in a catastrophic blight and famine, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for humankind.',
 'Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007 and was originally set to be directed by Steven Spielberg.',
 'Kip Thorne, a Caltech theoretical physicist and 2017 Nobel laureate in Physics,[4] was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.',
 'Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. Principal photography began in late 2013 and t

In [16]:
for chunk in chunks:
    print(len(chunk))

105
120
210
181
197
207
128
357
253


### Recursive Text Splitter

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

r_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", "\n", " "],
    chunk_size=200,
    chunk_overlap=0
)

In [19]:
chunks = r_splitter.split_text(sample_text)
len(chunks)

13

In [21]:
for chunk in chunks:
    print(len(chunk))

105
120
199
10
181
197
198
8
128
191
165
198
54
