In [1]:
import os
import glob
from tqdm import tqdm
from langchain_community.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader
import multiprocessing
from tqdm import tqdm
import pickle
import os
from bs4 import BeautifulSoup

In [2]:
def load_html_files(base_path, include_folders):
    all_files = []
    for folder in include_folders:
        folder_path = os.path.join(base_path, folder)
        for root, dirs, files in os.walk(folder_path):
            # Exclude directories starting with an underscore
            dirs[:] = [d for d in dirs if not d.startswith('_')]
            for file in files:
                if (file.endswith('.html')) and file != 'index.html':
                    all_files.append(os.path.join(root, file))
    return all_files

## Pandas

In [3]:
# extracting relevant file paths

In [4]:
pandas_file_path = "../documents/pandas_docs/"
pandas_include_folders = ['development', 'getting_started', 'reference/api', 'user_guide', 'whatsnew']
pandas_html_files = load_html_files(pandas_file_path, pandas_include_folders)

In [5]:
len(pandas_html_files)

2366

In [6]:
# c = 0
# for i in range(len(pandas_html_files)):
#     if 'user_guide/' in pandas_html_files[i]:
#         c+=1
#         print(pandas_html_files[i])

In [7]:
# Extracting content from html files

In [96]:
# def process_html_file(html_file):
#   loader = UnstructuredHTMLLoader(html_file)
#   return loader.load()

In [104]:
def process_html_file(html_file):
    loader = BSHTMLLoader(html_file, open_encoding="utf-8")
    data = loader.load()
    
    for doc in data:
        if 'soup' in doc.metadata:
            soup = doc.metadata['soup']
        else:
            with open(html_file, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')
        
        target_main = soup.find('div', class_='bd-article-container')
        if target_main:
            # Remove 'admonition seealso' div, 'prev-next-footer' footer, and 'header-article-items header-article__inner' div
            for element in target_main.find_all(['div', 'footer'], class_=['admonition seealso', 'prev-next-footer', 'header-article-items header-article__inner']):
                element.decompose()

            # Extract and format the text content
            formatted_text = []
            seen_content = set()  # To keep track of unique content
            for element in target_main.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code', 'dt']):
                text = element.get_text(strip=True)
                if text and text not in seen_content:
                    if element.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'dt']:
                        formatted_text.append(f'\n{text}\n')
                    elif element.name == 'pre':
                        formatted_text.append(f'\n{element.get_text()}\n')
                    elif element.name == 'code':
                        formatted_text.append(text)
                    seen_content.add(text)
            
            doc.page_content = ''.join(formatted_text).strip()
        else:
            doc.page_content = "No content found in the specified main tag."
    
    return data

def parallel_load(html_files):
  with multiprocessing.Pool() as pool:
    results = list(tqdm(pool.imap(process_html_file, html_files), total=len(html_files), desc="Loading files"))
    return [item for sublist in results for item in sublist]  # Flatten the results

In [105]:
pandas_data = parallel_load(pandas_html_files)

Loading files: 100%|██████████| 2366/2366 [00:07<00:00, 329.15it/s]


In [106]:
len(pandas_data)

2366

In [107]:
# modifying URL source

In [108]:
# Define the old and new base URLs
pandas_old_base_url = "../documents/pandas_docs"
pandas_new_base_url = "https://pandas.pydata.org/docs"

In [109]:
# Iterate over each document and update the source in the metadata
for doc in pandas_data:
    if 'source' in doc.metadata:
        doc.metadata['source'] = doc.metadata['source'].replace(pandas_old_base_url, pandas_new_base_url)

# Now, pandas_docs contains updated sources

In [135]:
print(pandas_data[265])

page_content='pandas.DataFrame.to_timestamp#

DataFrame.to_timestamp(freq=None,how='start',axis=0,copy=None)[source]#

Cast to DatetimeIndex of timestamps, atbeginningof period.

Parameters:

freqstr, default frequency of PeriodIndex

Desired frequency.

how{‘s’, ‘e’, ‘start’, ‘end’}

Convention for converting period to timestamp; start of period
vs. end.

axis{0 or ‘index’, 1 or ‘columns’}, default 0

The axis to convert (the index by default).

copybool, default True

If False then underlying input data is not copied.

Note

Thecopykeyword will change behavior in pandas 3.0.Copy-on-Writewill be enabled by default, which means that all methods with acopykeyword will use a lazy copy mechanism to defer the copy and
ignore thecopykeyword. Thecopykeyword will be removed in a
future version of pandas.

You can already get the future behavior and improvements through
enabling copy on writepd.options.mode.copy_on_write=True
pd.options.mode.copy_on_write=True
Returns:

DataFrame

The DataFram

In [136]:
print(pandas_data[1659])

page_content='pandas.tseries.offsets.CustomBusinessDay.is_month_start#

CustomBusinessDay.is_month_start(ts)#

Return boolean whether a timestamp occurs on the month start.

Examples

>>> ts = pd.Timestamp(2022, 1, 1)
>>> freq = pd.offsets.Hour(5)
>>> freq.is_month_start(ts)
True' metadata={'source': 'https://pandas.pydata.org/docs/reference/api/pandas.tseries.offsets.CustomBusinessDay.is_month_start.html', 'title': 'pandas.tseries.offsets.CustomBusinessDay.is_month_start — pandas 2.2.2 documentation'}


In [129]:
docs_with_no_content = []
for i in range(len(pandas_data)):
    if pandas_data[i].page_content == 'No content found in the specified main tag.':
        docs_with_no_content.append(i)
        
print(docs_with_no_content)
len(docs_with_no_content)

[13, 105, 126, 172, 244, 414, 435, 436, 634, 754, 884, 906, 908, 1149, 1162, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1321, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1467, 1468, 1510, 2264]


80

In [130]:
print(pandas_data[1361])

page_content='No content found in the specified main tag.' metadata={'source': 'https://pandas.pydata.org/docs/reference/api/pandas.core.window.Rolling.std.html', 'title': ''}


In [131]:
# Remove the documents with no content
# If pandas_data is a list:
pandas_data = [doc for i, doc in enumerate(pandas_data) if i not in docs_with_no_content]

In [132]:
len(pandas_data)

2286

In [33]:
# saving documents

In [137]:
# Define the path
path = '../documents/processed_docs/pandas_docs.pkl'

# Ensure the directory exists
os.makedirs(os.path.dirname(path), exist_ok=True)

# Save data to a file
with open(path, 'wb') as file:
    pickle.dump(pandas_data, file)

## scikit-learn

In [3]:
import re

def process_html_file(html_file):
    loader = BSHTMLLoader(html_file, open_encoding="utf-8")
    data = loader.load()
    
    for doc in data:
        if 'soup' in doc.metadata:
            soup = doc.metadata['soup']
        else:
            with open(html_file, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')
        
        target_main = soup.find('div', class_='bd-article-container')
        if target_main:
            # Remove 'admonition seealso' div, 'prev-next-footer' footer, and 'header-article-items header-article__inner' div
            for element in target_main.find_all(['div', 'footer', 'a', 'p'], 
                                                class_=['admonition seealso','footer-article-item',
                                                        'prev-next-footer',
                                                        'header-article-items header-article__inner',
                                                       'sphx-glr-timing']):
                element.decompose()

            # Extract and format the text content
            formatted_text = []
            seen_content = set()  # To keep track of unique content
            for element in target_main.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code', 'dt']):
                text = element.get_text(strip=True)
                if text and text not in seen_content:
                    if element.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'dt']:
                        formatted_text.append(f'\n{text}\n')
                    elif element.name == 'pre':
                        formatted_text.append(f'\n{element.get_text()}\n')
                    elif element.name == 'code':
                        formatted_text.append(text)
                    seen_content.add(text)
            
            doc.page_content = ''.join(formatted_text).strip()
        else:
            doc.page_content = "No content found in the specified main tag."
    
    return data

def parallel_load(html_files):
  with multiprocessing.Pool() as pool:
    results = list(tqdm(pool.imap(process_html_file, html_files), total=len(html_files), desc="Loading files"))
    return [item for sublist in results for item in sublist]  # Flatten the results

In [4]:
scikit_learn_file_path = "../documents/scikit_learn_docs/"
scikit_learn_include_folders = ['auto_examples', 'computing', 
                               'datasets', 'modules', 'developers', 'whats_new', 'notebooks']
scikit_learn_html_files = load_html_files(scikit_learn_file_path, scikit_learn_include_folders)

In [5]:
len(scikit_learn_html_files)

948

In [6]:
# c = 0
# for i in range(len(scikit_learn_html_files)):
#     if 'index' in scikit_learn_html_files[i]:
#         c+=1
#         print(scikit_learn_html_files[i])

In [7]:
scikit_learn_data = parallel_load(scikit_learn_html_files)

Loading files: 100%|██████████| 948/948 [00:04<00:00, 227.93it/s]


In [8]:
len(scikit_learn_data)

948

In [10]:
# Define the old and new base URLs
scikit_learn_old_base_url = "../documents/scikit_learn_docs"
scikit_learn_new_base_url = "https://scikit-learn.org/stable"

In [11]:
# Iterate over each document and update the source in the metadata
for doc in scikit_learn_data:
    if 'source' in doc.metadata:
        doc.metadata['source'] = doc.metadata['source'].replace(scikit_learn_old_base_url, scikit_learn_new_base_url)

# Now, pandas_docs contains updated sources

In [319]:
print(scikit_learn_data[350])

page_content='6.6.Random Projection#

Thesklearn.random_projectionmodule implements a simple and
computationally efficient way to reduce the dimensionality of the data by
trading a controlled amount of accuracy (as additional variance) for faster
processing times and smaller model sizes. This module implements two types of
unstructured random matrix:Gaussian random matrixandsparse random matrix.
sklearn.random_projection
The dimensions and distribution of random projections matrices are
controlled so as to preserve the pairwise distances between any two
samples of the dataset. Thus random projection is a suitable approximation
technique for distance based method.

References

Sanjoy Dasgupta. 2000.Experiments with random projection.In Proceedings of the Sixteenth conference on Uncertainty in artificial
intelligence (UAI’00), Craig Boutilier and Moisés Goldszmidt (Eds.). Morgan
Kaufmann Publishers Inc., San Francisco, CA, USA, 143-151.

Ella Bingham and Heikki Mannila. 2001.Random proje

In [320]:
print(scikit_learn_data[900])

page_content='safe_mask#

sklearn.utils.safe_mask(X,mask)[source]#

Return a mask which is safe to use on X.

Parameters:

X{array-like, sparse matrix}

Data on which to apply mask.

maskarray-like

Mask to be used on X.

Returns:

maskndarray

Array that is safe to use on X.

Examples

>>> from sklearn.utils import safe_mask
>>> from scipy.sparse import csr_matrix
>>> data = csr_matrix([[1], [2], [3], [4], [5]])
>>> condition = [False, True, True, False, True]
>>> mask = safe_mask(data, condition)
>>> data[mask].toarray()
array([[2],
       [3],
       [5]])' metadata={'source': 'https://scikit-learn.org/stable/modules/generated/sklearn.utils.safe_mask.html', 'title': 'safe_mask — scikit-learn 1.5.1 documentation'}


In [321]:
print(scikit_learn_data[900])

page_content='safe_mask#

sklearn.utils.safe_mask(X,mask)[source]#

Return a mask which is safe to use on X.

Parameters:

X{array-like, sparse matrix}

Data on which to apply mask.

maskarray-like

Mask to be used on X.

Returns:

maskndarray

Array that is safe to use on X.

Examples

>>> from sklearn.utils import safe_mask
>>> from scipy.sparse import csr_matrix
>>> data = csr_matrix([[1], [2], [3], [4], [5]])
>>> condition = [False, True, True, False, True]
>>> mask = safe_mask(data, condition)
>>> data[mask].toarray()
array([[2],
       [3],
       [5]])' metadata={'source': 'https://scikit-learn.org/stable/modules/generated/sklearn.utils.safe_mask.html', 'title': 'safe_mask — scikit-learn 1.5.1 documentation'}


In [19]:
docs_with_no_content = []
for i in range(len(scikit_learn_data)):
    if scikit_learn_data[i].page_content == 'No content found in the specified main tag.'  or \
       'This document has been moved' in scikit_learn_data[i].page_content or \
        len(scikit_learn_data[i].page_content) < 200:
        docs_with_no_content.append(i)
        
print(docs_with_no_content)
len(docs_with_no_content)

[74, 81, 88, 115, 121, 144, 191, 213, 290, 313, 338, 347, 507, 514, 839, 896, 928]


17

In [25]:
docs_with_no_content = []
for i in range(len(scikit_learn_data)):
    if 'Normalizes' in scikit_learn_data[i].page_content:
        docs_with_no_content.append(i)
        
print(docs_with_no_content)
len(docs_with_no_content)

[654]


1

In [26]:
scikit_learn_data[654]

Document(metadata={'source': 'https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html', 'title': 'confusion_matrix — scikit-learn 1.5.1 documentation'}, page_content='confusion_matrix#\n\nsklearn.metrics.confusion_matrix(y_true,y_pred,*,labels=None,sample_weight=None,normalize=None)[source]#\n\nCompute confusion matrix to evaluate the accuracy of a classification.\n\nBy definition a confusion matrix\\(C\\)is such that\\(C_{i, j}\\)is equal to the number of observations known to be in group\\(i\\)and\npredicted to be in group\\(j\\).\n\nThus in binary classification, the count of true negatives is\\(C_{0,0}\\), false negatives is\\(C_{1,0}\\), true positives is\\(C_{1,1}\\)and false positives is\\(C_{0,1}\\).\n\nRead more in theUser Guide.\n\nParameters:\n\ny_truearray-like of shape (n_samples,)\n\nGround truth (correct) target values.\n\ny_predarray-like of shape (n_samples,)\n\nEstimated targets as returned by a classifier.\n\nlabelsarray-like of shape 

In [324]:
# Remove the documents with no content
# If pandas_data is a list:
scikit_learn_data = [doc for i, doc in enumerate(scikit_learn_data) if i not in docs_with_no_content]

In [325]:
len(scikit_learn_data)

937

In [326]:
# Define the path
path = '../documents/processed_docs/scikit_learn_docs.pkl'

# Ensure the directory exists
os.makedirs(os.path.dirname(path), exist_ok=True)

# Save data to a file
with open(path, 'wb') as file:
    pickle.dump(scikit_learn_data, file)

## numpy

In [3]:
import re

def process_html_file(html_file):
    loader = BSHTMLLoader(html_file, open_encoding="utf-8")
    data = loader.load()
    
    for doc in data:
        if 'soup' in doc.metadata:
            soup = doc.metadata['soup']
        else:
            with open(html_file, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')
        
        target_main = soup.find('div', class_='bd-article-container')
        if target_main:
            # Remove 'admonition seealso' div, 'prev-next-footer' footer, and 'header-article-items header-article__inner' div
            for element in target_main.find_all(['div', 'footer', 'a', 'p'], 
                                                class_=['admonition seealso','footer-article-item',
                                                        'prev-next-footer',
                                                        'header-article-items header-article__inner',
                                                       'sphx-glr-timing']):
                element.decompose()

            # Extract and format the text content
            formatted_text = []
            seen_content = set()  # To keep track of unique content
            for element in target_main.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code', 'dt']):
                text = element.get_text(strip=True)
                if text and text not in seen_content:
                    if element.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'dt']:
                        formatted_text.append(f'\n{text}\n')
                    elif element.name == 'pre':
                        formatted_text.append(f'\n{element.get_text()}\n')
                    elif element.name == 'code':
                        formatted_text.append(text)
                    seen_content.add(text)
            
            doc.page_content = ''.join(formatted_text).strip()
        else:
            doc.page_content = "No content found in the specified main tag."
    
    return data

def parallel_load(html_files):
  with multiprocessing.Pool() as pool:
    results = list(tqdm(pool.imap(process_html_file, html_files), total=len(html_files), desc="Loading files"))
    return [item for sublist in results for item in sublist]  # Flatten the results

In [8]:
numpy_file_path = "../documents/numpy_docs/"
numpy_include_folders = ['user', 'reference', 
                               'building', 'dev', 'f2py', 'release']b
numpy_html_files = load_html_files(numpy_file_path, numpy_include_folders)
len(numpy_html_files)

2699

In [10]:
# c = 0
# for i in range(len(numpy_html_files)):
#     if 'index' in numpy_html_files[i]:
#         c+=1
#         print(numpy_html_files[i])

In [11]:
numpy_data = parallel_load(numpy_html_files)

Loading files: 100%|██████████| 2699/2699 [00:04<00:00, 659.01it/s]


In [12]:
len(numpy_html_files)

2699

In [13]:
# Define the old and new base URLs
numpy_old_base_url = "../documents/numpy_docs"
numpy_new_base_url = "https://numpy.org/doc/stable"

In [14]:
# Iterate over each document and update the source in the metadata
for doc in numpy_data:
    if 'source' in doc.metadata:
        doc.metadata['source'] = doc.metadata['source'].replace(numpy_old_base_url, numpy_new_base_url)

In [26]:
print(numpy_data[50])

page_content='NumPy C code explanations#

This document has been moved toNumPy C code explanations.' metadata={'source': 'https://numpy.org/doc/stable/reference/internals.code-explanations.html', 'title': 'NumPy C code explanations — NumPy v2.0 Manual'}


In [27]:
print(numpy_data[51])

page_content='NumPy internals#

This document has been moved toInternal organization of NumPy arrays.' metadata={'source': 'https://numpy.org/doc/stable/reference/internals.html', 'title': 'NumPy internals — NumPy v2.0 Manual'}


In [29]:
docs_with_no_content = []
for i in range(len(numpy_data)):
    if numpy_data[i].page_content == 'No content found in the specified main tag.':
        docs_with_no_content.append(i)
        
print(docs_with_no_content)
len(docs_with_no_content)

[]


0

In [48]:
docs_with_no_content = []
for i in range(len(numpy_data)):
    if numpy_data[i].page_content == 'No content found in the specified main tag.' or \
       'This document has been moved' in numpy_data[i].page_content or \
        len(numpy_data[i].page_content) < 100:
        docs_with_no_content.append(i)

print(docs_with_no_content)
print(len(docs_with_no_content))


[27, 31, 34, 50, 51, 232, 234, 260, 272, 276, 309, 470, 475, 477, 478, 487, 488, 489, 490, 491, 492, 514, 516, 532, 602, 603, 605, 606, 609, 610, 611, 612, 613, 614, 615, 616, 618, 619, 621, 645, 646, 697, 715, 716, 722, 728, 729, 730, 732, 733, 792, 797, 799, 803, 804, 807, 808, 810, 811, 815, 816, 817, 820, 823, 831, 832, 840, 849, 851, 853, 855, 859, 863, 865, 868, 870, 874, 876, 877, 882, 883, 884, 885, 886, 887, 891, 894, 900, 902, 905, 908, 912, 915, 969, 1108, 1126, 1131, 1134, 1140, 1155, 1170, 1234, 1279, 1281, 1298, 1306, 1330, 1365, 1367, 1380, 1388, 1392, 1412, 1447, 1448, 1449, 1452, 1454, 1455, 1458, 1459, 1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470, 1471, 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480, 1481, 1482, 1483, 1484, 1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494, 1495, 1497, 1498, 1499, 1500, 1520, 1522, 1534, 1542, 1546, 1566, 1582, 1584, 1585, 1586, 1587, 1589, 1590, 1591, 1593, 1594, 1595, 1596, 1597, 1598, 1602, 1603, 

In [50]:
# Remove the documents with no content
# If pandas_data is a list:
numpy_data = [doc for i, doc in enumerate(numpy_data) if i not in docs_with_no_content]

In [51]:
len(numpy_data)

2464

In [52]:
# Define the path
path = '../documents/processed_docs/numpy_docs.pkl'

# Ensure the directory exists
os.makedirs(os.path.dirname(path), exist_ok=True)

# Save data to a file
with open(path, 'wb') as file:
    pickle.dump(numpy_data, file)

In [53]:
print(numpy_data[542])

page_content='numpy.finfo#

classnumpy.finfo(dtype)[source]#

Machine limits for floating point types.

Parameters:

dtypefloat, dtype, or instance

Kind of floating point or complex floating point
data-type about which to get information.

Notes

For developers of NumPy: do not instantiate this at the module level.
The initial calculation of these parameters is expensive and negatively
impacts import times.  These objects are cached, so callingfinfo()repeatedly inside your functions is not a problem.
finfo()
Note thatsmallest_normalis not actually the smallest positive
representable value in a NumPy floating point type. As in the IEEE-754
standard[1], NumPy floating point types make use of subnormal numbers to
fill the gap between 0 andsmallest_normal. However, subnormal numbers
may have significantly reduced precision[2].
smallest_normal
This function can also be used for complex data types as well. If used,
the output will be the same as the corresponding real float type
(e.g. numpy