In [4]:
import os
import subprocess
from libgen.scraper import Scraper

# Function to convert files to PDF using Calibre
def convert_to_pdf(input_file, output_pdf):
    print(f"Converting {input_file} to {output_pdf}...")
    extension = input_file.split('.')[-1].lower()
    if extension in ['mobi', 'epub', 'djvu', 'docx']:
        command = f"/Applications/calibre.app/Contents/MacOS/ebook-convert '{input_file}' '{output_pdf}'"
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        if result.returncode == 0:
            print(f"Converted {input_file} to PDF as {output_pdf}")
        else:
            print(f"Error in conversion: {result.stderr.strip()}")
    else:
        print(f"No conversion required for {input_file}, already in PDF format.")
    return output_pdf

# Function to download and convert the book if necessary
def download_book(book, output_dir, downloaded_titles):
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"{book['name']}.{book['format'].lower()}")
    
    if book['name'] in downloaded_titles:
        print(f"Skipping already downloaded book: {book['name']}")
        return

    print(f"Downloading: {book['name']}")

    download_successful = scraper.download(book['link'], output_path=output_file)
    
    if download_successful:
        print(f"Downloaded: {output_file}")
        if book['format'].lower() != "pdf":
            output_pdf = os.path.join(output_dir, f"{book['name']}.pdf")
            convert_to_pdf(output_file, output_pdf)
        downloaded_titles.add(book['name'])
    else:
        print(f"Download failed for: {book['name']}")

# Function to filter and prioritize books
def filter_books(books):
    unique_books = {}
    
    for book in books:
        title = book['name']
        if title not in unique_books:
            unique_books[title] = book
        else:
            existing_book = unique_books[title]
            if (book['year'] > existing_book['year'] and book['format'].lower() == 'pdf') or \
               (book['format'].lower() == 'pdf' and existing_book['format'].lower() != 'pdf'):
                unique_books[title] = book

    return list(unique_books.values())

# Initialize the scraper
scraper = Scraper()

# Search for books based on user input
query = input("Enter the book title or query: ")
books = scraper.get_data(query)

# Filter the books
filtered_books = filter_books(books)

# Display the filtered books with an index for user selection
for index, book in enumerate(filtered_books):
    print(f"{index + 1}. {book['name']} ({book['year']}) - {book['format']}")

# Ask user to select the books they want to download
selection = input("Enter the numbers of the books to download (comma-separated, e.g., 1,3,5): ")
selected_indices = [int(x.strip()) - 1 for x in selection.split(',')]

# Validate the selected indices
selected_books = [filtered_books[i] for i in selected_indices if 0 <= i < len(filtered_books)]

# Download the selected books
downloaded_titles = set()
for book in selected_books:
    download_book(book, output_dir="downloads", downloaded_titles=downloaded_titles)

print("All selected books have been processed.")


1. Hands-On Machine Learning with ML.NET: Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C# 1789801788, 9781789801781 (2020) - pdf
2. Hands-On Machine Learning with TensorFlow.js: A guide to building ML applications integrated with web technology using the TensorFlow.js library 9781838821739, 1838821732 (2019) - pdf
3. Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146 (2019) - epub
4. Hands-On ML Projects with OpenCV: Master Computer Vision and Machine Learning Using OpenCV and Python 9789388590877 (2023) - epub
5. GoLang for Machine Learning: A Hands-on-Guide to Building Efficient, Smart and Scalable ML Models with Go Programming (2024) - pdf
Downloading: Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146


Downloading: downloads/Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146.epub:   0%|          | 18.3k/18.3M [10:19<176:06:03, 30.2B/s]


Downloaded: downloads/Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146.epub
Converting downloads/Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146.epub to downloads/Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146.pdf...
Converted downloads/Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146.epub to PDF as downloads/Hands-On Artificial Intelligence on Amazon Web Services: Decrease the time to market for AI and ML applications with the power of AWS 9781789531473, 9781789534146.pdf
All selected books have been processed.


Multithreading or processing

In [10]:
import requests
import csv
import time  # Import time for measuring elapsed time
from bs4 import BeautifulSoup

# Function to handle pagination manually and collect book metadata
def get_books_with_pagination(query, max_pages=10):
    all_books = []
    for page in range(1, max_pages + 1):
        print(f"Fetching page {page}...")
        base_url = f"http://libgen.is/search.php?req={query}&res=25&page={page}"
        response = requests.get(base_url)
        response.raise_for_status()  # Check for a successful response

        soup = BeautifulSoup(response.content, 'html.parser')
        # Assuming the search results are in the third table of the page
        table = soup.find_all('table')[2]  
        rows = table.find_all('tr')[1:]  # Skip the header row

        if not rows:
            print(f"No more books found on page {page}. Stopping.")
            break

        for row in rows:
            columns = row.find_all('td')
            if len(columns) >= 5:
                name = columns[2].get_text(strip=True)
                author = columns[1].get_text(strip=True)
                year = columns[4].get_text(strip=True)
                file_type = columns[3].get_text(strip=True)
                link = columns[9].find('a')['href']  # Assuming download link is in the last column

                all_books.append({
                    'name': name,
                    'author': author,
                    'year': year,
                    'format': file_type,
                    'link': link
                })

    return all_books

# Function to save book metadata to a CSV file
def save_books_to_csv(books, filename='book_metadata.csv'):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['name', 'author', 'year', 'format', 'link'])
        writer.writeheader()
        for book in books:
            writer.writerow(book)

# Main script to search for books and store metadata in CSV
def main():
    query = input("Enter the book title or query: ")
    max_pages = int(input("Enter the number of pages to search (e.g., 10): "))

    # Start the timer
    start_time = time.time()
    
    # Get book metadata with pagination
    books = get_books_with_pagination(query, max_pages=max_pages)

    if books:
        # Save the book metadata to a CSV file
        save_books_to_csv(books, filename='book_metadata.csv')
        print(f"Book metadata saved to 'book_metadata.csv'.")
    else:
        print("No books found for the given query.")
    
    # Stop the timer and calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time:.2f} seconds")

if __name__ == "__main__":
    main()


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...
Fetching page 41...
Fetching page 42...
Fetching page 43...
Fetching page 44...
Fetching page 45...
Fetching page 46...
Fetching page 47...
Fetching page 48...
Fetching page 49...
Fetching page 50...
Fetching 

In [None]:
import requests
import csv
import time
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to handle fetching a single page
def fetch_page(query, page):
    try:
        print(f"Fetching page {page}...")
        
        # Adjust the query URL with pagination and 100 results per page
        base_url = "http://libgen.is/search.php"
        params = {
            "req": query,         # The search query
            "res": "100",         # Limit to 100 results per page
            "view": "simple",     # View in simple format (optional)
            "page": page          # Specify the page number
        }
        
        response = requests.get(base_url, params=params, timeout=10)  # Adding timeout for safety
        response.raise_for_status()  # Check if the request was successful
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table containing search results
        table = soup.find_all('table')[2]  # The third table contains search results
        rows = table.find_all('tr')[1:]  # Skip the header row

        books = []
        for row in rows:
            columns = row.find_all('td')
            if len(columns) >= 9:
                # Extracting relevant metadata from each column
                book_info = {
                    "id": columns[0].get_text(strip=True),
                    "author": columns[1].get_text(strip=True),
                    "name": columns[2].get_text(strip=True),
                    "publisher": columns[3].get_text(strip=True),
                    "year": columns[4].get_text(strip=True),
                    "language": columns[6].get_text(strip=True),
                    "size": columns[7].get_text(strip=True),
                    "format": columns[8].get_text(strip=True),
                    "link": columns[9].find_all('a')[0]['href']
                }
                books.append(book_info)
        return books
    
    except Exception as e:
        print(f"Error fetching page {page}: {e}")
        return []

# Function to handle pagination and collect book metadata using threads
def get_books(query, max_books=100):
    all_books = []
    max_pages = (max_books // 100) + 1  # Calculate the number of pages to fetch

    with ThreadPoolExecutor(max_workers=5) as executor:  # Limit to 5 threads
        future_to_page = {executor.submit(fetch_page, query, page): page for page in range(1, max_pages + 1)}

        for future in as_completed(future_to_page):
            try:
                books = future.result()
                all_books.extend(books)
                
                if len(all_books) >= max_books:
                    break  # Stop once we have enough books
            
            except Exception as e:
                print(f"Error processing future: {e}")
    
    return all_books[:max_books]  # Return only the requested number of books

# Function to save metadata into a CSV file
def save_books_to_csv(books, filename='libgen_books_metadata.csv'):
    if not books:
        print("No books to save.")
        return

    keys = books[0].keys()  # Extract the headers from the first book entry
    with open(filename, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=keys)
        writer.writeheader()
        writer.writerows(books)
    print(f"Metadata saved to {filename} successfully.")

# Main script to fetch and store the book metadata
def main():
    query = input("Enter the book title or query: ").replace(" ", "+")
    max_books = int(input("Enter the number of books to fetch (e.g., 100): "))

    # Start timing the execution
    start_time = time.time()

    # Fetch the metadata from multiple pages using threads
    books = get_books(query, max_books)

    # Save the metadata to a CSV file
    if books:
        save_books_to_csv(books)
    else:
        print("No books found for the given query.")

    # Calculate the elapsed time
    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time:.2f} seconds")

if __name__ == "__main__":
    main()


Fetching page 1...Fetching page 2...

Fetching page 3...
Fetching page 4...
Fetching page 5...
Error fetching page 5: 503 Server Error: Service Temporarily Unavailable for url: http://libgen.is/search.php?req=psychology&res=100&view=simple&page=5
Fetching page 6...
Error fetching page 1: 503 Server Error: Service Temporarily Unavailable for url: http://libgen.is/search.php?req=psychology&res=100&view=simple&page=1
Fetching page 7...
Error fetching page 7: 503 Server Error: Service Temporarily Unavailable for url: http://libgen.is/search.php?req=psychology&res=100&view=simple&page=7Fetching page 8...

Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Error fetching page 8: 503 Server Error: Service Temporarily Unavailable for url: http://libgen.is/search.php?req=psychology&res=100&view=simple&page=8
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Error fetching page 15: 503 Server Error:

Book collection part

In [2]:
import requests
import csv
import time
from bs4 import BeautifulSoup

# Function to handle pagination and collect book metadata
def get_books(query, max_books=100):
    all_books = []
    page = 1

    while len(all_books) < max_books:
        print(f"Fetching page {page}...")

        try:
            # Adjust the query URL with pagination and 100 results per page
            base_url = "http://libgen.is/search.php"
            params = {
                "req": query,         # The search query
                "res": "100",         # Limit to 100 results per page
                "view": "simple",     # View in simple format (optional)
                "page": page          # Specify the page number
            }
            
            response = requests.get(base_url, params=params, timeout=10)  # Adding timeout for safety
            response.raise_for_status()  # Check if the request was successful
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the table containing search results
            table = soup.find_all('table')[2]  # The third table contains search results
            rows = table.find_all('tr')[1:]  # Skip the header row

            if not rows:
                print(f"No more results on page {page}.")
                break  # Stop if no more results are found

            for row in rows:
                if len(all_books) >= max_books:
                    break  # Stop once we have enough books

                columns = row.find_all('td')
                if len(columns) >= 9:
                    # Extracting relevant metadata from each column
                    book_info = {
                        "id": columns[0].get_text(strip=True),
                        "author": columns[1].get_text(strip=True),
                        "name": columns[2].get_text(strip=True),
                        "publisher": columns[3].get_text(strip=True),
                        "year": columns[4].get_text(strip=True),
                        "language": columns[6].get_text(strip=True),
                        "size": columns[7].get_text(strip=True),
                        "format": columns[8].get_text(strip=True),
                        "link": columns[9].find_all('a')[0]['href']
                    }
                    all_books.append(book_info)

            page += 1  # Move to the next page for the next batch of results
        
        except requests.exceptions.RequestException as e:
            print(f"Request failed on page {page}: {e}. Skipping this page...")
            page += 1  # Skip this page and continue to the next
        
        except IndexError:
            print(f"Unexpected page structure encountered on page {page}. Skipping...")
            page += 1  # Skip this page and continue to the next

    return all_books

# Function to save metadata into a CSV file
def save_books_to_csv(books, filename='libgen_books_metadata.csv'):
    if not books:
        print("No books to save.")
        return

    keys = books[0].keys()  # Extract the headers from the first book entry
    with open(filename, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=keys)
        writer.writeheader()
        writer.writerows(books)
    print(f"Metadata saved to {filename} successfully.")

# Main script to fetch and store the book metadata
def main():
    query = input("Enter the book title or query: ").replace(" ", "+")
    max_books = int(input("Enter the number of books to fetch (e.g., 100): "))

    # Start timing the execution
    start_time = time.time()

    # Fetch the metadata from multiple pages
    books = get_books(query, max_books)

    # Save the metadata to a CSV file
    if books:
        save_books_to_csv(books)
    else:
        print("No books found for the given query.")

    # Calculate the elapsed time
    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time:.2f} seconds")

if __name__ == "__main__":
    main()


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
Fetching page 31...
Fetching page 32...
Fetching page 33...
Fetching page 34...
Fetching page 35...
Fetching page 36...
Fetching page 37...
Fetching page 38...
Fetching page 39...
Fetching page 40...
Fetching page 41...
Fetching page 42...
Fetching page 43...
Fetching page 44...
Fetching page 45...
Fetching page 46...
Fetching page 47...
Fetching page 48...
Fetching page 49...
Fetching page 50...
Fetching 

In [38]:
import os
import numpy as np
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure necessary NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Function to preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

# Function to map words to numbers
def map_words_to_numbers(tokens):
    word_to_number = {word: idx for idx, word in enumerate(set(tokens), start=1)}
    numbers = [word_to_number[word] for word in tokens]
    return numbers, word_to_number

# Function to store the numbers in a dynamic matrix
def store_in_dynamic_matrix(numbers):
    num_elements = len(numbers)
    # Calculate rows and columns (you can define your logic for dimensions)
    matrix_size = (num_elements // 512 + 1, 512) if num_elements > 512 else (1, num_elements)
    matrix = np.zeros(matrix_size, dtype=np.uint32)
    
    idx = 0
    for number in numbers:
        row = idx // matrix_size[1]
        col = idx % matrix_size[1]
        
        if row < matrix_size[0]:
            matrix[row, col] = number
        idx += 1
    
    return matrix

# Function to read PDF files
def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

# Function to process PDF files from a local directory
def process_pdf_files(directory):
    all_texts = []

    # Iterate through all PDF files in the specified directory
    for filename in os.listdir(directory):
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory, filename)
            print(f"Processing file: {file_path}")
            text = read_pdf(file_path)
            all_texts.append(text)

    # Combine all texts into one for processing
    combined_text = ' '.join(all_texts)

    # Preprocess the combined text
    preprocessed_tokens = preprocess_text(combined_text)

    # Map words to numbers
    numbers, word_to_number = map_words_to_numbers(preprocessed_tokens)

    # Store numbers in dynamic matrix
    matrix = store_in_dynamic_matrix(numbers)

    return matrix, word_to_number

# Main entry point for the program
if __name__ == "__main__":
    pdf_directory = input("Enter the path to the local directory containing PDF files: ")
    matrix, word_to_number = process_pdf_files(pdf_directory)

    print("\nMatrix Shape:", matrix.shape)
    print("Matrix Contents:\n", matrix)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processing file: /Users/namanmuktha/Desktop/rp_us/downloads/Hands-On Machine Learning with ML.NET: Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C# 1789801788, 9781789801781.pdf

Matrix Shape: (65, 512)
Matrix Contents:
 [[2559 1925  253 ...  944 2462 3311]
 [1328  944 1499 ... 1001  735 2125]
 [ 495 1746 1668 ... 2559 1925   32]
 ...
 [2402  295  430 ... 1488 2753 1410]
 [2753 1819 1668 ... 2913    6 3445]
 [ 317   95 1001 ...    0    0    0]]


In [39]:
matrix

array([[2559, 1925,  253, ...,  944, 2462, 3311],
       [1328,  944, 1499, ..., 1001,  735, 2125],
       [ 495, 1746, 1668, ..., 2559, 1925,   32],
       ...,
       [2402,  295,  430, ..., 1488, 2753, 1410],
       [2753, 1819, 1668, ..., 2913,    6, 3445],
       [ 317,   95, 1001, ...,    0,    0,    0]], dtype=uint32)

In [40]:
tokens

['machine',
 'learning',
 'getting',
 'started',
 'microsoft',
 'implement',
 'popular',
 'machine',
 'learning',
 'algorithms',
 'c',
 'jarred',
 'capellman',
 'birmingham',
 'mumbai',
 'machine',
 'learning',
 'copyright',
 '2020',
 'packt',
 'publishing',
 'rights',
 'reserved',
 'part',
 'book',
 'may',
 'reproduced',
 'stored',
 'retrieval',
 'system',
 'transmitted',
 'form',
 'means',
 'without',
 'prior',
 'written',
 'permission',
 'publisher',
 'except',
 'case',
 'brief',
 'quotations',
 'embedded',
 'critical',
 'articles',
 'reviews',
 'every',
 'effort',
 'made',
 'preparation',
 'book',
 'ensure',
 'accuracy',
 'information',
 'presented',
 'however',
 'information',
 'contained',
 'book',
 'sold',
 'without',
 'warranty',
 'either',
 'express',
 'implied',
 'neither',
 'author',
 'packt',
 'publishing',
 'dealers',
 'distributors',
 'held',
 'liable',
 'damages',
 'caused',
 'alleged',
 'caused',
 'directly',
 'indirectly',
 'book',
 'packt',
 'publishing',
 'endeavored

In [41]:
def retrieve_text_from_matrix(matrix, word_to_number):
    # Create a reverse mapping from numbers back to words
    number_to_word = {value: key for key, value in word_to_number.items()}
    
    retrieved_tokens = []
    
    # Iterate over the matrix and retrieve words
    for row in matrix:
        for number in row:
            if number in number_to_word:
                retrieved_tokens.append(number_to_word[number])
    
    return ' '.join(retrieved_tokens)

if __name__ == "__main__":
    # Assuming `matrix` and `word_to_number` are already defined
    retrieved_text = retrieve_text_from_matrix(matrix, word_to_number)
    
    print("\nRetrieved Text:")
    print(retrieved_text)



Retrieved Text:


In [42]:
import numpy as np
import matplotlib.pyplot as plt

def matrix_to_image(matrix, output_path='matrix_image.png'):
    """
    Converts a matrix to an image and saves it.

    Parameters:
    - matrix: np.ndarray
        The matrix to be converted to an image.
    - output_path: str
        The file path to save the image.
    """
    # Normalize the matrix values to range [0, 255]
    normalized_matrix = (matrix / np.max(matrix) * 255).astype(np.uint8)

    # Plot and save the image
    plt.imshow(normalized_matrix, cmap='gray')
    plt.axis('off')  # Hide axes
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    print(f"Matrix image saved to {output_path}")


In [43]:
matrix_to_image(matrix, output_path='smp.png')


Matrix image saved to smp.png


In [1]:
import os
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from docx import Document  # To read DOCX files

# Ensure necessary NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Function to preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

# Function to map words to numbers
def map_words_to_numbers(tokens):
    word_to_number = {word: idx for idx, word in enumerate(set(tokens), start=1)}
    numbers = [word_to_number[word] for word in tokens]
    return numbers, word_to_number

# Function to store the numbers in a dynamic matrix
def store_in_dynamic_matrix(numbers):
    num_elements = len(numbers)
    # Calculate rows and columns (you can define your logic for dimensions)
    matrix_size = (num_elements // 512 + 1, 512) if num_elements > 512 else (1, num_elements)
    matrix = np.zeros(matrix_size, dtype=np.uint32)
    
    idx = 0
    for number in numbers:
        row = idx // matrix_size[1]
        col = idx % matrix_size[1]
        
        if row < matrix_size[0]:
            matrix[row, col] = number
        idx += 1
    
    return matrix

# Function to read DOCX files
def read_docx(file_path):
    text = ""
    doc = Document(file_path)
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

# Function to process DOCX files from a local directory
def process_docx_files(directory):
    all_texts = []

    # Iterate through all DOCX files in the specified directory
    for filename in os.listdir(directory):
        if filename.endswith('.docx'):
            file_path = os.path.join(directory, filename)
            print(f"Processing file: {file_path}")
            text = read_docx(file_path)
            all_texts.append(text)

    # Combine all texts into one for processing
    combined_text = ' '.join(all_texts)

    # Preprocess the combined text
    preprocessed_tokens = preprocess_text(combined_text)

    # Map words to numbers
    numbers, word_to_number = map_words_to_numbers(preprocessed_tokens)

    # Store numbers in dynamic matrix
    matrix = store_in_dynamic_matrix(numbers)

    return matrix, word_to_number

# Main entry point for the program
if __name__ == "__main__":
    docx_directory = input("Enter the path to the local directory containing DOCX files: ")
    matrix, word_to_number = process_docx_files(docx_directory)

    print("\nMatrix Shape:", matrix.shape)
    print("Matrix Contents:\n", matrix)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processing file: /Users/namanmuktha/Desktop/rp_us/downloads/Hands-On Machine Learning with ML.NET_ Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C# 1789801788, 9781789801781.docx

Matrix Shape: (58, 512)
Matrix Contents:
 [[2368 3244 1865 ... 1875  470 3523]
 [3445 1875 1301 ... 2752 1406   94]
 [1849 1952 1569 ...  462  888  244]
 ...
 [ 878  428 3004 ... 2510 2878 1384]
 [3035 2617 1816 ... 2151  244 1663]
 [ 149 2661 1384 ...    0    0    0]]


In [2]:
def retrieve_text_from_matrix(matrix, word_to_number):
    # Create a reverse mapping from numbers back to words
    number_to_word = {value: key for key, value in word_to_number.items()}
    
    retrieved_tokens = []
    
    # Iterate over the matrix and retrieve words
    for row in matrix:
        for number in row:
            if number in number_to_word:
                retrieved_tokens.append(number_to_word[number])
    
    return ' '.join(retrieved_tokens)

if __name__ == "__main__":
    # Assuming `matrix` and `word_to_number` are already defined
    retrieved_text = retrieve_text_from_matrix(matrix, word_to_number)
    
    print("\nRetrieved Text:")
    print(retrieved_text)



Retrieved Text:


In [3]:
import numpy as np
import matplotlib.pyplot as plt

def matrix_to_image(matrix, output_path='matrix_image.png'):
    """
    Converts a matrix to an image and saves it.

    Parameters:
    - matrix: np.ndarray
        The matrix to be converted to an image.
    - output_path: str
        The file path to save the image.
    """
    # Normalize the matrix values to range [0, 255]
    normalized_matrix = (matrix / np.max(matrix) * 255).astype(np.uint8)

    # Plot and save the image
    plt.imshow(normalized_matrix, cmap='gray')
    plt.axis('off')  # Hide axes
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    print(f"Matrix image saved to {output_path}")


In [4]:
matrix_to_image(matrix, output_path='smp_docx.png')


Matrix image saved to smp_docx.png


In [9]:
import os
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from docx import Document  # To read DOCX files

# Ensure necessary NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Function to preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

# Function to map words to numbers
def map_words_to_numbers(tokens):
    word_to_number = {word: idx for idx, word in enumerate(set(tokens), start=1)}
    numbers = [word_to_number[word] for word in tokens]
    return numbers, word_to_number

# Function to store the numbers in a dynamic matrix
def store_in_dynamic_matrix(numbers):
    num_elements = len(numbers)
    # Calculate rows and columns (you can define your logic for dimensions)
    matrix_size = (num_elements // 512 + 1, 512) if num_elements > 512 else (1, num_elements)
    matrix = np.zeros(matrix_size, dtype=np.uint32)
    
    idx = 0
    for number in numbers:
        row = idx // matrix_size[1]
        col = idx % matrix_size[1]
        
        if row < matrix_size[0]:
            matrix[row, col] = number
        idx += 1
    
    return matrix

# Function to read DOCX files
def read_docx(file_path):
    text = ""
    doc = Document(file_path)
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

# Function to process DOCX files from a local directory
def process_docx_files(directory):
    all_texts = []

    # Iterate through all DOCX files in the specified directory
    for filename in os.listdir(directory):
        if filename.endswith('.docx'):
            file_path = os.path.join(directory, filename)
            print(f"Processing file: {file_path}")
            text = read_docx(file_path)
            all_texts.append(text)

    # Combine all texts into one for processing
    combined_text = ' '.join(all_texts)

    # Preprocess the combined text
    preprocessed_tokens = preprocess_text(combined_text)

    # Map words to numbers
    numbers, word_to_number = map_words_to_numbers(preprocessed_tokens)

    # Store numbers in dynamic matrix
    matrix = store_in_dynamic_matrix(numbers)

    return matrix, word_to_number, preprocessed_tokens  # Also return the tokens for evaluation

# Function to retrieve text from matrix
def retrieve_text_from_matrix(matrix, word_to_number):
    # Create a reverse mapping from numbers back to words
    number_to_word = {value: key for key, value in word_to_number.items()}
    
    retrieved_tokens = []
    
    # Iterate over the matrix and retrieve words
    for row in matrix:
        for number in row:
            if number in number_to_word:
                retrieved_tokens.append(number_to_word[number])
    
    return ' '.join(retrieved_tokens)

# Function to evaluate the retrieved text
def evaluate_retrieved_text(original_tokens, retrieved_text):
    original_text = ' '.join(original_tokens)  # Original text from preprocessed tokens
    print("\nOriginal Preprocessed Text:")
    print(original_text)

    print("\nRetrieved Text from Matrix:")
    print(retrieved_text)

    # Calculate the accuracy of retrieval
    if original_text == retrieved_text:
        print("\nEvaluation Result: The retrieved text matches the original preprocessed text.")
    else:
        print("\nEvaluation Result: There are differences between the original and retrieved text.")
        # Optionally, you can perform more detailed evaluation like word-level comparison.

# Main entry point for the program
if __name__ == "__main__":
    docx_directory = input("Enter the path to the local directory containing DOCX files: ")
    matrix, word_to_number, original_tokens = process_docx_files(docx_directory)

    print("\nMatrix Shape:", matrix.shape)

    # Retrieve text from matrix
    retrieved_text = retrieve_text_from_matrix(matrix, word_to_number)

    # Evaluate the retrieved text
    evaluate_retrieved_text(original_tokens, retrieved_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/namanmuktha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processing file: /Users/namanmuktha/Desktop/rp_us/downloads/Hands-On Machine Learning with ML.NET_ Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C# 1789801788, 9781789801781.docx

Matrix Shape: (58, 512)

Original Preprocessed Text:

Retrieved Text from Matrix:

Evaluation Result: The retrieved text matches the original preprocessed text.


In [6]:
import os
import numpy as np
import docx

# Function to map words to numbers without preprocessing
def map_words_to_numbers(text):
    words = text.split()  # Split text by spaces to get individual words
    word_to_number = {word: idx for idx, word in enumerate(set(words), start=1)}
    numbers = [word_to_number[word] for word in words]
    return numbers, word_to_number

# Function to store the numbers in a dynamic matrix
def store_in_dynamic_matrix(numbers):
    num_elements = len(numbers)
    # Calculate rows and columns
    matrix_size = (num_elements // 512 + 1, 512) if num_elements > 512 else (1, num_elements)
    matrix = np.zeros(matrix_size, dtype=np.uint32)
    
    idx = 0
    for number in numbers:
        row = idx // matrix_size[1]
        col = idx % matrix_size[1]
        
        if row < matrix_size[0]:
            matrix[row, col] = number
        idx += 1
    
    return matrix

# Function to read DOCX files
def read_docx(file_path):
    doc = docx.Document(file_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

# Function to process DOCX files from a local directory
def process_docx_files(directory):
    all_texts = []

    # Iterate through all DOCX files in the specified directory
    for filename in os.listdir(directory):
        if filename.endswith('.docx'):
            file_path = os.path.join(directory, filename)
            print(f"Processing file: {file_path}")
            text = read_docx(file_path)
            all_texts.append(text)

    # Combine all texts into one for processing
    combined_text = ' '.join(all_texts)

    # Map words to numbers
    numbers, word_to_number = map_words_to_numbers(combined_text)

    # Store numbers in dynamic matrix
    matrix = store_in_dynamic_matrix(numbers)

    return matrix, word_to_number, combined_text  # Return combined text for evaluation

# Function to retrieve text from matrix
def retrieve_text_from_matrix(matrix, word_to_number):
    # Create a reverse mapping from numbers back to words
    number_to_word = {value: key for key, value in word_to_number.items()}
    
    retrieved_words = []
    
    # Iterate over the matrix and retrieve words
    for row in matrix:
        for number in row:
            if number in number_to_word:
                retrieved_words.append(number_to_word[number])
    
    return ' '.join(retrieved_words)

# Function to evaluate the retrieved text
def evaluate_retrieved_text(original_text, retrieved_text):
    print("\nOriginal Text:")
    print(original_text)

    print("\nRetrieved Text from Matrix:")
    print(retrieved_text)

    # Simple evaluation: compare original text and retrieved text
    if original_text == retrieved_text:
        print("\nEvaluation Result: The retrieved text matches the original text.")
    else:
        print("\nEvaluation Result: There are differences between the original and retrieved text.")

# Main entry point for the program
if __name__ == "__main__":
    docx_directory = input("Enter the path to the local directory containing DOCX files: ")
    matrix, word_to_number, original_text = process_docx_files(docx_directory)

    print("\nMatrix Shape:", matrix.shape)

    # Retrieve text from matrix
    retrieved_text = retrieve_text_from_matrix(matrix, word_to_number)

    # Evaluate the retrieved text
    evaluate_retrieved_text(original_text, retrieved_text)


Processing file: /Users/namanmuktha/Desktop/rp_us/downloads/Hands-On Machine Learning with ML.NET_ Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C# 1789801788, 9781789801781.docx

Matrix Shape: (102, 512)

Original Text:



Hands-On Machine Learning with ML.NET

Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C#







Jarred Capellman










BIRMINGHAM - MUMBAI

Hands-On Machine Learning with ML.NET
Copyright © 2020 Packt Publishing

All rights reserved. No part of this book may be reproduced, stored in a retrieval system, or transmitted in any form or by any means, without the prior written permission of the publisher, except in the case of brief quotations embedded in critical articles or reviews.

Every effort has been made in the preparation of this book to ensure the accuracy of the information presented. However, the information contained in this book is sold without warranty, either express

In [7]:
matrix_to_image(matrix, output_path='smp_docx_np.png')


Matrix image saved to smp_docx_np.png


In [8]:
def calculate_storage(matrix):
    num_elements = matrix.size  # Total number of elements in the matrix
    storage_per_element = 4  # Each element is 32 bits (4 bytes)
    total_storage_bytes = num_elements * storage_per_element  # Total storage in bytes
    
    # Convert to more human-readable units
    total_storage_kb = total_storage_bytes / 1024  # Convert to kilobytes
    total_storage_mb = total_storage_kb / 1024  # Convert to megabytes
    total_storage_gb = total_storage_mb / 1024  # Convert to gigabytes
    
    return total_storage_bytes, total_storage_kb, total_storage_mb, total_storage_gb

if __name__ == "__main__":
    # Example usage with the generated matrix
    matrix, word_to_number, original_text = process_docx_files(docx_directory)
    
    total_storage_bytes, total_storage_kb, total_storage_mb, total_storage_gb = calculate_storage(matrix)
    
    print(f"Total Storage: {total_storage_bytes:.2f} bytes")
    print(f"Total Storage: {total_storage_kb:.2f} KB")
    print(f"Total Storage: {total_storage_mb:.2f} MB")
    print(f"Total Storage: {total_storage_gb:.4f} GB")


Processing file: /Users/namanmuktha/Desktop/rp_us/downloads/Hands-On Machine Learning with ML.NET_ Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C# 1789801788, 9781789801781.docx
Total Storage: 208896.00 bytes
Total Storage: 204.00 KB
Total Storage: 0.20 MB
Total Storage: 0.0002 GB


In [10]:
def calculate_storage(matrix):
    num_elements = matrix.size  # Total number of elements in the matrix
    storage_per_element = 4  # Each element is 32 bits (4 bytes)
    total_storage_bytes = num_elements * storage_per_element  # Total storage in bytes
    
    # Convert to more human-readable units
    total_storage_kb = total_storage_bytes / 1024  # Convert to kilobytes
    total_storage_mb = total_storage_kb / 1024  # Convert to megabytes
    total_storage_gb = total_storage_mb / 1024  # Convert to gigabytes
    
    return total_storage_bytes, total_storage_kb, total_storage_mb, total_storage_gb

if __name__ == "__main__":
    # Example usage with the generated matrix
    matrix, word_to_number, original_text = process_docx_files(docx_directory)
    
    total_storage_bytes, total_storage_kb, total_storage_mb, total_storage_gb = calculate_storage(matrix)
    
    print(f"Total Storage: {total_storage_bytes:.2f} bytes")
    print(f"Total Storage: {total_storage_kb:.2f} KB")
    print(f"Total Storage: {total_storage_mb:.2f} MB")
    print(f"Total Storage: {total_storage_gb:.4f} GB")


Processing file: /Users/namanmuktha/Desktop/rp_us/downloads/Hands-On Machine Learning with ML.NET_ Getting started with Microsoft ML.NET to implement popular machine learning algorithms in C# 1789801788, 9781789801781.docx
Total Storage: 118784.00 bytes
Total Storage: 116.00 KB
Total Storage: 0.11 MB
Total Storage: 0.0001 GB


In [12]:
import os
import numpy as np
import gensim.downloader as api
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize

# Download pre-trained word2vec embeddings
word2vec_model = api.load("glove-wiki-gigaword-300")  # 300-dimensional GloVe embeddings

# Function to map words to embedding vectors
def map_words_to_embeddings(tokens, word2vec_model):
    word_to_embedding = {}
    embedding_matrix = []

    for token in tokens:
        if token in word2vec_model:
            embedding = word2vec_model[token]
            word_to_embedding[token] = embedding
            embedding_matrix.append(embedding)
        else:
            # For unknown words, use a zero vector (or random vector)
            embedding = np.zeros(word2vec_model.vector_size)
            word_to_embedding[token] = embedding
            embedding_matrix.append(embedding)

    return np.array(embedding_matrix), word_to_embedding

# Function to process text and convert to embeddings matrix
def process_text_to_embeddings(text, word2vec_model):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercased text
    embedding_matrix, word_to_embedding = map_words_to_embeddings(tokens, word2vec_model)
    return embedding_matrix, word_to_embedding

# Example of using the function
if __name__ == "__main__":
    text = "The king and the queen ruled the kingdom with wisdom."
    
    # Process text into an embedding matrix
    embedding_matrix, word_to_embedding = process_text_to_embeddings(text, word2vec_model)
    
    print("Embedding Matrix Shape:", embedding_matrix.shape)
    print("Embedding Matrix Contents:\n", embedding_matrix)


ModuleNotFoundError: No module named 'numpy.char'

In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings for text
def get_bert_embeddings(texts):
    inputs = tokenizer(texts, return_tensors="tf", padding=True, truncation=True)
    outputs = bert_model(inputs)
    embeddings = tf.reduce_mean(outputs.last_hidden_state, axis=1)
    return embeddings

# Example: Get embeddings for a sample sentence
text = ["The quick brown fox jumps over the lazy dog."]
embeddings = get_bert_embeddings(text)
print(embeddings.shape)  # Should be (1, 768) for a single sentence embedding


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on,

(1, 768)


In [3]:
embeddings

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-1.44648151e-02, -7.48864636e-02,  5.63669913e-02,
         4.51662159e-03,  4.08910841e-01,  2.58028191e-02,
        -7.56115541e-02,  4.74532843e-01, -1.89379358e-03,
        -1.50111154e-01, -1.01188637e-01, -1.57179251e-01,
        -2.17045680e-01, -1.62536707e-02, -4.55245525e-01,
        -2.51909345e-01,  2.02564016e-01, -2.01035812e-02,
        -1.62171587e-01, -8.59452132e-03,  1.98374614e-01,
        -3.76500368e-01, -5.14896750e-01, -6.73202500e-02,
         4.75526929e-01,  2.27018476e-01, -3.73802590e-03,
         2.44786844e-01, -3.71535748e-01,  1.74580906e-02,
         2.21929863e-01, -1.33087948e-01, -1.10193091e-02,
         1.49264827e-01, -1.69181511e-01, -3.36892046e-02,
         4.03274111e-02, -3.54932904e-01, -4.48146492e-01,
         8.78668651e-02, -2.45351076e-01, -5.45016192e-02,
        -8.58487263e-02, -8.21206793e-02,  1.00634843e-01,
        -4.11890745e-01,  6.95776567e-02, -2.25566044e-01,
      

In [4]:
from tensorflow.keras import layers, models

# Define the simple autoencoder
input_dim = 768  # Input size (same as BERT embedding size)
encoding_dim = 128  # Latent space size for compression

# Encoder
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(512, activation='relu')(input_layer)
encoded = layers.Dense(256, activation='relu')(encoded)
encoded = layers.Dense(encoding_dim, activation='relu')(encoded)  # Compress to 128 dimensions

# Decoder
decoded = layers.Dense(256, activation='relu')(encoded)
decoded = layers.Dense(512, activation='relu')(decoded)
decoded = layers.Dense(input_dim, activation='linear')(decoded)  # Reconstruct back to 768 dimensions

# Autoencoder Model
autoencoder = models.Model(input_layer, decoded)

# Compile the model with Mean Squared Error loss (to minimize reconstruction error)
autoencoder.compile(optimizer='adam', loss='mse')

# Display the model architecture
autoencoder.summary()


In [5]:
# Sample texts (you can replace this with your actual text data)
texts = [
    "The cat sat on the mat.",
    "Dogs are loyal pets.",
    "The sun rises in the east.",
    "Artificial intelligence is the future."
]

# Convert the texts into BERT embeddings
train_data = get_bert_embeddings(texts)

# Train the autoencoder
num_epochs = 100
batch_size = 2

history = autoencoder.fit(
    train_data,  # Input (original embeddings)
    train_data,  # Target (same as input, we are reconstructing it)
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=2
)

# Save the trained model
autoencoder.save('text_autoencoder.h5')


Epoch 1/100
2/2 - 1s - 332ms/step - loss: 0.1268
Epoch 2/100
2/2 - 0s - 7ms/step - loss: 0.1058
Epoch 3/100
2/2 - 0s - 9ms/step - loss: 0.0768
Epoch 4/100
2/2 - 0s - 9ms/step - loss: 0.0543
Epoch 5/100
2/2 - 0s - 8ms/step - loss: 0.0452
Epoch 6/100
2/2 - 0s - 9ms/step - loss: 0.0394
Epoch 7/100
2/2 - 0s - 8ms/step - loss: 0.0351
Epoch 8/100
2/2 - 0s - 8ms/step - loss: 0.0314
Epoch 9/100
2/2 - 0s - 8ms/step - loss: 0.0267
Epoch 10/100
2/2 - 0s - 8ms/step - loss: 0.0230
Epoch 11/100
2/2 - 0s - 8ms/step - loss: 0.0174
Epoch 12/100
2/2 - 0s - 8ms/step - loss: 0.0149
Epoch 13/100
2/2 - 0s - 8ms/step - loss: 0.0114
Epoch 14/100
2/2 - 0s - 8ms/step - loss: 0.0089
Epoch 15/100
2/2 - 0s - 8ms/step - loss: 0.0062
Epoch 16/100
2/2 - 0s - 8ms/step - loss: 0.0049
Epoch 17/100
2/2 - 0s - 9ms/step - loss: 0.0037
Epoch 18/100
2/2 - 0s - 8ms/step - loss: 0.0028
Epoch 19/100
2/2 - 0s - 7ms/step - loss: 0.0026
Epoch 20/100
2/2 - 0s - 7ms/step - loss: 0.0021
Epoch 21/100
2/2 - 0s - 7ms/step - loss: 0.0019



In [6]:
# Test on a new sample text
test_text = ["Machine learning is transforming the world."]
test_embedding = get_bert_embeddings(test_text)

# Pass through the autoencoder
compressed_embedding = autoencoder.predict(test_embedding)

print("Original embedding shape:", test_embedding.shape)
print("Compressed and reconstructed embedding shape:", compressed_embedding.shape)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Original embedding shape: (1, 768)
Compressed and reconstructed embedding shape: (1, 768)


In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, GPT2LMHeadModel, GPT2Tokenizer
import torch

# Step 1: Load BERT for Embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='tf', padding=True, truncation=True, max_length=128)
    outputs = bert_model(**inputs)
    # Use CLS token (first token) for sentence embeddings
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings


# Step 2: Define the Autoencoder Architecture
class Autoencoder(tf.keras.Model):
    def __init__(self, encoding_dim):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128,activation='relu'),
            tf.keras.layers.Dense(encoding_dim, activation='relu')
        ])
        # Decoder
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(128,activation='relu'),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(768, activation='sigmoid')
        ])

    def call(self, inputs):
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

# Step 3: Instantiate and Compile the Autoencoder
encoding_dim = 128  # Dimension of the compressed latent space
autoencoder = Autoencoder(encoding_dim)
autoencoder.compile(optimizer='adam', loss='mse')

# Step 4: Prepare Text Data and Get Embeddings
texts = [
    "In recent years, artificial intelligence (AI) has rapidly transformed industries across the globe. From healthcare to finance, AI is driving efficiency, enhancing decision-making, and unlocking new possibilities. One of the most exciting developments is the rise of generative AI, which allows machines to create content like text, images, and even music. With large language models like GPT, machines are now capable of understanding and generating human-like text, opening up new avenues for automation, content creation, and personalized recommendations. As AI continues to evolve, its integration with everyday tasks and industries will only become more seamless, fundamentally changing how we live and work."
]

# Get BERT embeddings for each text
embeddings = tf.concat([get_bert_embeddings(text) for text in texts], axis=0)

# Step 5: Train the Autoencoder on BERT Embeddings
autoencoder.fit(embeddings, embeddings, epochs=100, batch_size=2)

# Step 6: Compress and Reconstruct the Text Embeddings
compressed_embeddings = autoencoder.encoder(embeddings)
reconstructed_embeddings = autoencoder.decoder(compressed_embeddings)

# Step 7: Generate Text from GPT-2 (Skipping BERT Embedding Reconstruction)
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def generate_text_with_gpt2(prompt_text):
    # GPT-2 generates text based on prompt text
    input_ids = gpt2_tokenizer.encode(prompt_text, return_tensors='pt')
    output = gpt2_model.generate(
        input_ids=input_ids,
        max_length=250,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_p=0.95,
        temperature=0.9,
    )
    generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Step 8: Generate Text from Original Text Prompts
for i, original_text in enumerate(texts):
    generated_text = generate_text_with_gpt2(original_text)
    print(f"Original Text: {original_text}")
    print(f"Generated Text: {generated_text}")
    print()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on,

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Original Text: In recent years, artificial intelligence (AI) has rapidly transformed industries across the globe. From healthcare to finance, AI is driving efficiency, enhancing decision-making, and unlocking new possibilities. One of the most exciting developments is the rise of generative AI, which allows machines to create content like text, images, and even music. With large language models like GPT, machines are now capable of understanding and generating human-like text, opening up new avenues for automation, content creation, and personalized recommendations. As AI continues to evolve, its integration with everyday tasks and industries will only become more seamless, fundamentally changing how we live and work.
Generated Text: In recent years, artificial intelligence (AI) has rapidly transformed industries across the globe. From healthcare to finance, AI is driving efficiency, enhancing decision-making, and unlocking new possibilities. One of the most exciting developments is th