In [None]:
pip install transformers

In [None]:
pip install nltk

In [None]:
import math
import re
from collections import Counter
from math import log

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Compute TF-IDF using scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from transformers import set_seed

# Fetch the NLTK data packages. 'stopwords' backs the stop-word list used
# during preprocessing; 'punkt' is downloaded as well, although no visible
# cell references it directly.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Single shared Porter stemmer instance, reused by the preprocessing function
stemmer = PorterStemmer()

In [None]:
# Configuration for the generation step
SEED = 42        # fixed seed so the generated documents are repeatable
MAX_LENGTH = 50  # upper bound on generated length, forwarded as `max_length`

# Initialize the text generation pipeline with the desired model
generator = pipeline('text-generation', model='gpt2')

# Define the prompts
topics = [
    "machine learning",
    "Lionel Messi"
]

# Seed the random number generators right before generating, so that any
# sampling done by the model yields the same documents (and therefore the
# same downstream TF-IDF values) on every Restart & Run All.
set_seed(SEED)

# Generate one document per topic. The pipeline returns a list of candidate
# dicts for each prompt; only the first candidate's text is kept.
generated_documents = []
for topic in topics:
    generated_text = generator(topic, max_length=MAX_LENGTH)
    generated_documents.append(generated_text[0]['generated_text'])

# Print the generated documents
print("Generated Documents:")
for i, doc in enumerate(generated_documents, start=1):
    print("Document", i, ":", doc)
    print()

In [None]:
# English stop words from NLTK, held in a set for the membership tests
# performed during preprocessing
stop_words = {*stopwords.words('english')}

# Define a function to clean and normalize the text
def preprocess_text(text):
    """Normalize raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, split on whitespace, drop
    tokens found in the module-level ``stop_words`` set, and stem what
    remains with the module-level ``stemmer``.

    Note: punctuation is stripped *before* the stop-word check, so a stop
    word containing an apostrophe is only removed if its stripped form is
    itself present in ``stop_words``.

    Args:
        text: The raw document as a single string.

    Returns:
        A list of stemmed tokens in their original order.
    """
    without_symbols = re.sub(r'[^\w\s]', '', text.lower())
    return [
        stemmer.stem(token)
        for token in without_symbols.split()
        if token not in stop_words
    ]

# Run every generated document through the cleaning/stemming function,
# producing one token list per document
preprocessed_documents = list(map(preprocess_text, generated_documents))

In [None]:
# Term frequency
def compute_tf(document):
    """Return the relative frequency of each term in one document.

    Args:
        document: A sequence of hashable terms (e.g. a list of tokens).

    Returns:
        A dict mapping each distinct term to ``count / len(document)``,
        in order of first appearance. An empty document yields ``{}``.
    """
    n_terms = len(document)
    relative_freqs = {}
    for token, count in Counter(document).items():
        relative_freqs[token] = count / n_terms
    return relative_freqs

# Compute IDF
def compute_idf(documents):
    """Return the inverse document frequency of every term in a corpus.

    Uses the always-positive variant ``idf(t) = ln(1 + N / df(t))``, where
    ``N`` is the number of documents and ``df(t)`` is the number of
    documents containing ``t``. Rarer terms therefore receive a larger
    weight. (The ratio must be ``N / df``; ``df / N`` would reward common
    terms, which is the opposite of what IDF is for.)

    Args:
        documents: A sized collection of documents. Each document is either
            an iterable of tokens or a whitespace-separated string; strings
            are split into tokens so they are never iterated character by
            character.

    Returns:
        A dict mapping each distinct term to its IDF. An empty corpus
        yields ``{}``.
    """
    # Reduce each document to its set of distinct terms once, so the
    # document-frequency count is a single pass instead of a membership scan
    # of every document for every term.
    term_sets = [
        set(document.split() if isinstance(document, str) else document)
        for document in documents
    ]
    total_docs = len(term_sets)

    # Each set contributes a term at most once, so the Counter holds df(t).
    doc_counts = Counter(term for terms in term_sets for term in terms)

    # Every counted term occurs in at least one document, so count >= 1.
    return {
        term: math.log(1 + total_docs / count)
        for term, count in doc_counts.items()
    }

# TF-IDF for a single document
def compute_tfidf(document, documents):
    """Return the TF-IDF weight of each term in ``document``.

    The term frequencies come from ``compute_tf(document)`` and the inverse
    document frequencies from ``compute_idf(documents)``; note that the IDF
    table is recomputed on every call.

    Args:
        document: The tokens of the document being scored.
        documents: The whole corpus, passed through to ``compute_idf``.

    Returns:
        A dict mapping each term of ``document`` to ``tf * idf``.

    Raises:
        KeyError: If a term of ``document`` has no entry in the IDF table
            built from ``documents``.
    """
    term_freqs = compute_tf(document)
    inverse_doc_freqs = compute_idf(documents)
    weights = {}
    for term, freq in term_freqs.items():
        weights[term] = freq * inverse_doc_freqs[term]
    return weights

# Work from token lists. Passing space-joined strings as the corpus makes the
# IDF step iterate over characters, so word terms are missing from the IDF
# table. Accepting either form here also keeps the cell idempotent: re-running
# it after `preprocessed_documents` has become strings gives the same result.
tokenized_documents = [
    doc.split() if isinstance(doc, str) else list(doc)
    for doc in preprocessed_documents
]

# Keep the space-joined string form under the original name, as this cell has
# always left it, in case later cells expect one string per document.
preprocessed_documents = [' '.join(tokens) for tokens in tokenized_documents]

# Compute TF-IDF for each document against the tokenized corpus
tfidf_docs = [
    compute_tfidf(tokens, tokenized_documents)
    for tokens in tokenized_documents
]

# Print TF-IDF
print("TF-IDF:")
for i, doc in enumerate(tfidf_docs, start=1):
    print("Document", i, ":")
    for term, value in doc.items():
        print(term, ":", value)
    print()