In [None]:
pip install transformers

In [None]:
pip install nltk

In [None]:
import math
import re
from collections import Counter
from math import log

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Compute TF-IDF using scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from transformers import set_seed

# Fetch the NLTK data packages requested by this notebook
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Single Porter stemmer instance shared by the preprocessing step
stemmer = PorterStemmer()

In [None]:
# Generation settings a reader may want to tune
SEED = 42                   # fixed seed so a fresh "Run All" reproduces the same documents
MAX_GENERATED_LENGTH = 50   # value passed as max_length to the generator

# Seed the random number generators before generating, so that any sampling
# done by the pipeline yields the same text on every run.
set_seed(SEED)

# Initialize the text generation pipeline with the desired model
generator = pipeline('text-generation', model='gpt2')

# Define the prompts
topics = [
    "machine learning",
    "Lionel Messi"
]

# Generate one document per topic; the pipeline returns a list of results
# per prompt, of which the first entry's 'generated_text' is kept.
generated_documents = [
    generator(topic, max_length=MAX_GENERATED_LENGTH)[0]['generated_text']
    for topic in topics
]

# Print the generated documents
print("Generated Documents:")
for i, doc in enumerate(generated_documents, start=1):
    print("Document", i, ":", doc)
    print()

In [None]:
# Load NLTK's English stop-word list into a set; preprocess_text below
# tests every word for membership in it, and a set keeps that lookup cheap.
stop_words = set(stopwords.words('english'))

# Define a function to clean and normalize the text
def preprocess_text(text):
    """Normalize raw text into a list of stemmed tokens.

    The text is lowercased, every character that is neither a word
    character nor whitespace is removed, the result is split on
    whitespace, words found in the module-level ``stop_words`` set are
    dropped, and each remaining word is stemmed with the module-level
    ``stemmer``.

    Args:
        text: The input string.

    Returns:
        A list of stemmed tokens in their original order.
    """
    # Punctuation is stripped before splitting, so the stop-word check
    # sees tokens without any punctuation characters.
    without_symbols = re.sub(r'[^\w\s]', '', text.lower())
    return [
        stemmer.stem(token)
        for token in without_symbols.split()
        if token not in stop_words
    ]

# Turn every generated document into its list of stemmed tokens
preprocessed_documents = list(map(preprocess_text, generated_documents))

In [None]:
# Compute Term Frequency (TF)
def compute_tf(document):
    """Return the relative frequency of every token in a document.

    Args:
        document: A sequence of hashable tokens.

    Returns:
        A ``Counter`` mapping each distinct token to its occurrence count
        divided by ``len(document)``. An empty document gives an empty
        ``Counter``.
    """
    token_total = len(document)
    # The division only runs for tokens that exist, so an empty document
    # never divides by zero.
    return Counter({
        token: occurrences / token_total
        for token, occurrences in Counter(document).items()
    })

# Compute Inverse Document Frequency (IDF)
def compute_idf(documents):
    """Compute an inverse document frequency for every word in a corpus.

    Args:
        documents: A sized collection of documents, each an iterable of
            hashable tokens.

    Returns:
        A dict mapping each distinct word to
        ``log(total_docs / (doc_count + 1))``, where ``doc_count`` is the
        number of documents containing the word. An empty corpus gives an
        empty dict.
    """
    total_docs = len(documents)
    # Count each word once per document by de-duplicating the document
    # first; this avoids rescanning every document for every word.
    doc_frequency = Counter(
        word for document in documents for word in set(document)
    )
    # NOTE(review): with the "+ 1" in the denominator, a word that occurs in
    # all but one document scores 0 and a word that occurs in every document
    # scores below 0. Confirm this smoothing variant is the intended one.
    return {
        word: log(total_docs / (doc_count + 1))
        for word, doc_count in doc_frequency.items()
    }

# Compute TF-IDF
def compute_tfidf(documents):
    """Compute TF-IDF weights for every document in a corpus.

    Args:
        documents: A sized collection of tokenized documents, passed to
            ``compute_idf`` as a whole and to ``compute_tf`` one at a time.

    Returns:
        A list with one dict per document, in input order, mapping each
        word of that document to its term frequency multiplied by the
        corpus-wide IDF of the word.
    """
    # The IDF table is computed once and shared by all documents.
    idf_by_word = compute_idf(documents)
    return [
        {term: frequency * idf_by_word[term] for term, frequency in term_freqs.items()}
        for term_freqs in map(compute_tf, documents)
    ]

# Join each document's stemmed tokens into one whitespace-separated string
# for the vectorizer. The corpus is rebuilt from generated_documents rather
# than from preprocessed_documents itself, so re-running this cell never
# joins an already-joined string character by character.
preprocessed_documents = [
    ' '.join(preprocess_text(doc)) for doc in generated_documents
]

# Compute TF-IDF using scikit-learn
# The vectorizer must be instantiated before fit_transform can be called.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)
feature_names = vectorizer.get_feature_names_out()

# Print TF-IDF using scikit-learn
print("TF-IDF using scikit-learn:")
for doc_index in range(tfidf_matrix.shape[0]):
    print("Document", doc_index + 1, ":")
    # Densify one row at a time instead of indexing the sparse matrix
    # element by element.
    row_values = tfidf_matrix[doc_index].toarray().ravel()
    for word, tfidf_value in zip(feature_names, row_values):
        if tfidf_value != 0:
            print(word, ":", tfidf_value)
    print()
