In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Custom stop-word list handed to the TF-IDF vectorizer below.
stop_words = [
    'for',
    'is',
    'this',
]

def top_distinctive_words(documents, top_n=5):
    """Return the highest-scoring TF-IDF terms of each document.

    A single TF-IDF model is fitted on ``documents`` (vocabulary capped at
    10000 features, module-level ``stop_words`` removed), so each score is
    relative to the other documents passed in the same call.

    Args:
        documents: Iterable of raw text strings, one per document.
        top_n: Maximum number of terms to return per document. Defaults to 5.
            ``None`` returns every term with a positive score.

    Returns:
        A list with one entry per input document, in input order. Each entry
        is a list of at most ``top_n`` terms sorted by descending TF-IDF
        score; equal scores keep vocabulary (feature-index) order. An empty
        ``documents`` collection yields ``[]``.

    Raises:
        ValueError: If ``top_n`` is negative, or (from scikit-learn) if the
            documents contain no usable terms.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")

    # A bare string is left untouched so the vectorizer can reject it itself;
    # any other iterable is materialized once so it can be checked for emptiness.
    if not isinstance(documents, str):
        documents = list(documents)
        if not documents:
            return []

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words=stop_words)

    # Fit and transform the input documents; keep the result sparse (CSR)
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents).tocsr()

    # Get the feature names (words)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    indptr = tfidf_matrix.indptr
    indices = tfidf_matrix.indices
    scores = tfidf_matrix.data

    # Initialize a list to store the top distinctive words for each document
    top_words_list = []

    # Walk only the stored (non-zero) entries of each row instead of
    # densifying the whole documents x vocabulary matrix.
    for row in range(tfidf_matrix.shape[0]):
        start, end = indptr[row], indptr[row + 1]
        scored_columns = [
            (column, score)
            for column, score in zip(indices[start:end], scores[start:end])
            if score > 0
        ]

        # Highest score first; ties fall back to ascending feature index,
        # i.e. the order of the vectorizer's vocabulary.
        scored_columns.sort(key=lambda pair: (-pair[1], pair[0]))

        top_words_list.append(
            [feature_names[column] for column, _ in scored_columns[:top_n]]
        )

    return top_words_list

# Example usage: three short documents that all mention TF-IDF.
documents = [
    "This is a sample document about TF-IDF. TF-IDF is important in text analysis.",
    "TF-IDF stands for Term Frequency-Inverse Document Frequency.",
    "Another document for TF-IDF demonstration.",
]

top_words_list = top_distinctive_words(documents)

# Report each document under a 1-based heading; the trailing empty item
# yields the blank separator line after every word list.
for doc_number, doc_top_words in enumerate(top_words_list, start=1):
    print(
        f"Top distinctive words for document {doc_number}:",
        doc_top_words,
        "",
        sep="\n",
    )


Top distinctive words for document 1:
['idf', 'tf', 'about', 'analysis', 'important']

Top distinctive words for document 2:
['frequency', 'inverse', 'stands', 'term', 'document']

Top distinctive words for document 3:
['another', 'demonstration', 'document', 'idf', 'tf']



In [6]:
import pandas as pd

# Sample DataFrame: four short texts, two in each of categories A and B.
categories = ['A', 'B', 'A', 'B']
texts = [
    "This is category A text.",
    "Category B text with TF-IDF.",
    "More category A text.",
    "Additional category B text.",
]

# Column name -> column values, in the column order the frame should have
data = {'category': categories, 'text': texts}

df = pd.DataFrame(data)

# Requires top_distinctive_words, defined in the earlier cell (run that cell first)

# Create a function to apply top_distinctive_words to each group
def apply_top_distinctive_words(group):
    """Run ``top_distinctive_words`` on the 'text' column of one group.

    Args:
        group: Object indexable by ``'text'`` whose value offers
            ``.tolist()`` (here, one sub-frame produced by ``groupby``).

    Returns:
        Whatever ``top_distinctive_words`` returns for the group's texts.
    """
    group_texts = group['text'].tolist()
    return top_distinctive_words(group_texts)

# Group the DataFrame by 'category' and apply the function.
# Only the 'text' column is selected before .apply: the helper reads nothing
# else, and passing the grouping column into each group is deprecated in
# recent pandas (it triggers a DeprecationWarning).
result = df.groupby('category')[['text']].apply(apply_top_distinctive_words)

# Display the result: one entry per category, each holding the per-document word lists
result

category
A           [[category, text], [more, category, text]]
B    [[idf, tf, with, category, text], [additional,...
dtype: object