In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the Excel file into a DataFrame. extract_keywords() below reads its
# 'description' and 'content' columns, so the sheet must provide both.
file_path = 'news_data_cleaned.xlsx'  # Replace with your actual file path
df = pd.read_excel(file_path)

# Function to extract keywords using TF-IDF
def extract_keywords(texts: pd.DataFrame, top_n: int = 5) -> list:
    """Return the highest-scoring TF-IDF terms for each row of *texts*.

    The ``description`` and ``content`` columns are joined per row with a
    single space and vectorized with a TF-IDF model that drops English stop
    words and keeps at most 1000 features. For every row, the terms that
    actually occur in it are ranked by TF-IDF weight.

    Args:
        texts: DataFrame providing ``description`` and ``content`` columns.
        top_n: Maximum number of keywords to report per row. Values less
            than or equal to zero produce an empty string for every row.

    Returns:
        A list with one string per row, in row order. Each string holds up
        to ``top_n`` terms joined by ``", "``, highest weight first. A row
        with fewer than ``top_n`` scoring terms yields fewer terms; a row
        with none yields ``""``.

    Raises:
        KeyError: If ``description`` or ``content`` is not a column of
            *texts*.
    """
    # Convert each column to text, blanking missing cells. astype(str) alone
    # would turn NaN/None into the literal tokens "nan"/"None", which could
    # then surface as keywords.
    text_columns = []
    for column_name in ('description', 'content'):
        column = texts[column_name]
        text_columns.append(column.astype(str).mask(column.isna(), ""))
    combined_text = text_columns[0] + " " + text_columns[1]

    row_count = len(combined_text)
    # Nothing to rank: no rows, or no keywords requested. Checking top_n here
    # also avoids slicing pitfalls with zero or negative limits.
    if row_count == 0 or top_n <= 0:
        return [""] * row_count

    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    try:
        X = vectorizer.fit_transform(combined_text)
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary ends up empty
        # (e.g. every document is blank or contains only stop words). No
        # keyword can be extracted in that case.
        return [""] * row_count

    feature_names = vectorizer.get_feature_names_out()

    # Walk the CSR structure row by row instead of densifying the whole
    # matrix; only terms stored for a row (i.e. present in it) are ranked,
    # so zero-weight vocabulary entries are never reported.
    X = X.tocsr()  # expected to already be CSR; makes indptr access safe
    keywords = []
    for start, end in zip(X.indptr[:-1], X.indptr[1:]):
        scores = X.data[start:end]
        columns = X.indices[start:end]
        # Stable sort on negated scores: descending weight, ties keep their
        # stored column order so the result is deterministic.
        best = (-scores).argsort(kind="stable")[:top_n]
        keywords.append(", ".join(feature_names[columns[best]]))

    return keywords

# Add one comma-separated keyword string per row as a new column
df['extracted_keywords'] = extract_keywords(df, top_n=5)

# Save the updated DataFrame to a new Excel file. The path is bound once so
# the file written and the file named in the message cannot drift apart.
output_path = 'news_data_with_keywords.xlsx'
df.to_excel(output_path, index=False)

print(f"Keyword extraction complete. Updated file saved as '{output_path}'.")


Keyword extraction complete. Updated file saved as 'news_data_with_keywords.xlsx'.
