# News Articles Clustering and Recommendation
This notebook clusters a small synthetic set of news articles: the article text is vectorized with TF-IDF, projected to two dimensions with truncated SVD for visualization, and grouped with KMeans. It also includes a simple recommendation function based on cosine similarity.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity


## 1. Dataset Creation (Synthetic Example)

In [None]:
# Ten hand-written headlines; entry k pairs with entry k of `contents`.
titles = [
    "Breaking: New policy changes announced",
    "Celebrity caught in scandalous event",
    "Scientists discover new element",
    "Fake news about politician's arrest",
    "Health benefits of green tea",
    "Hoax: Alien invasion story",
    "New technology revolutionizes industry",
    "False report on stock market crash",
    "Local team wins championship",
    "Fake claim about vaccine dangers",
]

# One-sentence body for each headline above, in the same order.
contents = [
    "Government announced new policy changes today impacting the economy significantly.",
    "A popular celebrity was reportedly involved in a scandalous event last night.",
    "Scientists have discovered a new chemical element that could change materials science.",
    "Reports about a politician's arrest have been proven false by official sources.",
    "Studies reveal health benefits of drinking green tea daily.",
    "An alien invasion story is circulating online but has been confirmed as hoax.",
    "Innovative technology is set to revolutionize the manufacturing industry.",
    "False reports claim that the stock market crashed today causing panic.",
    "The local sports team won the championship in a thrilling final game.",
    "There are fake claims spreading misinformation about vaccine dangers.",
]

# Article ids are consecutive integers starting at 1, one per headline.
data = {
    'id': range(1, len(titles) + 1),
    'title': titles,
    'content': contents,
}

df = pd.DataFrame(data)

## 2. Dataset Preview

In [None]:
# Print a heading followed by the first five rows of the article table.
print("Dataset Preview:", df.head(), sep="\n")

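Note: a saved output of `NameError: name 'df' is not defined` for the cell above means it was executed before the dataset-creation cell in Section 1. Run the cells in order (Restart Kernel → Run All) to see the preview.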

## 3. Data Preprocessing

In [None]:
# Check for missing values and duplicates.
missing_per_column = df.isnull().sum()

# Compare only the text columns: `id` is unique for every row, so a whole-row
# comparison could never flag a duplicated article.
duplicate_articles = df.duplicated(subset=['title', 'content']).sum()

print("\nMissing values:\n", missing_per_column)
print("\nDuplicates:\n", duplicate_articles)

There are no missing values or duplicate rows to handle here.

Vectorize the 'content' column using TF-IDF.

In [None]:
# Turn each article body into a TF-IDF vector, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
corpus = df['content']
X_tfidf = vectorizer.fit_transform(corpus)

# Report the matrix dimensions (rows = documents, columns = vocabulary terms).
n_docs, n_terms = X_tfidf.shape
print("\nTF-IDF matrix shape:", (n_docs, n_terms))

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Character count of each article body, stored as a new column on `df`.
df['content_length'] = df['content'].map(len)

# Five-bin histogram of those lengths, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df['content_length'], bins=5, color='skyblue')
ax.set_title('Distribution of News Article Content Length')
ax.set_xlabel('Content Length (characters)')
ax.set_ylabel('Number of Articles')
ax.grid(axis='y')
plt.show()

In [None]:
# Dimensionality reduction for visualization: project the TF-IDF matrix onto
# its first two truncated-SVD components so each article becomes a 2-D point.
svd = TruncatedSVD(n_components=2, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c='blue')

# Label every point with the article's actual id rather than its row position,
# so the labels stay correct if ids ever stop being 1..N in row order.
for (comp_1, comp_2), article_id in zip(X_reduced, df['id']):
    # Small horizontal offset keeps the label from sitting on top of the marker.
    ax.text(comp_1 + 0.01, comp_2, str(article_id))

ax.set_title('2D Projection of News Articles via TruncatedSVD')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.grid(True)
plt.show()

## 5. Feature Engineering
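No additional features are engineered. The TF-IDF matrix is the input to the clustering model; the two SVD components are used only for the 2D visualization above.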

## 6. Model Building

In [None]:
# Partition the articles into two groups by running KMeans on the TF-IDF matrix.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 in older versions, 'auto' from 1.4); pinning it keeps the
# cluster assignments reproducible and avoids the FutureWarning emitted by
# the releases in between.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_tfidf)

# Attach each article's cluster label to the article table.
df['cluster'] = clusters

print("\nCluster Assignments:")
print(df[['title', 'cluster']])

## 7. Model Evaluation

In [None]:
# Mean silhouette coefficient of the KMeans labelling, computed on the TF-IDF
# vectors with the (default) Euclidean metric spelled out.
sil_score = silhouette_score(X_tfidf, labels=clusters, metric='euclidean')
print("\nSilhouette Score for clustering:", sil_score)

In [None]:
# Pairwise cosine similarity between every pair of article TF-IDF vectors.
cos_sim_matrix = cosine_similarity(X_tfidf)

# Annotated heatmap of that matrix, with article ids on both axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cos_sim_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    xticklabels=df['id'],
    yticklabels=df['id'],
    ax=ax,
)
ax.set_title('Cosine Similarity Heatmap of News Articles')
ax.set_xlabel('Article ID')
ax.set_ylabel('Article ID')
plt.show()

## 8. Sample Recommendation Function

In [None]:
def recommend_similar_articles(article_id, top_n=3, articles=None, similarity=None):
    """Print the articles most similar to the one with the given id.

    Parameters
    ----------
    article_id : value compared against the ``'id'`` column
        Identifier of the query article. If several rows share the id, the
        first matching row is used.
    top_n : int, default 3
        Maximum number of recommendations to print. Values <= 0 print only
        the header line.
    articles : pandas.DataFrame, optional
        Table with ``'id'`` and ``'title'`` columns. Defaults to the
        notebook-level ``df``.
    similarity : 2-D array-like, optional
        Square similarity matrix whose row/column ``k`` corresponds to the
        ``k``-th row of ``articles``. Defaults to the notebook-level
        ``cos_sim_matrix``.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    IndexError
        If no row of ``articles`` has the requested ``article_id``.
    """
    if articles is None:
        articles = df
    if similarity is None:
        similarity = cos_sim_matrix

    # Work with row *positions* throughout: the similarity matrix is indexed
    # positionally, so index labels must not be used to address it.
    matches = np.flatnonzero((articles['id'] == article_id).to_numpy())
    if matches.size == 0:
        # IndexError is kept as the exception type for backward compatibility.
        raise IndexError(f"No article with id {article_id!r}")
    query_pos = int(matches[0])

    titles = articles['title'].to_numpy()
    scores = np.asarray(similarity)[query_pos]

    # Exclude the query article by position instead of assuming it sorts
    # first: another row can tie with it at similarity 1.0 (e.g. identical
    # text), in which case dropping "the first entry" removes the wrong row.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(scores) if pos != query_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:max(top_n, 0)]

    print(f"Articles similar to '{titles[query_pos]}':")
    for pos, score in ranked:
        print(f"  - {titles[pos]} (Similarity: {score:.2f})")

# Example usage: pick an article id and print its three closest matches.
recommend_similar_articles(4, top_n=3)