In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the BBC News training set from a CSV in the working directory into a DataFrame
df = pd.read_csv("BBC News Train.csv")

In [3]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [4]:
df.shape

(1490, 3)

## Using TF-IDF and Cosine Similarity

In [5]:
# Preprocessing and vectorization:
# build TF-IDF features from the article text, dropping scikit-learn's built-in
# English stop-word list. fit_transform learns the vocabulary from df['Text'] and
# returns a sparse matrix with one row per article.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Text'])

In [6]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 213866 stored elements and shape (1490, 24456)>

In [7]:
# Compute the pairwise similarity matrix: entry [i, j] is the cosine similarity
# between the TF-IDF rows of article i and article j (rows follow df's row order).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [8]:
cosine_sim

array([[1.        , 0.01361175, 0.01403186, ..., 0.03403756, 0.01904735,
        0.01310931],
       [0.01361175, 1.        , 0.09153991, ..., 0.08192573, 0.03405815,
        0.01489347],
       [0.01403186, 0.09153991, 1.        , ..., 0.04428513, 0.04458582,
        0.03509888],
       ...,
       [0.03403756, 0.08192573, 0.04428513, ..., 1.        , 0.03968907,
        0.01465114],
       [0.01904735, 0.03405815, 0.04458582, ..., 0.03968907, 1.        ,
        0.01856889],
       [0.01310931, 0.01489347, 0.03509888, ..., 0.01465114, 0.01856889,
        1.        ]])

In [9]:
def recommend_articles(article_id, df, cosine_sim, top_n=3):
    """Return the ``top_n`` articles most similar to the article ``article_id``.

    Parameters
    ----------
    article_id
        Value to look up in ``df['ArticleId']``. If the id occurs more than
        once, the first matching row is used as the query.
    df : pandas.DataFrame
        Must contain ``ArticleId``, ``Text`` and ``Category`` columns. Its rows
        must be in the same order as the rows/columns of ``cosine_sim``.
    cosine_sim
        Square similarity matrix indexable by row position, where
        ``cosine_sim[i][j]`` is the similarity between rows ``i`` and ``j``
        of ``df``.
    top_n : int, default 3
        Maximum number of recommendations. Values below 1 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        The ``ArticleId``, ``Text`` and ``Category`` columns of the recommended
        rows, ordered from most to least similar. The query row is never
        included.

    Raises
    ------
    IndexError
        If ``article_id`` is not present in ``df['ArticleId']``.
    """
    # Locate the query by row *position*, not index label: both `cosine_sim`
    # and `df.iloc` below are positional, so a filtered or re-indexed frame
    # would otherwise be looked up at the wrong row (or fail outright).
    positions = (df['ArticleId'] == article_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"ArticleId {article_id!r} not found in df['ArticleId']")
    idx = int(positions[0])

    # Similarity of the query article to every article
    scores = cosine_sim[idx]

    # Exclude the query by position rather than by assuming it sorts first:
    # an exact duplicate earlier in the frame ties at 1.0 and, with a stable
    # sort, would push the query itself into the recommendations.
    candidates = [pos for pos in range(len(scores)) if pos != idx]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    # Clamp so a zero or negative top_n gives an empty result instead of an odd slice
    article_indices = candidates[:max(top_n, 0)]

    # Return the recommended articles
    return df.iloc[article_indices][['ArticleId', 'Text', 'Category']]


In [11]:
df[['Text']][df['ArticleId']==154]

Unnamed: 0,Text
1,german business confidence slides german busin...


In [12]:
# Example: recommend articles similar to ArticleId 154 (default top_n=3)
recommended_articles = recommend_articles(article_id=154, df=df, cosine_sim=cosine_sim)
print("Recommended Articles:")
print(recommended_articles)

Recommended Articles:
     ArticleId                                               Text  Category
57          40  german growth goes into reverse germany s econ...  business
360       1374  uk economy facing  major risks  the uk manufac...  business
332        173  australia rates at four year high australia is...  business


In [10]:
df[df['ArticleId']==154]

Unnamed: 0,ArticleId,Text,Category
1,154,german business confidence slides german busin...,business


## Using BERT / Sentence Transformers

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell
# so the notebook's dependencies are visible up front.
from sentence_transformers import SentenceTransformer

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the text column to get embeddings (progress bar shown while encoding).
# The texts are passed in df's row order; the recommender defined later in this
# notebook indexes df by embedding position, so that alignment must be preserved.
embeddings = model.encode(df['Text'].tolist(), show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 47/47 [00:04<00:00, 11.73it/s]


In [12]:
# Compute cosine similarity between all article embeddings.
# NOTE: this rebinds `cosine_sim`, replacing the TF-IDF similarity matrix built in the
# earlier section; re-running the TF-IDF recommendation cell after this one would
# silently use these embedding-based scores instead.
# The embedding-based recommend_articles defined below computes its own similarities
# from `embeddings` and does not read this variable.
cosine_sim = cosine_similarity(embeddings, embeddings)

In [23]:
def recommend_articles(input_text, df, model, embeddings, top_n=3, exclude_threshold=0.9999):
    """Recommend the articles whose embeddings are closest to ``input_text``.

    NOTE: this reuses the name of the TF-IDF ``recommend_articles(article_id,
    df, cosine_sim, ...)`` defined earlier in the notebook; once this cell
    runs, the earlier definition is replaced.

    Parameters
    ----------
    input_text : str
        Query text; encoded with ``model`` into the same space as ``embeddings``.
    df : pandas.DataFrame
        Must contain ``ArticleId``, ``Text`` and ``Category`` columns, with rows
        in the same order as the rows of ``embeddings``.
    model
        Object exposing ``encode(list_of_texts)`` that returns a 2-D array of
        embeddings (here a ``SentenceTransformer``).
    embeddings
        2-D array of article embeddings, one row per row of ``df``.
    top_n : int, default 3
        Maximum number of recommendations. Values below 1 yield an empty frame.
    exclude_threshold : float or None, default 0.9999
        Articles whose similarity to the query is at or above this value are
        dropped. This is how the query article itself is excluded when
        ``input_text`` comes from ``df``; it also drops exact or near-exact
        duplicates of the query. Pass ``None`` to disable the filter, e.g.
        for free-text queries.

    Returns
    -------
    pandas.DataFrame
        The ``ArticleId``, ``Text`` and ``Category`` columns of the recommended
        rows, ordered from most to least similar.
    """
    # Encode the input text into the same embedding space
    input_embedding = model.encode([input_text])

    # Similarity between the query and every article, as a flat 1-D array
    sim_scores = cosine_similarity(input_embedding, embeddings).flatten()

    # Rank positions by descending similarity. A stable sort on the negated
    # scores keeps tied articles in their original row order.
    ranked = (-sim_scores).argsort(kind='stable')

    # Drop the query article (and any near-identical text) by similarity score
    if exclude_threshold is not None:
        ranked = ranked[sim_scores[ranked] < exclude_threshold]

    # Clamp so a zero or negative top_n gives an empty result instead of an odd slice
    top_indices = ranked[:max(top_n, 0)]

    # Return the recommended articles
    return df.iloc[top_indices][['ArticleId', 'Text', 'Category']]


In [24]:
# Use the text of ArticleId 154 as the query (first match if the id repeats)
input_article = df.loc[df['ArticleId'] == 154, 'Text'].values[0]

In [25]:
input_article

'german business confidence slides german business confidence fell in february knocking hopes of a speedy recovery in europe s largest economy.  munich-based research institute ifo said that its confidence index fell to 95.5 in february from 97.5 in january  its first decline in three months. the study found that the outlook in both the manufacturing and retail sectors had worsened. observers had been hoping that a more confident business sector would signal that economic activity was picking up.   we re surprised that the ifo index has taken such a knock   said dz bank economist bernd weidensteiner.  the main reason is probably that the domestic economy is still weak  particularly in the retail trade.  economy and labour minister wolfgang clement called the dip in february s ifo confidence figure  a very mild decline . he said that despite the retreat  the index remained at a relatively high level and that he expected  a modest economic upswing  to continue.  germany s economy grew 1.

In [26]:
# Recommend the 3 articles whose embeddings are most similar to the query text
recommended_articles = recommend_articles(input_article, df, model, embeddings, top_n=3)
print("Recommended Articles:")
print(recommended_articles)

Recommended Articles:
     ArticleId                                               Text  Category
57          40  german growth goes into reverse germany s econ...  business
979        705  newest eu members underpin growth the european...  business
109        334  ecb holds rates amid growth fears the european...  business
