In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.metrics import pairwise_distances
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

In [55]:
# Minimum number of articles a category needs in order to be kept.
MIN_ARTICLES_PER_CATEGORY = 500

df = pd.read_csv("data.csv")
# `columns=` already names the axis, so no separate `axis` argument is needed.
df = df.drop(columns=["article_id", "source_id", "content", "full_content"])

# Number of articles per category, plus the same counts ordered largest first.
category_counts = df.groupby("category").size()
sorted_category_counts = category_counts.sort_values(ascending=False)

# Keep only rows whose category is large enough. Rows with a missing category
# never appear in `category_counts`, so they are excluded too, exactly as a
# groupby-filter would exclude them.
large_categories = category_counts[category_counts >= MIN_ARTICLES_PER_CATEGORY].index
df = df[df["category"].isin(large_categories)]

# Show the per-column null counts before the incomplete rows are removed; a
# bare expression in the middle of a cell would be computed and thrown away.
null_counts_before_drop = df.isnull().sum()
print(null_counts_before_drop)
df = df.dropna()

In [56]:
# langdetect is non-deterministic unless its seed is fixed; setting it before
# the first detection makes repeated runs of the notebook agree with each other.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect assigns to `text`, or None.

    None is returned when `text` is not a string (for example a missing value)
    or when langdetect raises LangDetectException for it.
    """
    if not isinstance(text, str):
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None

In [57]:
# Detect the language of every title and then of every description, and store
# each result in its own helper column.
title_languages = df["title"].apply(detect_language)
df["title_lang"] = title_languages

description_languages = df["description"].apply(detect_language)
df["description_lang"] = description_languages

In [58]:
# Keep only the articles whose title and description were both detected as
# English, then discard the two helper language columns. The drop is chained
# (not in place) so that a filtered slice is never mutated in place, which is
# the pattern that makes pandas emit SettingWithCopyWarning.
is_english = (df["title_lang"] == "en") & (df["description_lang"] == "en")
df = df[is_english].drop(columns=["title_lang", "description_lang"])

In [59]:
# Integer-encode the source and category columns.
# Each column gets its own encoder, kept under its own name.
label_encoder_source, label_encoder_category = LabelEncoder(), LabelEncoder()

source_codes = label_encoder_source.fit_transform(df["source_name"])
category_codes = label_encoder_category.fit_transform(df["category"])

df["source_encoded"] = source_codes
df["category_encoded"] = category_codes

Use a TF-IDF vectorizer to down-weight words that occur frequently across articles and give less frequently occurring, more distinctive words a higher weight.

In [60]:
# Fit a separate TF-IDF vocabulary for each text field, each capped at
# 10,000 terms.
tfid_vectorizer_title = TfidfVectorizer(max_features=10000)
X_tfid_title = tfid_vectorizer_title.fit_transform(df["title"])

tfid_vectorizer_desc = TfidfVectorizer(max_features=10000)
X_tfid_desc = tfid_vectorizer_desc.fit_transform(df["description"])

# Place the title and description features side by side, one row per article.
# CSR is requested explicitly because the combined matrix is indexed by row
# further down, which formats such as COO do not support.
tfidf_combined_features = hstack([X_tfid_title, X_tfid_desc], format="csr")

Stacking the categorical features and vectorized data

In [61]:
def tfidf_based_model(row_index, num_similar_items):
    """Print one article and return the articles closest to it in TF-IDF space.

    Reads the module-level `df` and `tfidf_combined_features`; row `i` of the
    feature matrix is taken to describe the article at position `i` of `df`.

    Parameters
    ----------
    row_index : int
        Position (not index label) of the queried article. Negative values
        count from the end.
    num_similar_items : int
        Maximum number of articles to return.

    Returns
    -------
    pandas.DataFrame
        One row per recommended article, nearest first, with the columns
        'title', 'description', 'source', 'category' and
        'Euclidean similarity with the queried article'. Despite its name, the
        last column holds the Euclidean *distance*: smaller means more similar.

    Raises
    ------
    IndexError
        If `row_index` is outside the feature matrix.
    ValueError
        If `num_similar_items` is negative.
    """
    n_articles = tfidf_combined_features.shape[0]
    if not -n_articles <= row_index < n_articles:
        raise IndexError(
            f"row_index {row_index} is out of range for {n_articles} articles"
        )
    if num_similar_items < 0:
        raise ValueError("num_similar_items must be non-negative")
    # Normalise negative positions so the query can be compared with argsort output.
    query_position = row_index % n_articles

    distances = pairwise_distances(
        tfidf_combined_features,
        tfidf_combined_features[query_position],
        metric='euclidean',
    ).ravel()

    # Remove the queried article by position instead of assuming it sorts
    # first: an exact duplicate also has distance 0 and may be ranked ahead of
    # it, in which case skipping slot 0 would drop the duplicate and return
    # the query itself. The stable sort keeps tied articles in row order.
    ranked = np.argsort(distances, kind="stable")
    indices = ranked[ranked != query_position][:num_similar_items]

    df_similar = pd.DataFrame({
        'title': df['title'].iloc[indices].values,
        'description': df['description'].iloc[indices].values,
        'source': df['source_name'].iloc[indices].values,
        'category': df['category'].iloc[indices].values,
        # Column name kept for compatibility; the values are distances.
        'Euclidean similarity with the queried article': distances[indices]
    })

    queried = df.iloc[query_position]
    print("=" * 30, "Queried article details", "=" * 30)
    print('Title : ', queried['title'])
    print('Description : ', queried['description'])
    print('Source : ', queried['source_name'])
    print('Category : ', queried['category'])
    print("\n", "=" * 25, "Recommended articles : ", "=" * 23)

    return df_similar


In [66]:
# Ask for 11 recommendations for the article at position 2000 and print the
# resulting table as text.
result_df = tfidf_based_model(row_index=2000, num_similar_items=11)
print(result_df)

Title :  NFL DFS, Steelers vs. Titans: DraftKings, FanDuel daily Fantasy football picks on Thursday Night Football
Description :  Daily Fantasy millionaire Mike McClure shares his top NFL DFS picks for Tennessee Titans vs. Pittsburgh Steelers on TNF
Source :  CBS Sports
Category :  Games

                                                title  \
0   NFL DFS Thursday Night Football picks: Steeler...   
1   NFL DFS, Week 9: Top DraftKings, FanDuel daily...   
2   Titans vs. Steelers odds, line, spread: Thursd...   
3   How to watch tonight's Tennessee Titans vs. Pi...   
4   Thursday Night Football odds, spread, line: St...   
5   NBA DFS: Top DraftKings, FanDuel daily Fantasy...   
6   Thursday Night Football Betting Promos: Titans...   
7   Thursday Night Football: Titans lead Steelers ...   
8   Steelers vs. Titans props, odds, best bets, AI...   
9   Report: Titans' DeAndre Hopkins Expected to Pl...   
10  Steelers vs. Titans: Time, live stream, odds, ...   

                         