In [2]:
import pandas as pd
import numpy as np


In [4]:
# Read the raw news TSV. The file has no header row, so column names are supplied here.
column_names = ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'entities', 'events']
df = pd.read_csv(
    r'C:\Users\Admin\Downloads\archive\news.tsv\news.tsv',
    header=None,
    names=column_names,
    sep='\t',
)
# Build one text field per article: title, a single space, then abstract.
# Missing titles/abstracts are treated as empty strings so the result is never NaN.
df['content'] = df['title'].fillna('').add(' ').add(df['abstract'].fillna(''))

# Text-cleaning dependencies: the cleaned text is recomputed below rather than loaded from the previous notebook
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below (stop-word list first, then WordNet).
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# English stop words as a set, for fast membership tests during token filtering.
stop_words = {*stopwords.words('english')}
# Single shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw article text into a space-joined string of lemmatized tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. replace every run of non-word characters with a single space;
      3. delete digit runs;
      4. split on whitespace;
      5. drop tokens found in the module-level ``stop_words`` set;
      6. lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw text to clean. Any non-string value (for example
            ``None`` or a float NaN from a missing cell) is treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when the
        input is not a string or no tokens survive filtering.
    """
    # Missing cells reach here as None/NaN when the caller has not filled them.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # \W matches anything outside [A-Za-z0-9_] (Unicode-aware), so underscores are kept.
    text = re.sub(r'\W+', ' ', text)
    # Digits are removed, not replaced by a space, so "covid19" becomes "covid".
    text = re.sub(r'\d+', '', text)
    # Stop words are filtered on the raw token, before lemmatization.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean every article's combined text with preprocess_text.
df['clean_content'] = df['content'].map(preprocess_text)

# Vectorize the cleaned text with TF-IDF, keeping at most 5000 vocabulary terms.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['clean_content'])
# Densify into a DataFrame with one row per article and one column per term.
# Note: toarray() materializes the full matrix in memory.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Simulate a user who liked three articles.
# These are positional row numbers in df (used with .iloc), not news ids.
liked_article_indices = [5, 20, 50]

# Display the title and abstract of each liked article.
df[['title', 'abstract']].iloc[liked_article_indices]


Unnamed: 0,title,abstract
5,Should NFL be able to fine players for critici...,Several fines came down against NFL players fo...
20,Elijah Cummings to lie in state at US Capitol ...,"Cummings, a Democrat whose district included s..."
50,40+ Stuffed Pasta Recipes You'll Want To Make ...,Stuff yourself.


In [6]:
# Select the TF-IDF rows of the liked articles (positional lookup, all term columns).
user_profile_matrix = tfidf_df.iloc[liked_article_indices, :]

# The user profile is the per-term mean over the liked articles' rows.
user_profile_vector = user_profile_matrix.mean(axis='index')


In [7]:
# Rank terms by their weight in the user profile and keep the 20 heaviest.
top_terms = user_profile_vector.sort_values(ascending=False).iloc[:20]
print("Top interests of the user:", top_terms, sep="\n")


Top interests of the user:
officiating    0.189334
pasta          0.169911
cummings       0.165341
stuff          0.164510
fine           0.158006
recipe         0.130729
nfl            0.112462
player         0.110428
every          0.109456
want           0.103147
lie            0.090949
make           0.089678
night          0.086172
section        0.083946
elijah         0.083049
capitol        0.082215
week           0.081791
included       0.080784
whose          0.074864
age            0.071907
dtype: float64
