# Data Preprocessing

In [2]:
# Importing Necessary Libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
# Load the sample interaction data. The rendered frame has one column per user
# (User_1, User_2) and one row per field (ClickedNewsID, ReadingTimes, ...).
sampleData = pd.read_json(path_or_buf='./SampleUserData.json')

# Bare last expression so the notebook renders the frame as rich output.
sampleData

Unnamed: 0,User_1,User_2
ClickedNews,"[NewsArticle_0, NewsArticle_1, NewsArticle_2, ...","[NewsArticle_1, NewsArticle_2, NewsArticle_3, ..."
ClickedNewsID,"[333, 145, 372, 194]","[187, 492, 445, 5]"
ReadingTimes,"[5, 12, 12, 8, 10]","[12, 14, 8, 6, 10]"
ScrollBehavior,"{'NewsArticle_0': {'20%': 3, '40%': 4, '60%': ...","{'NewsArticle_1': {'0%': 2, '20%': 3, '40%': 1..."
SearchQueries,"[tech, sports, sports]","[AI, politics, health]"
LikedNewsID,"[333, 194]","[492, 187]"
SharedNewsID,"[145, 333]","[187, 492, 5]"
CommentedNews,"{'1': ['Comment 0'], '0': ['Comment 1', 'Comme...","{'187': ['Not great article!', 'Not very helpf..."
BookmarkedNews,"[NewsArticle_4, NewsArticle_2]","[NewsArticle_2, NewsArticle_4]"
DownloadedNews,"[NewsArticle_3, NewsArticle_2]","[NewsArticle_1, NewsArticle_3]"


In [7]:
# Hyperparameters
### Proximity News Data Hyperparameters
# Divisor that ProxiNewsAI_Dataset applies to clicked / liked / shared news IDs.
# Presumably the largest news ID that can occur, which would keep the scaled
# IDs within [0, 1] — TODO confirm against the full news catalogue.
MaxNewsID = 500

In [23]:
class ProxiNewsAI_Dataset(Dataset):
    """Per-user feature extraction for the ProxiNewsAI interaction data.

    ``data`` is read as ``data[userID][field]``. In this notebook it is the
    DataFrame loaded from ``SampleUserData.json``: one column per user and one
    row per field. One dataset sample corresponds to one user (see
    ``__getitem__``).

    All numeric features are currently returned as plain Python lists that are
    scaled by a fixed divisor.
    """

    #TODO: Instead of using List as the Data we need to use PyTorch Tensors.
    #TODO: We need to Normalize the Data with other techniques.

    def __init__(self, data, MaxNewsID, ReadingTimeScale=100, ScrollScale=100):
        """Store the user table and the scaling divisors.

        Args:
            data: User table indexed as ``data[userID][field]``; ``getUsers``
                and ``len()`` additionally need a ``columns`` attribute
                (i.e. a pandas DataFrame).
            MaxNewsID: Divisor applied to clicked / liked / shared news IDs.
            ReadingTimeScale: Divisor applied to reading times. Defaults to
                100, the value that used to be hard-coded.
            ScrollScale: Divisor applied to scroll values. Defaults to 100,
                the value that used to be hard-coded.
        """
        self.data = data
        self.MaxNewsID = MaxNewsID
        self.ReadingTimeScale = ReadingTimeScale
        self.ScrollScale = ScrollScale

    def __len__(self):
        """Return the number of users (columns of ``data``)."""
        return len(self.data.columns)

    def __getitem__(self, idx):
        """Return the features of the user at position ``idx``.

        Map-style ``torch.utils.data.Dataset`` subclasses have to provide this
        method; without it the dataset can neither be indexed nor iterated.

        Args:
            idx: Integer position into ``getUsers()``.

        Returns:
            dict with the key ``'UserID'`` plus one entry per feature, keyed
            by the source field name.

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        userID = self.getUsers()[idx]
        return {
            'UserID': userID,
            'ClickedNewsID': self.ClickedNewsNormalization(userID),
            'ReadingTimes': self.ReadingTimesNormalization(userID),
            'ScrollBehavior': self.ScrollBehaviorNormalization(userID),
            'SearchQueries': self.SearchQueriesTokenization(userID),
            'LikedNewsID': self.LikedNewsNormalization(userID),
            'SharedNewsID': self.SharedNewsNormalization(userID),
        }

    def getUsers(self):
        """Return the user IDs (column labels of ``data``) as a list."""
        return self.data.columns.tolist()

    def _scaled(self, userID, field, divisor):
        """Return ``data[userID][field]`` with every element divided by ``divisor``."""
        return [value / divisor for value in self.data[userID][field]]

    def ClickedNewsNormalization(self, userID):
        """Return the user's ``ClickedNewsID`` values divided by ``MaxNewsID``."""
        return self._scaled(userID, 'ClickedNewsID', self.MaxNewsID)

    def ReadingTimesNormalization(self, userID):
        """Return the user's ``ReadingTimes`` values divided by ``ReadingTimeScale``."""
        return self._scaled(userID, 'ReadingTimes', self.ReadingTimeScale)

    def ScrollBehaviorNormalization(self, userID):
        """Return one list of scaled scroll values per article.

        ``ScrollBehavior`` maps an article name to a dict of scroll-depth
        buckets, e.g. ['20%', '40%', '60%', '80%', '100%']. The values are
        emitted in dict order, so the inner lists follow whatever bucket order
        (and bucket count) the data has for that user.
        """
        scrollBehavior = self.data[userID]['ScrollBehavior']
        return [
            [value / self.ScrollScale for value in buckets.values()]
            for buckets in scrollBehavior.values()
        ]

    #TODO: Need to implement tokenizer for this function

    def SearchQueriesTokenization(self, userID):
        """Return the user's ``SearchQueries`` unchanged (no tokenizer yet)."""
        return self.data[userID]['SearchQueries']

    def LikedNewsNormalization(self, userID):
        """Return the user's ``LikedNewsID`` values divided by ``MaxNewsID``."""
        return self._scaled(userID, 'LikedNewsID', self.MaxNewsID)

    def SharedNewsNormalization(self, userID):
        """Return the user's ``SharedNewsID`` values divided by ``MaxNewsID``."""
        return self._scaled(userID, 'SharedNewsID', self.MaxNewsID)

    def CommentedNewsEncoding(self, userID, newsTitles, verbose=True):
        """Build a text prompt listing the user's comments under each news title.

        Args:
            userID: User whose ``CommentedNews`` entry is encoded.
            newsTitles: Titles indexed by ``int(key)`` for every key of the
                user's ``CommentedNews`` dict.
            verbose: When true (the default), also print the prompt, which is
                what this method did before it returned a value.

        Returns:
            The prompt as a single string.

        Raises:
            IndexError / KeyError: If a ``CommentedNews`` key has no entry in
                ``newsTitles``.
        """
        CommentedNews = self.data[userID]['CommentedNews']
        parts = [f"""Here is News Articles With the comments from the User {userID}"""]
        for NewsArticle, comments in CommentedNews.items():
            parts.append(f"""\n{newsTitles[int(NewsArticle)]}\n""")
            parts.extend(
                f"""Comment_{idx}: {comment}\n"""
                for idx, comment in enumerate(comments)
            )
        prompt_template = "".join(parts)

        if verbose:
            print(prompt_template)
        # Returning the text lets callers feed it to a model instead of only reading it.
        return prompt_template

In [24]:
# Wrap the sample frame and print every user's extracted features.
dataset = ProxiNewsAI_Dataset(data=sampleData, MaxNewsID=MaxNewsID)

for user in dataset.getUsers():
    # Tuple elements are evaluated left to right, i.e. in the same order as
    # the labels in the report below.
    CNN, RTN, SBN, SQT, LNN, SNN = (
        dataset.ClickedNewsNormalization(user),
        dataset.ReadingTimesNormalization(user),
        dataset.ScrollBehaviorNormalization(user),
        dataset.SearchQueriesTokenization(user),
        dataset.LikedNewsNormalization(user),
        dataset.SharedNewsNormalization(user),
    )

    print(f'''User: {user}
          
          Clicked News Normalized: {CNN}.
          Reading Time Normalized: {RTN}.
          Scrolling Time data of user: {SBN}.
          Search Queries tokens: {SQT}.
          Liked News Normalization: {LNN}.
          Shared News Normalization: {SNN}.
          ''')

User: User_1
          
          Clicked News Normalized: [0.666, 0.29, 0.744, 0.388].
          Reading Time Normalized: [0.05, 0.12, 0.12, 0.08, 0.1].
          Scrolling Time data of user: [[0.03, 0.04, 0.05, 0.0, 0.05], [0.04, 0.0, 0.05, 0.0, 0.02], [0.01, 0.04, 0.01, 0.02, 0.01], [0.05, 0.02, 0.01, 0.05, 0.05], [0.02, 0.03, 0.0, 0.0, 0.02]].
          Search Queries tokens: ['tech', 'sports', 'sports'].
          Liked News Normalization: [0.666, 0.388].
          Shared News Normalization: [0.29, 0.666].
          
User: User_2
          
          Clicked News Normalized: [0.374, 0.984, 0.89, 0.01].
          Reading Time Normalized: [0.12, 0.14, 0.08, 0.06, 0.1].
          Scrolling Time data of user: [[0.02, 0.03, 0.01, 0.04, 0.05, 0.02], [0.0, 0.02, 0.03, 0.0, 0.01, 0.01], [0.03, 0.05, 0.04, 0.02, 0.0, 0.05], [0.01, 0.0, 0.03, 0.04, 0.01, 0.0], [0.02, 0.04, 0.05, 0.01, 0.03, 0.04]].
          Search Queries tokens: ['AI', 'politics', 'health'].
          Liked News Normaliza

In [25]:
# Titles are looked up by position: CommentedNewsEncoding converts each key of
# the user's CommentedNews dict with int() and uses it as an index into this list.
news_headlines = [
    "Artificial Intelligence Advances: ChatGPT Leads the Way in Language Processing",
    "Global Climate Crisis: UN Warns of Rising Sea Levels by 2050",
    "Tech Innovations 2025: Quantum Computing Edges Closer to Mainstream",
]

# The method prints the prompt itself; the trailing ';' keeps the cell from
# echoing any return value on top of that.
dataset.CommentedNewsEncoding(userID='User_1', newsTitles=news_headlines);

Here is News Articles With the comments from the User User_1
Global Climate Crisis: UN Warns of Rising Sea Levels by 2050

Comment_0: Comment 0

Artificial Intelligence Advances: ChatGPT Leads the Way in Language Processing

Comment_0: Comment 1

Comment_1: Comment 2

Tech Innovations 2025: Quantum Computing Edges Closer to Mainstream

Comment_0: Comment 2

