# Import Libraries

In [1]:
!pip install gdown



In [2]:
import numpy as np
import pandas as pd
import gdown

# Use NLTK for preprocessing
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Use sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Load Dataset

In [3]:
# Download dataset , originally from https://www.kaggle.com/datasets/snapcrack/all-the-news/data
link = "https://drive.google.com/file/d/1qQozIu2yKbGpUt-tIkDNmo5G6BEzgKb3/view?usp=drive_link"
gdown.download(link, quiet=False, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1qQozIu2yKbGpUt-tIkDNmo5G6BEzgKb3
To: /content/articles1.csv
100%|██████████| 204M/204M [00:00<00:00, 208MB/s]


'articles1.csv'

In [5]:
# Use only 50000 rows of data
df1 = pd.read_csv("/content/articles1.csv")

raw_df = df1.copy()
raw_df = raw_df.reset_index(drop=True)

In [6]:
raw_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [7]:
raw_df.shape

(50000, 10)

# Data Preprocessing

In [8]:
df = raw_df.copy()

In [9]:
# Remove unnecessary columns, only use id, title, and content
df.drop(columns = ['Unnamed: 0','publication', 'author', 'date', 'year' , 'month' , 'url'], inplace = True)

In [10]:
df.isna().sum()

id         0
title      0
content    0
dtype: int64

In [11]:
# Remove data with null
df = df.dropna()
df.isna().sum()

id         0
title      0
content    0
dtype: int64

In [12]:
# Remove duplicate data
df=df.drop_duplicates()
df.duplicated().sum()

0

In [13]:
df.head()

Unnamed: 0,id,title,content
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ..."


In [14]:
# Column for text processed content
df['content_processed'] = df['content']

In [15]:
# Column for text processed title
df['title_processed'] = df['title']

In [16]:
df.head()

Unnamed: 0,id,title,content,content_processed,title_processed
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,WASHINGTON — Congressional Republicans have...,House Republicans Fret About Winning Their Hea...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...","After the bullet shells get counted, the blood...",Rift Between Officers and Residents as Killing...
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...","When Walt Disney’s “Bambi” opened in 1942, cri...","Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...","Death may be the great equalizer, but it isn’t...","Among Deaths in 2016, a Heavy Toll in Pop Musi..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...","SEOUL, South Korea — North Korea’s leader, ...",Kim Jong-un Says North Korea Is Preparing to T...


# Lowercase & Remove Stopwords

In [17]:
stop_words = set(stopwords.words('english'))

In [18]:
print("Starting Lowercase & Remove Stopwords Process")
for i in range(len(df["title_processed"])):

    result = ""
    for word in df["title_processed"][i].split():
        word = ("".join(e for e in word if e.isalnum()))
        word = word.lower()
        if not word in stop_words:
            result += word + " "
    df.at[i,"title_processed"] = result.strip()

    result = ""
    for word in df["content_processed"][i].split():
        word = ("".join(e for e in word if e.isalnum()))
        word = word.lower()
        if not word in stop_words:
            result += word + " "
    df.at[i,"content_processed"] = result.strip()

    percentage = (i + 1) / len(df["title_processed"]) * 100

    # Print progress at 25%, 50%, 75%, and 100%
    if percentage in [25, 50, 75, 100]:
        print(f'Progress: {percentage:.2f}%')

print("Lowercase & Remove Stopwords Process Done!")

Starting Lowercase & Remove Stopwords Process
Progress: 25.00%
Progress: 50.00%
Progress: 75.00%
Progress: 100.00%
Lowercase & Remove Stopwords Process Done!


In [19]:
df.head()

Unnamed: 0,id,title,content,content_processed,title_processed
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington congressional republicans new fear...,house republicans fret winning health care sui...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",bullet shells get counted blood dries votive c...,rift officers residents killings persist south...
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...",walt disneys bambi opened 1942 critics praised...,tyrus wong bambi artist thwarted racial bias d...
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...",death may great equalizer isnt necessarily eve...,among deaths 2016 heavy toll pop music new yo...
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...",seoul south korea north koreas leader kim sai...,kim jongun says north korea preparing test lon...


# Lemmatization

In [20]:
lemmatizer = WordNetLemmatizer()

In [21]:
print("Starting Lemmatization Process")

for i in range(len(df["title_processed"])):

    result = ""
    for w in word_tokenize(df["title_processed"][i]):
        result += lemmatizer.lemmatize(w,pos = "v") + " "
    df.at[i, "title_processed"] = result.strip()

    result = ""
    for w in word_tokenize(df["content_processed"][i]):
        result += lemmatizer.lemmatize(w,pos = "v") + " "
    df.at[i, "content_processed"] = result.strip()

    percentage = (i + 1) / len(df["title_processed"]) * 100

    # Print progress at 25%, 50%, 75%, and 100%
    if percentage in [25, 50, 75, 100]:
        print(f'Progress: {percentage:.2f}%')

print("Lemmatization Process Done!")

Starting Lemmatization Process
Progress: 25.00%
Progress: 50.00%
Progress: 75.00%
Progress: 100.00%
Lemmatization Process Done!


In [22]:
# Combine processed title and processed content
df['combined_processed'] = df['title_processed'].astype(str) + ' ' + df['content_processed']

In [23]:
df.head()

Unnamed: 0,id,title,content,content_processed,title_processed,combined_processed
0,17283,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...,washington congressional republicans new fear ...,house republicans fret win health care suit ne...,house republicans fret win health care suit ne...
1,17284,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood...",bullet shell get count blood dry votive candle...,rift officer residents kill persist south bron...,rift officer residents kill persist south bron...
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri...",walt disneys bambi open 1942 critics praise sp...,tyrus wong bambi artist thwart racial bias die...,tyrus wong bambi artist thwart racial bias die...
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t...",death may great equalizer isnt necessarily eve...,among deaths 2016 heavy toll pop music new yor...,among deaths 2016 heavy toll pop music new yor...
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ...",seoul south korea north koreas leader kim say ...,kim jongun say north korea prepare test longra...,kim jongun say north korea prepare test longra...


# Bag of Words

In [24]:
vectorizer = CountVectorizer()
content_processed_features   = vectorizer.fit_transform(df['content_processed'])
combined_processed_features   = vectorizer.fit_transform(df['combined_processed'])

In [25]:
content_processed_features.get_shape()

(50000, 173539)

In [26]:
def find_relevant_articles(news_id, features , news_count = 10 ):
    # Calculate euclidean distance of queried article with other articles
    couple_dist = pairwise_distances(features, features[news_id])
    indices = np.argsort(couple_dist.ravel())[0:news_count+1]
    sorted_dist = np.sort(couple_dist.ravel()[0:news_count+1])

    relevant_articles = df.iloc[indices]
    relevant_articles = pd.DataFrame(relevant_articles)
    relevant_articles["euclidean_similarity_value_with_queried_article"] = sorted_dist

    print('ID : ',df['id'][indices[0]])
    print('Title : ',df['title'][indices[0]])

    print("Recommended articles : ")
    show_columns = ['id','title', 'euclidean_similarity_value_with_queried_article']
    pd.set_option('display.max_colwidth', None)
    return relevant_articles[show_columns].iloc[1:,]

### Bag of words using article content only

In [27]:
find_relevant_articles(1, content_processed_features)


ID :  17284
Title :  Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Recommended articles : 


Unnamed: 0,id,title,euclidean_similarity_value_with_queried_article
7718,26425,One Police Shift: Patrolling an Anxious America - The New York Times,143.387587
1008,18426,Police to Strengthen Force in New York’s Most Violent Borough - The New York Times,145.068949
458,17801,"A Bullet Misses Its Mark, and Then Takes a Fatal Detour - The New York Times",145.996575
3232,20865,"In Week of Emotional Swings, Police Face a Dual Role: Villain and Victim - The New York Times",146.338648
41608,62776,Inside the mind of a black Baltimore cop,146.362564
7687,26379,"A Mother Is Shot Dead on a Playground, and a Sea of Witnesses Goes Silent - The New York Times",147.678705
41377,62463,"There aren’t more police shootings, just more coverage",147.972971
5551,23508,Have the Dallas Police Improved? Depends on Whom You Ask - The New York Times,148.563118
3317,20964,Baton Rouge Shooting Jolts a Nation on Edge - The New York Times,149.254816
3432,21093,"After Dallas Attack, Hundreds of Applicants Answer a Call for Backup - The New York Times",161.006211


### Bag of words using article title and content

In [28]:
find_relevant_articles(1, combined_processed_features)


ID :  17284
Title :  Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Recommended articles : 


Unnamed: 0,id,title,euclidean_similarity_value_with_queried_article
7718,26425,One Police Shift: Patrolling an Anxious America - The New York Times,143.874946
1008,18426,Police to Strengthen Force in New York’s Most Violent Borough - The New York Times,145.715476
3232,20865,"In Week of Emotional Swings, Police Face a Dual Role: Villain and Victim - The New York Times",146.611732
458,17801,"A Bullet Misses Its Mark, and Then Takes a Fatal Detour - The New York Times",146.938763
7687,26379,"A Mother Is Shot Dead on a Playground, and a Sea of Witnesses Goes Silent - The New York Times",147.057812
41608,62776,Inside the mind of a black Baltimore cop,148.11482
5551,23508,Have the Dallas Police Improved? Depends on Whom You Ask - The New York Times,148.768276
41377,62463,"There aren’t more police shootings, just more coverage",149.509197
3317,20964,Baton Rouge Shooting Jolts a Nation on Edge - The New York Times,149.783177
3432,21093,"After Dallas Attack, Hundreds of Applicants Answer a Call for Backup - The New York Times",162.061717


# Bag of Words with normalized vector

In [29]:
normalized_content_processed_features = normalize(content_processed_features, norm='l2', axis=1)
normalized_combined_processed_features = normalize(combined_processed_features, norm='l2', axis=1)


### Bag of words with normalized vector using article content only

In [30]:
find_relevant_articles(1, normalized_content_processed_features)

ID :  17284
Title :  Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Recommended articles : 


Unnamed: 0,id,title,euclidean_similarity_value_with_queried_article
1008,18426,Police to Strengthen Force in New York’s Most Violent Borough - The New York Times,1.196929
7718,26425,One Police Shift: Patrolling an Anxious America - The New York Times,1.203108
3232,20865,"In Week of Emotional Swings, Police Face a Dual Role: Villain and Victim - The New York Times",1.208305
7687,26379,"A Mother Is Shot Dead on a Playground, and a Sea of Witnesses Goes Silent - The New York Times",1.227811
458,17801,"A Bullet Misses Its Mark, and Then Takes a Fatal Detour - The New York Times",1.244021
2793,20392,"A Drumbeat of Multiple Shootings, but America Isn’t Listening - The New York Times",1.253032
6864,25269,Minority Youths Mistrust Police. A Brooklyn High School Has a Plan. - The New York Times,1.254696
5325,23230,Pleading for Peace in Chicago Amid Fears of a Bloody Summer - The New York Times,1.275806
41608,62776,Inside the mind of a black Baltimore cop,1.290221
5551,23508,Have the Dallas Police Improved? Depends on Whom You Ask - The New York Times,1.310301


### Bag of words using article title and content

In [31]:
find_relevant_articles(1, normalized_combined_processed_features)

ID :  17284
Title :  Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Recommended articles : 


Unnamed: 0,id,title,euclidean_similarity_value_with_queried_article
1008,18426,Police to Strengthen Force in New York’s Most Violent Borough - The New York Times,1.197764
7718,26425,One Police Shift: Patrolling an Anxious America - The New York Times,1.200216
3232,20865,"In Week of Emotional Swings, Police Face a Dual Role: Villain and Victim - The New York Times",1.209518
7687,26379,"A Mother Is Shot Dead on a Playground, and a Sea of Witnesses Goes Silent - The New York Times",1.226925
458,17801,"A Bullet Misses Its Mark, and Then Takes a Fatal Detour - The New York Times",1.242335
2793,20392,"A Drumbeat of Multiple Shootings, but America Isn’t Listening - The New York Times",1.251408
6864,25269,Minority Youths Mistrust Police. A Brooklyn High School Has a Plan. - The New York Times,1.25261
5325,23230,Pleading for Peace in Chicago Amid Fears of a Bloody Summer - The New York Times,1.273457
41608,62776,Inside the mind of a black Baltimore cop,1.289565
5551,23508,Have the Dallas Police Improved? Depends on Whom You Ask - The New York Times,1.302727


# TF-IDF

In [32]:
tfidf_vectorizer = TfidfVectorizer()
content_processed_features = tfidf_vectorizer.fit_transform(df['content_processed'])
combined_processed_features = tfidf_vectorizer.fit_transform(df['combined_processed'])

### TF-IDF using article content only

In [33]:
find_relevant_articles(1, content_processed_features)

ID :  17284
Title :  Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Recommended articles : 


Unnamed: 0,id,title,euclidean_similarity_value_with_queried_article
1008,18426,Police to Strengthen Force in New York’s Most Violent Borough - The New York Times,1.356083
47846,70757,8 popular American foods that aren’t what you think they are,1.359055
47615,70471,The story behind Steph Curry’s all-white ’dad shoes’ that blew up the internet,1.363026
46862,69515,This map shows how much money you need to make to live in the 7 biggest US cities,1.363306
44928,67092,Listen to audio of a GOP candidate from Montana allegedly ’body slamming’ a reporter,1.369726
47656,70519,"What happened when the Sam Adams founder told his dad he was quitting a $250,000-a-year job to make beer",1.37306
45609,67930,Madonna cofounded a startup that manufactures viral dance trends — and ’Whip/Nae Nae’ was its first monster hit,1.37856
44834,66981,Mitch McConnell credits former Fox News CEO Roger Ailes for jump-starting his political career,1.379474
47726,70606,"Here’s how the Bastille Day attack unfolded in Nice, France",1.380492
47791,70688,Ivanka Trump is speaking at the GOP convention — here are 12 things you might not know about her,1.384796


### TF-IDF using article title and content

In [34]:
find_relevant_articles(1, combined_processed_features)

ID :  17284
Title :  Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Recommended articles : 


Unnamed: 0,id,title,euclidean_similarity_value_with_queried_article
1008,18426,Police to Strengthen Force in New York’s Most Violent Borough - The New York Times,1.35652
7687,26379,"A Mother Is Shot Dead on a Playground, and a Sea of Witnesses Goes Silent - The New York Times",1.359056
458,17801,"A Bullet Misses Its Mark, and Then Takes a Fatal Detour - The New York Times",1.363239
3232,20865,"In Week of Emotional Swings, Police Face a Dual Role: Villain and Victim - The New York Times",1.364546
7718,26425,One Police Shift: Patrolling an Anxious America - The New York Times,1.370087
22269,41019,"Delegates Are Ball Players In the Game, Voters Are Spectators in the Bleachers - Breitbart",1.372995
6241,24443,"Slain San Antonio Detective Was Targeted, Police Chief Says - The New York Times",1.377437
2793,20392,"A Drumbeat of Multiple Shootings, but America Isn’t Listening - The New York Times",1.378573
53,17345,"Drop in Gang Violence Drove New York City Shootings Below 1,000 in 2016 - The New York Times",1.379507
4193,21937,"In Las Vegas, Rising Murders Strain a Police Force Used to Solving Them - The New York Times",1.384513


# Kesimpulan
Terdapat beberapa cara untuk mencari artikel-artikel yang relevan dengan suatu artikel. Contohnya adalah menggunakan Bag of Words dan TF-IDF. Percobaan ini memberikan hasil
1. Bag of Words akan menghasilkan nilai euclidean similarity yang besar, oleh sebab itu perlu dilakukan normalisasi vektor pada fitur Bag of Words
2. Untuk 10 artikel rekomendasi dengan metode Bag of Words dan Bag of Words dengan Normalized Vector , penggabungan judul artikel dan konten artikel tidak memberikan hasil artikel pilihan yang berbeda dengan hanya menggunakan konten artikel
3. Untuk 10 artikel rekomendasi dengan metode TF-IDF , penggabungan judul artikel dan konten artikel memberikan hasil yang berbeda dengan hanya menggunakan konten artikel. Untuk fitur yang hanya menggunakan konten artikel, terdapat beberapa artikel yang tidak relevan dengan artikel yang dibaca.