In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

import os
import math
import time

import matplotlib.pyplot as plt
import seaborn as sns


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

In [None]:
# Location of the news dataset on the mounted Drive; the file holds one JSON
# record per line, hence lines=True.
NEWS_JSON_PATH = "/content/drive/My Drive/newsdataset.json"
dataset = pd.read_json(NEWS_JSON_PATH, lines=True)

In [None]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory use.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   category           200853 non-null  object        
 1   headline           200853 non-null  object        
 2   authors            200853 non-null  object        
 3   link               200853 non-null  object        
 4   short_description  200853 non-null  object        
 5   date               200853 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


In [None]:
# Preview the first five rows of the loaded frame.
dataset.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,"There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV",Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-amanda-painter-mass-shooting_us_5b081ab4e4b0802d69caad89,She left her husband. He killed their children. Just another day in America.,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song,Andy McDonald,https://www.huffingtonpost.com/entry/will-smith-joins-diplo-and-nicky-jam-for-the-official-2018-world-cup-song_us_5b09726fe4b0fdb2aa541201,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-grant-marries_us_5b09212ce4b0568a880b9a8c,The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carrey-adam-schiff-democrats_us_5b0950e8e4b0fdb2aa53e675,The actor gives Dems an ass-kicking for not fighting hard enough against Donald Trump.,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-margulies-trump-poop-bag_us_5b093ec2e4b0fdb2aa53df70,"The ""Dietland"" actress said using the bags is a ""really cathartic, therapeutic moment.""",2018-05-26


In [None]:
# Keep only articles whose headline has more than five whitespace-separated
# words, then report how many rows remain.
dataset = dataset.loc[dataset['headline'].str.split().str.len() > 5]
print(len(dataset))

180543


In [None]:
# Restrict the data to articles dated 1 January 2018 or later.
cutoff_date = pd.Timestamp(year=2018, month=1, day=1)
dataset = dataset.loc[dataset['date'] >= cutoff_date]

In [None]:
# Number of rows currently in `dataset`.
print(dataset.shape[0])

8530


In [None]:
# Order by headline (descending) and discard every article whose headline
# occurs more than once. keep=False removes *all* copies of a repeated
# headline, not just the extras.
dataset = (
    dataset
    .sort_values('headline', ascending=False)
    .drop_duplicates('headline', keep=False)
)
print(len(dataset))

8485


In [None]:
# Count missing values in each column.
dataset.isna().sum()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

In [None]:
# Number of distinct values in the `category` column.
print(dataset['category'].nunique())

26


In [None]:
# Group articles by category, then show the number of groups and the
# article count inside each one.
categories = dataset.groupby('category')
print(f'total categories: {categories.ngroups}')
print(categories.size())

total categories: 26
category
ARTS & CULTURE      13
BLACK VOICES       406
BUSINESS            85
COLLEGE              1
COMEDY             443
CRIME              170
EDUCATION           31
ENTERTAINMENT     1699
GREEN               28
HEALTHY LIVING      15
IMPACT              73
LATINO VOICES       83
MEDIA              290
PARENTS             32
POLITICS          3042
QUEER VOICES       451
RELIGION            63
SCIENCE             40
SPORTS             364
STYLE               34
TASTE                9
TECH                53
TRAVEL              72
WEIRD NEWS         205
WOMEN              226
WORLD NEWS         557
dtype: int64


In [None]:
# Renumber rows 0..n-1 so that row labels and positions coincide; the
# integer lookups used later in the notebook rely on this.
dataset = dataset.reset_index(drop=True)

In [None]:
# Abbreviated weekday and month of publication joined by "_", e.g. "Mon_Apr".
dataset["day and month"] = dataset["date"].dt.strftime("%a_%b")

In [None]:
# Work on an independent copy so the text preprocessing below leaves the
# original headlines in `dataset` untouched.
dataset_cp = dataset.copy(deep=True)

In [None]:
# Fetch the NLTK stop-word corpus (a no-op when it is already present) and
# load the English stop words used to filter headline tokens.
import nltk
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def _clean_headline(text, stop_words):
    """Normalise one headline for vectorisation.

    Each whitespace-separated word is stripped of non-alphanumeric characters
    and lower-cased; words that end up empty or that are stop words are
    dropped. The surviving tokens are joined with single spaces.
    """
    kept = []
    for word in text.split():
        token = "".join(ch for ch in word if ch.isalnum()).lower()
        # A punctuation-only word (e.g. "-" or "&") collapses to "". Skip it so
        # the cleaned headline never contains doubled spaces.
        if token and token not in stop_words:
            kept.append(token)
    return " ".join(kept)


# Set membership is O(1) per token, unlike scanning the stop-word list.
_stop_words = set(stop)

# Iterate positionally so the loop does not depend on the index labels
# being exactly 0..n-1.
_cleaned_headlines = []
for i, text in enumerate(dataset_cp["headline"]):
    _cleaned_headlines.append(_clean_headline(text, _stop_words))
    if(i%1000==0):
      print(i)           # To track number of records processed
dataset_cp["headline"] = _cleaned_headlines

0
1000
2000
3000
4000
5000
6000
7000
8000


In [None]:
# Preview the first five rows of the working copy.
dataset_cp.head()

Unnamed: 0,category,headline,authors,link,short_description,date,day and month
0,QUEER VOICES,grace creator donate gay bunny book every grade school indiana,Elyse Wanshel,https://www.huffingtonpost.com/entry/will-grace-creator-donate-john-olivers-gay-bunny-book-to-every-elementary-school-in-indiana_us_5ac28265e4b00fa46f854225,It's about to be a lot easier for kids in Mike Pence's home state to read “A Day in the Life of Marlon Bundo.”,2018-04-02,Mon_Apr
1,QUEER VOICES,voice blind auditions make history first trans contestant,"Lyndsey Parker, Yahoo Entertainment",https://www.huffingtonpost.com/entry/the-voice-blind-auditions-make-history-with-first-trans-contestant_us_5a9ece6ee4b002df2c5e39c2,"Austin Giorgio, 21: “How Sweet It Is (To Be Loved by You)” Young crooners have appeared on singing competitions since “American",2018-03-06,Tue_Mar
2,QUEER VOICES,penumbra queer audio drama didnt know needed,"Sarah Emily Baum, ContributorFreelance Writer",https://www.huffingtonpost.com/entry/the-penumbra-is-the-queer-audio-drama-you-didnt_us_5a48f900e4b0df0de8b06b29,"Young, fun, fantastical and, most notably, inclusive, the show is a must-listen for young queer people.",2018-01-05,Fri_Jan
3,COMEDY,opposition gives trump hot lawyer,Ed Mazza,https://www.huffingtonpost.com/entry/trump-hot-lawyer_us_5af5381ee4b0e57cd9f7fe84,"He's here to make a ""strong case"" for the president.",2018-05-11,Fri_May
4,ENTERTAINMENT,stranger things fans able visit upside irl,Elyse Wanshel,https://www.huffingtonpost.com/entry/stranger-things-fans-will-be-able-to-visit-the-upside-down-irl_us_5ac3e3a6e4b063ce2e56c26a,"Hawkins is headed to Hollywood, Orlando and Singapore this fall.",2018-04-03,Tue_Apr


In [None]:
# Create the WordNet lemmatizer and download the NLTK resources requested
# here: 'punkt' and 'wordnet'.
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('punkt')
nltk.download('wordnet')
 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Tokenise every headline, reduce each token to its verb lemma
# (e.g. "gives" -> "give") and re-join the tokens with single spaces.
dataset_cp['headline'] = dataset_cp['headline'].map(
    lambda text: " ".join(
        lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text)
    )
)
    

In [None]:
# Preview the first five rows of the working copy again.
dataset_cp.head()

Unnamed: 0,category,headline,authors,link,short_description,date,day and month
0,QUEER VOICES,grace creator donate gay bunny book every grade school indiana,Elyse Wanshel,https://www.huffingtonpost.com/entry/will-grace-creator-donate-john-olivers-gay-bunny-book-to-every-elementary-school-in-indiana_us_5ac28265e4b00fa46f854225,It's about to be a lot easier for kids in Mike Pence's home state to read “A Day in the Life of Marlon Bundo.”,2018-04-02,Mon_Apr
1,QUEER VOICES,voice blind audition make history first trans contestant,"Lyndsey Parker, Yahoo Entertainment",https://www.huffingtonpost.com/entry/the-voice-blind-auditions-make-history-with-first-trans-contestant_us_5a9ece6ee4b002df2c5e39c2,"Austin Giorgio, 21: “How Sweet It Is (To Be Loved by You)” Young crooners have appeared on singing competitions since “American",2018-03-06,Tue_Mar
2,QUEER VOICES,penumbra queer audio drama didnt know need,"Sarah Emily Baum, ContributorFreelance Writer",https://www.huffingtonpost.com/entry/the-penumbra-is-the-queer-audio-drama-you-didnt_us_5a48f900e4b0df0de8b06b29,"Young, fun, fantastical and, most notably, inclusive, the show is a must-listen for young queer people.",2018-01-05,Fri_Jan
3,COMEDY,opposition give trump hot lawyer,Ed Mazza,https://www.huffingtonpost.com/entry/trump-hot-lawyer_us_5af5381ee4b0e57cd9f7fe84,"He's here to make a ""strong case"" for the president.",2018-05-11,Fri_May
4,ENTERTAINMENT,stranger things fan able visit upside irl,Elyse Wanshel,https://www.huffingtonpost.com/entry/stranger-things-fans-will-be-able-to-visit-the-upside-down-irl_us_5ac3e3a6e4b063ce2e56c26a,"Hawkins is headed to Hollywood, Orlando and Singapore this fall.",2018-04-03,Tue_Apr


In [None]:
# Bag-of-words model: learn the vocabulary of the headlines in `dataset_cp`
# and encode each headline as a row of term counts (one column per term).
headline_vectorizer = CountVectorizer()
headline_features = headline_vectorizer.fit_transform(dataset_cp['headline'])

In [None]:
# Shape of the count matrix: (number of headlines, vocabulary size).
print(headline_features.shape)

(8485, 11122)


In [None]:
# Remove the column-width limit so long headlines are displayed in full.
pd.set_option('display.max_colwidth', None) 

In [None]:
def bag_of_words(row_index,sim_items):
    """Recommend the headlines closest to one article in bag-of-words space.

    Distances are Euclidean distances between rows of ``headline_features``
    (computed with ``pairwise_distances``); smaller means more similar.

    Args:
        row_index: Positional row of the queried article in ``dataset`` /
            ``headline_features``.
        sim_items: Number of nearest rows to collect, counting the queried
            article itself, so ``sim_items - 1`` recommendations are returned.

    Returns:
        DataFrame with columns ``publish_date``, ``headline`` and
        ``similarity`` (which holds the distance, despite its name), ordered
        from nearest to farthest, with the queried article's row removed.

    Raises:
        IndexError: If ``row_index`` is not a valid row position.
    """
    n_rows = headline_features.shape[0]
    if not 0 <= row_index < n_rows:
        raise IndexError(
            "row_index {} is out of range for {} articles".format(row_index, n_rows)
        )

    # Distances are measured against the queried row (row_index), not against
    # the row whose number happens to equal sim_items.
    couple_dist = pairwise_distances(headline_features, headline_features[row_index]).ravel()

    # Rank all other rows by distance. The query is pinned to position 0
    # explicitly because another article with an identical cleaned headline
    # ties at distance 0 and could otherwise be reported as the query.
    ranked = np.argsort(couple_dist, kind="stable")
    ranked = ranked[ranked != row_index]
    indices = np.concatenate(([row_index], ranked))[:sim_items]

    # Positional lookups (.iloc) keep this correct whatever the index labels are.
    df = pd.DataFrame({'publish_date': dataset['date'].iloc[indices].values,
                       'headline': dataset['headline'].iloc[indices].values,
                       'similarity': couple_dist[indices]})
    print("-"*30,"Queried article details ","-"*30)
    print("The headline is ",dataset['headline'].iloc[row_index])
    print("\n","="*25,"Recommended articles : ","="*23)
    # Row 0 is the queried article itself; return only the recommendations.
    return df.iloc[1:,]

In [None]:
# Call with row_index=133 and sim_items=11; the function drops the first of
# the 11 collected rows, so 10 recommendations are displayed.
bag_of_words(133,11)

------------------------------ Queried article details  ------------------------------
The headline is  ‘RuPaul’s Drag Race All Stars 3’ Episode 6 Recap: Which Queen Returned To The Competition?



Unnamed: 0,publish_date,headline,similarity
1,2018-03-16,'RuPaul's Drag Race All Stars 3' Episode 8 Recap: Which Queen Snatched The Crown?,2.0
2,2018-02-02,‘RuPaul’s Drag Race All Stars 3’ Episode 2 Recap: Sour Milk,2.236068
3,2018-02-12,‘RuPaul’s Drag Race All Stars 3’ Episode 3 Recap: Who Won The Bitchelor's Heart?,2.236068
4,2018-02-16,‘RuPaul’s Drag Race All Stars 3’ Episode 4 Recap: Who Won Snatch Game?,2.236068
5,2018-01-29,‘RuPaul’s Drag Race All Stars 3’ Episode 1 Recap: Which Queen Went Home First?,2.236068
6,2018-03-09,‘RuPaul’s Drag Race All Stars 3’ Episode 7 Recap: We Can Never Go Back To Before,2.44949
7,2018-02-23,‘RuPaul’s Drag Race All Stars 3’ Episode 5 Recap: The Warhol Ball Crowns One Pop Art Queen,2.828427
8,2018-03-08,"'RuPaul's Drag Race' Reveals Guest Judges, Stars For Season 10",3.162278
9,2018-03-16,Trixie Mattel Sounds Off On Shocking 'RuPaul's Drag Race All Stars' Win,3.162278
10,2018-01-18,'RuPaul's Drag Race' Stars Open Up About Mental Health And The Toll Of Superstardom,3.162278


In [None]:
# TF-IDF model of the cleaned headlines. min_df=1 keeps every term that
# occurs in at least one headline, i.e. the whole vocabulary. An integer
# min_df of 0 selects the same terms but is rejected by the parameter
# validation in recent scikit-learn releases (integers must be >= 1).
tfidfvect = TfidfVectorizer(min_df=1)
tfidf_headline_features = tfidfvect.fit_transform(dataset_cp['headline'])


In [None]:
def tfidf_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to one article in TF-IDF space.

    Distances are Euclidean distances between rows of
    ``tfidf_headline_features`` (computed with ``pairwise_distances``);
    smaller means more similar.

    Args:
        row_index: Positional row of the queried article in ``dataset`` /
            ``tfidf_headline_features``.
        num_similar_items: Number of nearest rows to collect, counting the
            queried article itself, so ``num_similar_items - 1``
            recommendations are returned.

    Returns:
        DataFrame with columns ``publish_date``, ``headline`` and
        ``Euclidean similarity with the queried article`` (which holds the
        distance, despite its name), ordered from nearest to farthest, with
        the queried article's row removed.

    Raises:
        IndexError: If ``row_index`` is not a valid row position.
    """
    n_rows = tfidf_headline_features.shape[0]
    if not 0 <= row_index < n_rows:
        raise IndexError(
            "row_index {} is out of range for {} articles".format(row_index, n_rows)
        )

    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index]).ravel()

    # Rank all other rows by distance. The query is pinned to position 0
    # explicitly because another article with an identical cleaned headline
    # ties at distance 0 and could otherwise be reported as the query.
    ranked = np.argsort(couple_dist, kind="stable")
    ranked = ranked[ranked != row_index]
    indices = np.concatenate(([row_index], ranked))[:num_similar_items]

    # Positional lookups (.iloc) keep this correct whatever the index labels are.
    df = pd.DataFrame({'publish_date': dataset['date'].iloc[indices].values,
               'headline':dataset['headline'].iloc[indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices]})
    print("="*30,"Queried article details","="*30)
    print('headline : ',dataset['headline'].iloc[row_index])
    print("\n","="*25,"Recommended articles : ","="*23)

    # Row 0 is the queried article itself; return only the recommendations.
    return df.iloc[1:,]
# Query the article at row 132; 11 nearest rows are collected and the first
# (the query itself) is dropped, leaving 10 recommendations.
tfidf_based_model(132, 11)

headline :  Woman Given 5 Years In Prison For Illegally Voting Says She Didn't Know She Was Ineligible



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-04-05,Trump Claims Without Evidence That Millions Of People Are Voting Illegally In California,1.211374
2,2018-04-05,Donald Trump Says He Didn't Know About Stormy Daniels Payment,1.216741
3,2018-02-23,6 Things You Didn't Know About Michael B. Jordan,1.231291
4,2018-02-16,Zendaya Wants You To Know She Didn’t Give Blake Lively The Side-Eye,1.238326
5,2018-03-11,Trump Brags That He Won Most Of The Women's Vote In 2016. He Didn't.,1.248094
6,2018-01-22,"Pennsylvania Supreme Court Strikes Down State's Congressional Map, Saying It Illegally Benefits GOP",1.266391
7,2018-05-18,Bill Gates Says Trump Didn't Know The Difference Between HPV And HIV,1.267833
8,2018-05-07,Connecticut To Give Its Electoral College Votes To National Popular Vote Victor,1.269602
9,2018-03-22,Innocent Man Awarded $1 Million After Spending 31 Years In Prison,1.273952
10,2018-02-09,Omarosa Turns On Trump: Wouldn't Vote For Him Again 'In A Million Years',1.276054
