In [1]:
import pandas as pd
import os
import random
import json
from google_trans_new import google_translator  

In [2]:
number_of_profiles = 5 # Selecting number of profiles we would like to find news for
profiles_list = os.listdir('3.profile.data') # location of profile information
 


In [1]:
import nltk
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re


def select_profiles(number, profiles, social_network = "twitter"): # social_network could be: "twitter", "instagram", "googlePlus"
    """
    Random profile selection with innitial preprocessing (translation
    to English) and storing to dataframe.
    """
    translator = google_translator() 
    prof_lst = list()
    count = 0
    while count != number:
        prof = random.choice(profiles)
        with open('3.profile.data/' + prof +'/' + 'twitter-'+ prof + '.json') as f:
            prof = json.load(f)
            if prof['bio']:
                if translator.detect(prof['bio'])[0] != 'en':
                    prof['bio'] = translator.translate(prof['bio'], lang_tgt='en')
                prof_lst.append({'FullName' : prof['fullName'], 'Location': prof['location'], "Bio": prof['bio']})
                count +=1
            
            else:
                pass
    return prof_lst


def normalize_document_cust(doc, norm_approach="lem"): # norm_approach could be: "lem", "stem", None
    
    """
    Custom implementation of text normalizaion approach, removing special characters
    and inconsistent whitespacing, word tokenization, removing stopwords
    """
    wpt = nltk.WordPunctTokenizer()
    stop_words = nltk.corpus.stopwords.words('english')

    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
  
    tokens = wpt.tokenize(doc)

    if norm_approach == "stem":
        stem = PorterStemmer()
        filtered_tokens = [stem.stem(token) for token in tokens if token not in stop_words]
    elif norm_approach == "lem":    
        lem = WordNetLemmatizer()
        filtered_tokens = [lem.lemmatize(token, "v") for token in tokens if token not in stop_words]
    else:
        filtered_tokens = [token for token in tokens if token not in stop_words]
        
    return filtered_tokens

In [4]:
# Creating profile dataframe
profiles = pd.DataFrame.from_dict(select_profiles(number = number_of_profiles, profiles = profiles_list))
#profiles.to_csv("profiles.csv", index=False)

In [5]:
profiles.head()

Unnamed: 0,FullName,Location,Bio
0,Francesco Venier,"Trieste, Italy",Prof. @UniTrieste & @MibSchool. Curious about ...
1,خالد صافي #غزةVerified account,Palestine - Gaza غزة فلسطين,It is not ready to die for my thoughts because...
2,Kathrin Aue,Germany,"PR, Strategic & Visual Communication. Japan. Z..."
3,Steven Fekete,"Pasadena, CA",Music. Film. Food. Culture. Theology. \nI Writ...
4,Trying to focus,152 City Rd London EC1V 2NX,Editor & photographer at Corporate Photography...


In [6]:
profiles['Bio'] = profiles['Bio'].apply(normalize_document_cust).apply(" ".join)

In [7]:
# reading news dataset
news = pd.read_csv("news.csv")

In [8]:

news = news.dropna()
#removing news duplicates if any
news = news.drop_duplicates(subset='title', keep="last").reset_index(drop=True)

In [9]:
print(news.shape)
# randomly selecting 10000 news articles
news = news.sample(n=10000).reset_index(drop=True)
news.shape

(163335, 5)


(10000, 5)

In [10]:
news.head()

Unnamed: 0,id,timestamp,source,title,description
0,99105,2020-07-29 22:01:00,Reuters,"Brazil hits record 69,000 coronavirus cases in...",Brazil's coronavirus outbreak set daily record...
1,188880,2020-11-03 11:04:00,Reuters,Timeline: What to expect on U.S. election nigh...,"The coronavirus pandemic, an unprecedented num..."
2,28841,2020-05-01 21:43:36,Reuters,U.S. health official Fauci testimony to Congre...,Top U.S. health official Anthony Fauci has bee...
3,43584,2020-05-25 14:26:02,Bloomberg,Burundi Ruling Party Candidate Wins Election M...,Burundi ruling party candidate Evariste Ndayis...
4,49172,2020-06-03 17:29:03,Reuters,UK's Johnson tells EU workers to come back but...,Workers from the European Union who left Brita...


In [None]:
from gensim.corpora import Dictionary
from gensim import models
from operator import itemgetter
from gensim import similarities
from gensim.models import Word2Vec
from gensim.similarities import WordEmbeddingSimilarityIndex

from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize


def create_dictionary(docs):
    """
    Cration of article-word dictionaries
    """
    pdocs = [normalize_document_cust(doc, "lem") for doc in docs]
    dictionary = Dictionary(pdocs)
    return dictionary


def docs2bows(corpus, dictionary):
    """
    Generation of bag of words
    """
    docs = [normalize_document_cust(d, "lem") for d in corpus]
    vectors = [dictionary.doc2bow(doc) for doc in docs]
    return vectors


def create_TF_IDF_model(corp):
    """
    Conducting TFIDF 
    """
    dictionary = create_dictionary(corp)
    loaded_corpus = docs2bows(corp, dictionary)
    tfidf = models.TfidfModel(loaded_corpus)
    return tfidf, dictionary


def launch_query(corpus, q, w2v = False): # w2v could be: True, "False - decide on approach TFIDF vs Word2Vec
    """
    Main function to run news similarity analysis and recommendation based on profiles:
    Generates the most similar news to the profile description and ranks tham accordingly
    """
    tfidf, dictionary = create_TF_IDF_model(corpus)
    loaded_corpus = docs2bows(corpus, dictionary)
    pq = normalize_document_cust(q, "lem")
    vq = dictionary.doc2bow(pq)
    
    
    if not w2v:
        index = similarities.MatrixSimilarity(loaded_corpus, num_features=len(dictionary))
        qtfidf = tfidf[vq]
        sim = index[qtfidf]
    else:
        model = Word2Vec(corpus, min_count=1)  # train word-vectors
        termsim_index = similarities.WordEmbeddingSimilarityIndex(model.wv)
        sparse = similarities.SparseTermSimilarityMatrix(termsim_index, dictionary)
        index2 = similarities.SoftCosineSimilarity(loaded_corpus, sparse)
        sim = index2[vq]
    
    ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
    top_news = []
    for doc, score in ranking:
        top_news.append("[ Score = " + "%.3f" % round(score,3) + " ] " + news['title'][doc]) # news is global variable
    return top_news    


In [25]:
"""
Here we print the results (top 10 recommended news) for 5 randomly generated profiles: 
    1. NEWS.DESCRIPTION: a)TFIDF,  b) Word2Vec
    2. NEWS.TITLE: a)TFIDF,  b) Word2Vec
"""

def print_res(res):
    for el in res:
        print("* " + el + '\n')


import warnings
warnings.filterwarnings("ignore")

for pr in range(number_of_profiles):
    
    print("Person: " + profiles['FullName'][pr])
    print("BIO: "+ profiles['Bio'][pr])
    print('--------------NEWS.DESCRIPTION---------------------------------------') 
    print("TFIDF")
    result = launch_query(list(news.description), profiles['Bio'][pr])
    print_res(result[:10])
    print("Word2Vec")
    result = launch_query(list(news.description), profiles['Bio'][pr], True)
    print_res(result[:10])
    print('--------------NEWS.TITLE---------------------------------------------')
    print("TFIDF")
    result = launch_query(list(news.title), profiles['Bio'][pr])
    print_res(result[:10])
    print("Word2Vec")
    result = launch_query(list(news.title), profiles['Bio'][pr], True)
    print_res(result[:10])
    print('######################################################################')
    print()

Person: Francesco Venier
BIO: prof unitrieste mibschool curious technology management bmwgssupslowfoodtravel golf photography addict online since
--------------NEWS.DESCRIPTION---------------------------------------
TFIDF
* [ Score = 0.198 ] China's Lufax ties up with Thai bank for local online wealth management - Reuters

* [ Score = 0.181 ] Trump plays golf for first time since declaring coronavirus a national emergency - Reuters

* [ Score = 0.154 ] Amateur Thompson leading the way at U.S. Open - Reuters UK

* [ Score = 0.147 ] Morgan Stanley online platform for wealthy clients down

* [ Score = 0.141 ] Justin Thomas confident he can hold onto No. 1 this time - Reuters

* [ Score = 0.133 ] Ocado faces second AutoStore lawsuit in UK - Reuters

* [ Score = 0.122 ] Transport giants Volvo Group and Daimler Truck team up to focus on fuel-cell technology

* [ Score = 0.119 ] Wirecard innovation team moves to Berlin-based fintech - Reuters

* [ Score = 0.119 ] Britain's Ocado sued by AutoS

100%|████████████████████████████████████████████████████████████████████████| 18278/18278 [00:00<00:00, 192132.33it/s]


* [ Score = 0.535 ] Morgan Stanley online platform for wealthy clients down

* [ Score = 0.535 ] High-ranking auto exec, GM's CFO Suryadevara, lured by tech startup Stripe - Reuters India

* [ Score = 0.535 ] Ocado faces second AutoStore lawsuit in UK - Reuters

* [ Score = 0.535 ] Britain's Ocado sued by AutoStore over alleged patent infringement - Reuters UK

* [ Score = 0.535 ] Apollo-owned cloud company Rackspace shares slump 20% in Nasdaq debut - Reuters India

* [ Score = 0.535 ] CEOs speed up digital push and downsize offices, KPMG survey shows - Reuters

* [ Score = 0.535 ] China's Lufax ties up with Thai bank for local online wealth management - Reuters

* [ Score = 0.535 ] Pandemic forces Europe's largest tech event to go fully online - Reuters India

* [ Score = 0.507 ] Trump plays golf for first time since declaring coronavirus a national emergency - Reuters

* [ Score = 0.378 ] March's ISM manufacturing index is 49.1, signaling contraction as coronavirus hits economy

----

100%|████████████████████████████████████████████████████████████████████████| 11940/11940 [00:00<00:00, 172073.12it/s]


* [ Score = 0.577 ] Trump plays golf for first time since declaring coronavirus a national emergency - Reuters

* [ Score = 0.577 ] China's Lufax ties up with Thai bank for local online wealth management - Reuters

* [ Score = 0.408 ] Top Turkish, Greek diplomats hold first meeting since crisis, agree on talks - Reuters Canada

* [ Score = 0.408 ] Exclusive: Online retailer Boxed explores $1 billion sale - sources - Reuters

* [ Score = 0.408 ] Key dates since the start of the 2001 war in Afghanistan and efforts to broker peace - Reuters

* [ Score = 0.408 ] Canada's RBC turns heads in U.S. with wealth management recruitment push

* [ Score = 0.408 ] Golf: Woods and Mickelson charity match proves a ratings hit

* [ Score = 0.408 ] Ford's new CEO Farley promises urgency at automaker, shakes up management - Reuters

* [ Score = 0.408 ] This sleep expert also had 'weird dreams and nightmares' since Covid-19. Here's what she does now to sleep better

* [ Score = 0.408 ] U.S. COVID-19 singl

100%|████████████████████████████████████████████████████████████████████████| 18278/18278 [00:00<00:00, 192884.99it/s]


* [ Score = 0.530 ] UK to review potentially faulty England COVID-19 death data - Reuters UK

* [ Score = 0.487 ] Two-time WNBA MVP Candace Parker calls league a leading voice for social justice - Reuters

* [ Score = 0.487 ] America's civil rights leader Joseph Lowery dies, aged 98

* [ Score = 0.487 ] Arizona's Hall of Fame coach Olson dies aged 85 - Reuters

* [ Score = 0.487 ] British singer Vera Lynn dies at 103 - PA Media

* [ Score = 0.487 ] Three dead in stabbing attack in English town of Reading: reports

* [ Score = 0.487 ] COVID-19 vaccine may be ready by year-end: WHO's Tedros - Reuters

* [ Score = 0.487 ] 'Cocoon' actor Wilford Brimley dies age 85 - Reuters

* [ Score = 0.487 ] S.Africa will not have universal basic income grant this year, minister says - Reuters India

* [ Score = 0.487 ] Play ball, South Korea steps up to the plate as MLB idles

--------------NEWS.TITLE---------------------------------------------
TFIDF
* [ Score = 0.279 ] Last flight to Bangkok: 'If I 

100%|████████████████████████████████████████████████████████████████████████| 11940/11940 [00:00<00:00, 189063.14it/s]


* [ Score = 0.514 ] Arizona's Hall of Fame coach Olson dies aged 85 - Reuters

* [ Score = 0.514 ] British singer Vera Lynn dies at 103 - PA Media

* [ Score = 0.514 ] COVID-19 vaccine may be ready by year-end: WHO's Tedros - Reuters

* [ Score = 0.514 ] Utah Jazz coach Jerry Sloan dies at 78

* [ Score = 0.485 ] Indian police crackdown on illegal liquor suppliers after 86 die - Reuters

* [ Score = 0.485 ] Rain aids effort to control blazes in U.S. West; one firefighter dies - Reuters India

* [ Score = 0.485 ] Northern Ireland will not be ready for Jan.1 post-Brexit trade checks: watchdog - Reuters

* [ Score = 0.485 ] British great Stirling Moss dies at 90 after long illness

* [ Score = 0.485 ] Tanzania's former president Benjamin Mkapa dies, presidency says

* [ Score = 0.485 ] Second survivor from capsized cattle ship dies - Kyodo news - Reuters India

######################################################################

Person: Kathrin Aue
BIO: pr strategic visual communicatio

100%|████████████████████████████████████████████████████████████████████████| 18278/18278 [00:00<00:00, 200002.32it/s]


* [ Score = 0.535 ] Japan's Motegi says Japan, Australia have special, strategic partnership - Reuters UK

* [ Score = 0.535 ] Japan, Australia, India discuss strategic ties, regional security - Reuters UK

* [ Score = 0.378 ] With shift toward merit-based pay, Japan's Hitachi to drop old ways

* [ Score = 0.378 ] Exclusive: Japan's GPIF to allow investing up to 31% in foreign bonds, sources say

* [ Score = 0.378 ] Japan government says coincident indicator index suggests economy is worsening - Reuters India

* [ Score = 0.378 ] Japan's search for crew of capsized cattle ship still suspended - Reuters

* [ Score = 0.378 ] Honda goes small with first all-electric car - Reuters India

* [ Score = 0.378 ] Exclusive: Online retailer Boxed explores $1 billion sale - sources - Reuters

* [ Score = 0.378 ] Japan health minister Kato likely to replace Suga as chief cabinet secretary - Nippon TV - Reuters UK

* [ Score = 0.378 ] Deflation fears creep back in Japan as pandemic hits prices

----

100%|████████████████████████████████████████████████████████████████████████| 11940/11940 [00:00<00:00, 188658.59it/s]


* [ Score = 0.535 ] Japan, Australia, India discuss strategic ties, regional security - Reuters UK

* [ Score = 0.507 ] Japan's Motegi says Japan, Australia have special, strategic partnership - Reuters UK

* [ Score = 0.378 ] With shift toward merit-based pay, Japan's Hitachi to drop old ways

* [ Score = 0.378 ] Exclusive: Japan's GPIF to allow investing up to 31% in foreign bonds, sources say

* [ Score = 0.378 ] Japan government says coincident indicator index suggests economy is worsening - Reuters India

* [ Score = 0.378 ] Japan's search for crew of capsized cattle ship still suspended - Reuters

* [ Score = 0.378 ] Japan PM Abe's support rebounds despite ex-justice minister's arrest

* [ Score = 0.378 ] Japan July retail sales fall 2.8% year/year - govt - Reuters India

* [ Score = 0.378 ] Syrian artist's Silent Demonstration watches over German war crimes trial

* [ Score = 0.378 ] Japan health minister Kato likely to replace Suga as chief cabinet secretary - Nippon TV - Reute

100%|████████████████████████████████████████████████████████████████████████| 18278/18278 [00:00<00:00, 192338.16it/s]


* [ Score = 0.447 ] U.S. FDA approves Parkinson's treatment from Sunovion Pharma

* [ Score = 0.447 ] Millions of African children rely on TV education during pandemic - Reuters India

* [ Score = 0.447 ] China’s Confucius Institutes Confronting U.S. Demand to Register

* [ Score = 0.316 ] Explainer - How the coronavirus crisis is affecting food supply

* [ Score = 0.316 ] Harvey Weinstein hit with third sexual assault case in Los Angeles

* [ Score = 0.316 ] Silence no more: Paris airport's neighbours brace as flights resume

* [ Score = 0.316 ] Zidane criticises Madrid attitude in surprise Cadiz defeat - Reuters India

* [ Score = 0.316 ] U.N. agency says humanitarian corridor keeps Africa aid flowing amid coronavirus

* [ Score = 0.316 ] In sign of the times, Ayn Rand Institute approved for PPP loan - Reuters

* [ Score = 0.316 ] What NFL MVP Lamar Jackson is doing to stay mentally tough during self-quarantine

--------------NEWS.TITLE---------------------------------------------
TF

100%|████████████████████████████████████████████████████████████████████████| 11940/11940 [00:00<00:00, 187927.28it/s]


* [ Score = 0.316 ] Explainer - How the coronavirus crisis is affecting food supply

* [ Score = 0.316 ] Mexico's TV Azteca invests in music streaming company Deezer - Reuters

* [ Score = 0.316 ] Thai police arrest student leader over anti-government protest

* [ Score = 0.316 ] Why Hollywood is sticking with movie theaters and only a few films are heading to streaming

* [ Score = 0.316 ] Brown appointed to second stint as Highlanders' head coach - Reuters

* [ Score = 0.316 ] Trump administration isn't going far enough to help student loan borrowers, critics say

* [ Score = 0.316 ] Team Nigma coach rmN- to stand in for KuroKy (arm) - Reuters

* [ Score = 0.316 ] UPDATE 1-Oil Search to write off some PNG exploration assets, power project - Reuters Africa

* [ Score = 0.316 ] Germany encouraged by Pfizer COVID-19 vaccine news, cautious on timeline - Reuters India

* [ Score = 0.316 ] Yemen could face 'catastrophic' food situation as pandemic worsens -FAO

############################

100%|████████████████████████████████████████████████████████████████████████| 18278/18278 [00:00<00:00, 195784.39it/s]


* [ Score = 0.632 ] Singapore charges Citadelle director with falsifying letters related to Wirecard - Reuters

* [ Score = 0.447 ] With shift toward merit-based pay, Japan's Hitachi to drop old ways

* [ Score = 0.447 ] S.Korea stocks fall as raft of negative drivers weigh - Reuters

* [ Score = 0.447 ] BRIEF-Jointown Pharmaceutical Group Gets Approval For Preference Share Issue

* [ Score = 0.447 ] Honda goes small with first all-electric car - Reuters India

* [ Score = 0.447 ] Senate Democrats block coronavirus relief bill

* [ Score = 0.447 ] Ant Group delivers timely boost to Hong Kong listing market

* [ Score = 0.447 ] Some CEOs decline White House dinner for Mexican president amid coronavirus surge - Reuters India

* [ Score = 0.447 ] Google in talks to invest $4 billion in Reliance's digital arm, Bloomberg reports - Reuters UK

* [ Score = 0.447 ] Petrofac warns of project delays on COVID-19, lower oil price hit

--------------NEWS.TITLE---------------------------------------

100%|████████████████████████████████████████████████████████████████████████| 11940/11940 [00:00<00:00, 175887.13it/s]

* [ Score = 0.577 ] Danske Bank says Corporate & Institutions boss Groot to leave - Reuters

* [ Score = 0.577 ] Outlook for European corporate profits worsens further - Reuters UK

* [ Score = 0.577 ] Emerging market corporate credit quality down but not out

* [ Score = 0.577 ] UPDATE 1-China to announce curbs on U.S. media - Global Times editor - Reuters

* [ Score = 0.577 ] U.S. Stock Market Looks Overvalued to Corporate Finance Chiefs

* [ Score = 0.577 ] MIDEAST STOCKS-Oil prices, corporate earnings boost Saudi shares

* [ Score = 0.577 ] BRIEF-Entera Bio Ltd Provides Update On Phase 2 Clinical Trial Of Eb613 In Postmenopausal Women With Osteoporosis - Reuters

* [ Score = 0.577 ] Bumper corporate taxes shelter Irish finances from worst of COVID-19 - Reuters

* [ Score = 0.577 ] Japan revises down Q1 corporate capex to 0.1% yr/yr increase - MOF - Reuters

* [ Score = 0.577 ] BRIEF-Hub International Acquires Assets Of Texas-Based GBC Benefits, Ltd., D/B/A Gus Bates Insurance & Inv


