# <div class="alert alert-info"> 2. Setup </div>

In [1]:
#!pip install networkx
#!pip install datapane
#!pip install operator
#!pip install pyvis
#!pip install streamlit pyvis networkx
#!pip install langdetect

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import zipfile
import json
import urllib
import langdetect

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')

import spacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 


import networkx as nx
from networkx.algorithms import community #This part of networkx, for community detection, needs to be imported separately
import datapane as dp
#from operator import itemgetter
from pyvis.network import Network

# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is a deprecated path.
from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mai\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


---
# <div class="alert alert-info"> 3. Data Preparation </div>

 ## <font color=red>**News category dataset**

The dataset used is the “News Category Dataset” from Kaggle (https://www.kaggle.com/rmisra/news-category-dataset). It provides around 200k news headlines from 2012 to 2018, obtained from HuffPost.

## <font color=red>Load data
    
The original dataset is distributed as a JSON file. In this notebook I load a preprocessed CSV version of it (`../data/processed/News_Category.csv`) directly into a pandas DataFrame.

In [2]:
# Read the preprocessed news table and give it a clean 0..n-1 index.
df = pd.read_csv('../data/processed/News_Category.csv').reset_index(drop=True)
print(df.shape)
# Show the first five rows (head() is not a random sample).
df.head(5)

(26768, 6)


Unnamed: 0,category,headline,short_description,text,lang,text_clean
0,BUSINESS,"U.S. Launches Auto Import Probe, China Vows To...",The investigation could lead to new U.S. tarif...,"U.S. Launches Auto Import Probe, China Vows To...",en,u launch auto import probe china vow defend in...
1,BUSINESS,Starbucks Says Anyone Can Now Sit In Its Cafes...,The new policy was unveiled weeks after the co...,Starbucks Says Anyone Can Now Sit In Its Cafes...,en,starbuck say anyon sit cafe even without buy a...
2,BUSINESS,Seattle Passes Controversial New Tax On City's...,"Following the council vote, Amazon’s vice pres...",Seattle Passes Controversial New Tax On City's...,en,seattl pas controversi new tax citi biggest co...
3,BUSINESS,Uber Ends Forced Arbitration In Individual Cas...,Victims will be free to go to court -- but a f...,Uber Ends Forced Arbitration In Individual Cas...,en,uber end forc arbitr individu case sexual assa...
4,BUSINESS,"Chili's Hit By Data Breach, Credit And Debit C...",The breach is believed to have occurred betwee...,"Chili's Hit By Data Breach, Credit And Debit C...",en,chili hit data breach credit debit card inform...


In [3]:
# Number of rows per news category, most frequent first.
df['category'].value_counts()

TRAVEL          9855
FOOD & DRINK    6208
BUSINESS        5878
SPORTS          4827
Name: category, dtype: int64

In [4]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=False, lst_stopwords=None):
    '''
    Preprocess a single string.

    The value is converted with str(), lower-cased, stripped and split on
    whitespace. Punctuation is NOT removed. Stop-word removal, stemming and
    lemmatisation are then applied in that order, each only when requested.
    :parameter
        :param text: object - the text to clean; it is passed through str(),
                     so None becomes "none"
        :param flg_stemm: bool - whether stemming (Porter) is to be applied
        :param flg_lemm: bool - whether lemmatisation (WordNet) is to be applied
        :param lst_stopwords: iterable of str or None - stopwords to remove
                     (compared against the lower-cased tokens)
    :return
        cleaned text: the remaining tokens joined by single spaces
    '''
    ## clean (convert to lowercase and strip surrounding whitespace)
    text = str(text).lower().strip()

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords (a set gives constant-time membership tests)
    if lst_stopwords is not None:
        stopwords = frozenset(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [5]:
# English stop-word list. It is loaded here but not applied below, because the
# preprocessing call explicitly passes lst_stopwords=None.
lst_stopwords = nltk.corpus.stopwords.words("english")

# Rebuild `text_clean` from the raw short descriptions; with both flags off and
# no stop-words this only lower-cases the text and normalises whitespace.
df['text_clean'] = df["short_description"].apply(
    lambda raw_text: utils_preprocess_text(
        raw_text, flg_stemm=False, flg_lemm=False, lst_stopwords=None
    )
)

df.head()

Unnamed: 0,category,headline,short_description,text,lang,text_clean
0,BUSINESS,"U.S. Launches Auto Import Probe, China Vows To...",The investigation could lead to new U.S. tarif...,"U.S. Launches Auto Import Probe, China Vows To...",en,the investigation could lead to new u.s. tarif...
1,BUSINESS,Starbucks Says Anyone Can Now Sit In Its Cafes...,The new policy was unveiled weeks after the co...,Starbucks Says Anyone Can Now Sit In Its Cafes...,en,the new policy was unveiled weeks after the co...
2,BUSINESS,Seattle Passes Controversial New Tax On City's...,"Following the council vote, Amazon’s vice pres...",Seattle Passes Controversial New Tax On City's...,en,"following the council vote, amazon’s vice pres..."
3,BUSINESS,Uber Ends Forced Arbitration In Individual Cas...,Victims will be free to go to court -- but a f...,Uber Ends Forced Arbitration In Individual Cas...,en,victims will be free to go to court -- but a f...
4,BUSINESS,"Chili's Hit By Data Breach, Credit And Debit C...",The breach is believed to have occurred betwee...,"Chili's Hit By Data Breach, Credit And Debit C...",en,the breach is believed to have occurred betwee...


In [24]:
# From here on the analysis only uses the TRAVEL category.
df = df.loc[df['category'] == 'TRAVEL']

## <font color=red>Split text to sentences

In [25]:
# Split every cleaned description into sentences with the spaCy pipeline.
df = df.reset_index(drop=True)
sent_list = []      # sentence text, wrapped in single quotes
id_list = []        # positional index in df of the description the sentence came from
category_list = []  # category of that description

# nlp.pipe processes the texts as a stream in batches, which avoids the
# per-call overhead of invoking nlp() once per row; the Docs come back in
# input order, so enumerate() recovers the row position.
categories = df['category'].tolist()
for i, doc in enumerate(nlp.pipe(df['text_clean'].tolist())):
    for sent in doc.sents:
        id_list.append(i)
        sent_list.append("'" + str(sent) + "'")
        category_list.append(categories[i])

# One row per sentence; only the text column is kept.
sent_df = pd.DataFrame({'text_clean': sent_list})
print(sent_df.shape)

(16747, 1)


In [30]:
# Rebuild the sentence table from `sent_list` (one row per sentence, text only).
sent_df = pd.DataFrame({'text_clean': sent_list})
print(sent_df.shape)

(16747, 1)


In [31]:
# Preview the first five sentences.
sent_df.head(5)

Unnamed: 0,text_clean
0,'having waterproof covers on the seats is kind...
1,"'charming towns, relaxing beaches and top hiki..."
2,'star wars: galaxy's edge will open at disneyl...
3,'these underrated travel destinations in europ...
4,'if you’re dreaming about a romantic european ...


---
# <div class="alert alert-info"> 4. Information Retrieval </div>

### Find documents containing a given word

In [85]:
# Make sure every entry is a string so the .str accessor works on all rows.
sent_df['text_clean'] = sent_df['text_clean'].astype(str)

# Keep only the sentences that mention the search term. regex=False makes the
# term a literal substring instead of a regular expression.
sdf = sent_df[sent_df['text_clean'].str.contains("london", regex=False)]
print(sdf.shape)
# reset_index() (without drop) keeps the original row labels in an 'index' column.
sdf = sdf.reset_index()

# Preview up to six matches; head() avoids the IndexError that positional
# access raises when fewer than six sentences match.
for text in sdf['text_clean'].head(6):
    print("\n", text)

(156, 1)

 'if only we could apparate ourselves to london.'

 'whenever jackie and i are planning a trip to london (which is as often as possible), we check to see what’s playing at the'

 'doug's trip to london was paws-itively precious.'

 'eye see you the baku eye is azerbaijan’s version of the london eye.'

 'london is a tourist’s paradise owing to the several iconic attractions it has to offer.therefore,it makes sense why london'

 'london is giving new meaning to slip and slide.'


In [86]:
# Candidate sentences for inspecting the parser output. Only the LAST entry is
# analysed: it becomes `sentence`, which the following cells reuse. Reorder
# the list to try a different one.
example_sentences = [
    'singapore is currently the most competitive city in the world, beating out new york and london, according to the economist',
    'challengers to silicon valley include new york, l.a., boston, tel aviv, and london.',
    'ed young is the senior pastor of fellowship church, which is headquartered in grapevine, texas but has rapidly expanded across texas, to florida, london (uk) and online.',
    'she earned her bfa at london college of communication, uk',
    'london -- a classic mercedes-benz race car driven by formula 1 legend juan manuel fangio sold for 19.6 million pounds ($29.6',
    'prices soared as people scrambled to flee the london bridge attack',
    "at a london branch of britain's biggest retailer, tesco , which found horse dna in some of its own-brand frozen spaghetti",
    'maria perez is the co-founder and product manager of glassful',
    'london is a tourist’s paradise owing to the several iconic attractions it has to offer.therefore,it makes sense why london',
    "investigators in washington and london last month struck a $450 million settlement with barclays in a rate-rigging case, but",
    'according to a newly released report, the united states is predicted to win the most medals at the london 2012 olympics.',
]
sentence = example_sentences[-1]

# Parse the sentence with the spaCy pipeline.
doc = nlp(sentence)

# Table of coarse POS, fine-grained tag and dependency label per token.
# The dependency column is 10 wide in header and rows so the columns line up.
print(f"{'TOKEN':{10}} {'COARSE':{8}} {'FINE':{6}} {'DESCRIPTION FINE':{50}} {'DEPENDENCY':{10}} {'DESCRIPTION DEPENDENCY'}")
print(f"{'-----':{10}} {'------':{8}} {'----':{6}} {'----------------':{50}} {'----------':{10}} {'----------------------'}")

for token in doc:
    # spacy.explain() returns None for labels it does not know; None cannot be
    # formatted with a width spec, so fall back to an empty description.
    tag_description = spacy.explain(token.tag_) or ''
    dep_description = spacy.explain(token.dep_) or ''
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {tag_description:{50}} {token.dep_:{10}} {dep_description}')

TOKEN      COARSE   FINE   DESCRIPTION FINE                                   DEPENDENCY DESCRIPTION DEPENDENCY
-----      ------   ----   ----------------                                   ---------- ----------------------
according  VERB     VBG    verb, gerund or present participle                 prep     prepositional modifier
to         ADP      IN     conjunction, subordinating or preposition          prep     prepositional modifier
a          DET      DT     determiner                                         det      determiner
newly      ADV      RB     adverb                                             advmod   adverbial modifier
released   VERB     VBN    verb, past participle                              amod     adjectival modifier
report     NOUN     NN     noun, singular or mass                             pobj     object of preposition
,          PUNCT    ,      punctuation mark, comma                            punct    punctuation
the        DET      DT     determiner

In [87]:
# extract pattern: adj + N
def extract_pattern1(doc):
    """Extract "modifier + noun" pairs from a parsed sentence.

    A modifier is a token whose coarse POS is ADJ, or a VERB used as an
    adjectival modifier (dependency ``amod``). The most recent modifier is
    attached to the next NOUN/PROPN that is not a ``compound`` part, and is
    then forgotten. Compound tokens are skipped without clearing the modifier.

    :param doc: iterable of tokens exposing ``text``, ``pos_`` and ``dep_``
    :return: (nouns, modifiers) - two parallel lists
    """
    nouns, modifiers = [], []
    pending = ""  # last modifier seen that has not been attached to a noun yet
    for tok in doc:
        is_adjective = tok.pos_.endswith("ADJ")
        is_participle = tok.dep_.endswith("amod") and tok.pos_.endswith("VERB")
        if is_adjective or is_participle:
            pending = tok.text

        is_nominal = tok.pos_.endswith("NOUN") or tok.pos_.endswith("PROPN")
        if not is_nominal or tok.dep_.endswith("compound"):
            continue

        if pending:
            nouns.append(tok.text)
            modifiers.append(pending)
        # Any head noun consumes the pending modifier, paired or not.
        pending = ""

    return nouns, modifiers
            
# NOTE: `clause` is assigned but not used; the call below parses `sentence`.
clause = 'location was great, just a few minute walk to supermarket'
obj1, obj2 = extract_pattern1(nlp(sentence))
print(obj1, obj2, sep="\n")

['report', 'medals']
['released', 'most']


In [88]:
# extract pattern: N + adj

def extract_pattern2(doc):
    """Extract "noun ... adjective" pairs (the adjective follows the noun).

    A NOUN/PROPN is remembered as the current entity when its dependency
    label is ROOT, an apposition (``appos``) or any subject label (one that
    contains ``subj``); any other NOUN/PROPN clears the entity. When an ADJ
    token is reached, it is paired with the current entity, provided an AUX
    token has been seen earlier in the doc. The AUX flag is never reset once
    set, and every ADJ clears the current entity.

    :param doc: iterable of tokens exposing ``text``, ``pos_`` and ``dep_``
    :return: (obj1, obj2) - parallel lists of entities and their adjectives
    """
    entity = ""
    obj1 = []
    obj2 = []
    seen_aux = False
    for tok in doc:
        # extract subject/ root
        if tok.pos_.endswith("NOUN") or tok.pos_.endswith("PROPN"):
            # A containment test instead of `find("subj") == True`, which only
            # held when "subj" happened to start at index 1 of the label.
            is_head = (
                tok.dep_.endswith("ROOT")
                or tok.dep_.endswith("appos")
                or "subj" in tok.dep_
            )
            entity = tok.text if is_head else ""

        if tok.pos_.endswith("AUX"):
            seen_aux = True

        if tok.pos_.endswith("ADJ"):
            if entity and seen_aux:
                obj1.append(entity)
                obj2.append(tok.text)
            entity = ""

    return obj1, obj2

# NOTE: `clause` is assigned but not used; the call below parses `sentence`.
clause = 'Great location for shops, restaurants and access to public transport to explore London'
obj1, obj2 = extract_pattern2(nlp(sentence))
print(obj1, obj2, sep="\n")

['states']
['most']


In [89]:
# extract pattern: Sbj + N + N +..+N

def extract_pattern3(doc):
    """Pair the subject/root noun of a sentence with its other noun arguments.

    Only NOUN/PROPN tokens that are not ``compound`` parts are considered.
    A token whose dependency label ends with ROOT, subj or nsubjpass becomes
    the current head; one whose label ends with poss, attr, pobj, dobj or
    conj becomes the pending tail. Whenever both a head and a tail are set,
    the pair is recorded and the tail is cleared (the head is kept). A tail
    seen before the first head is held until a head appears.

    :param doc: iterable of tokens exposing ``text``, ``pos_`` and ``dep_``
    :return: (heads, tails) - two parallel lists
    """
    nominal_pos = ("NOUN", "PROPN")
    head_deps = ("ROOT", "subj", "nsubjpass")
    tail_deps = ("poss", "attr", "pobj", "dobj", "conj")

    heads, tails = [], []
    head, tail = "", ""
    for tok in doc:
        if not tok.dep_.endswith("compound") and tok.pos_.endswith(nominal_pos):
            if tok.dep_.endswith(head_deps):
                head = tok.text
            elif tok.dep_.endswith(tail_deps):
                tail = tok.text

        # Checked on every token, so a tail held from before the head is
        # emitted as soon as the head shows up.
        if head and tail:
            heads.append(head)
            tails.append(tail)
            tail = ""

    return heads, tails

# NOTE: `clause` is assigned but not used; the call below parses `sentence`.
clause = 'location was great, just a few minute walk to supermarket'
obj1, obj2 = extract_pattern3(nlp(sentence))
print(obj1, obj2, sep="\n")

['states', 'states', 'states']
['report', 'medals', 'olympics']


In [90]:
# extract compound 

def extract_pattern4(doc):
    """Extract (compound part, head noun) pairs such as ('united', 'states').

    Only NOUN/PROPN tokens are looked at. A token with dependency
    ``compound`` is remembered as the modifier (a later compound overwrites
    an earlier one); the next non-compound NOUN/PROPN is paired with it and
    clears it.

    :param doc: iterable of tokens exposing ``text``, ``pos_`` and ``dep_``
    :return: (compound_parts, head_nouns) - two parallel lists
    """
    compound_parts, head_nouns = [], []
    modifier = ""
    for tok in doc:
        if not (tok.pos_.endswith("NOUN") or tok.pos_.endswith("PROPN")):
            continue

        if tok.dep_.endswith("compound"):
            modifier = tok.text
            continue

        if modifier:
            compound_parts.append(modifier)
            head_nouns.append(tok.text)
        modifier = ""

    return compound_parts, head_nouns

# Try the compound extractor on the example sentence.
obj1, obj2 = extract_pattern4(nlp(sentence))
print(obj1, obj2, sep="\n")

['united']
['states']


#### combine patterns

In [91]:
def extract_info(clause):
    """Run all four pattern extractors on one clause and concatenate the results.

    The clause is parsed once and the same Doc is handed to every extractor
    (they only read it), instead of re-running the pipeline for each pattern.

    :param clause: str - text to parse with the module-level ``nlp`` pipeline
    :return: (list_obj1, list_obj2) - parallel lists holding the pairs from
             extract_pattern1..4, in that order
    """
    doc = nlp(clause)
    list_obj1 = []
    list_obj2 = []
    for extractor in (extract_pattern1, extract_pattern2, extract_pattern3, extract_pattern4):
        obj1, obj2 = extractor(doc)
        list_obj1.extend(obj1)
        list_obj2.extend(obj2)

    return list_obj1, list_obj2

# NOTE: `clause` is assigned but not used; the call below parses `sentence`.
clause = 'location was good for me , off street parking'
list_obj1, list_obj2 = extract_info(sentence)
(list_obj1, list_obj2)

(['report', 'medals', 'states', 'states', 'states', 'states', 'united'],
 ['released', 'most', 'most', 'report', 'medals', 'olympics', 'states'])

In [92]:
# Collect the extracted pairs from every sentence that matched the search term.
list_obj1 = []
list_obj2 = []

for sent in sdf['text_clean']:
    node1, node2 = extract_info(sent)
    # extend() appends in place; `list = list + new` would copy the whole
    # accumulated list on every iteration.
    list_obj1.extend(node1)
    list_obj2.extend(node2)

#### create data frame to contain nodes of graph

In [93]:
# One row per extracted pair; `ID` is a running number that is counted later
# to obtain the pair frequency.
graph_df = pd.DataFrame({'obj1': list_obj1, 'obj2': list_obj2})
graph_df['ID'] = range(len(graph_df))
print(graph_df.shape)
graph_df.head()

(745, 3)


Unnamed: 0,obj1,obj2,ID
0,jackie,trip,0
1,jackie,london,1
2,trip,doug,2
3,trip,london,3
4,trip,paws,4


In [94]:
# Count how often each ordered (obj1, obj2) pair occurs and call that count
# `weight`. Pairs are ordered: (a, b) and (b, a) are counted separately.
weight_graph_df = (
    graph_df
    .groupby(['obj1', 'obj2'])
    .count()
    .reset_index()
    .rename(columns={'ID': 'weight'})
    .drop_duplicates(subset=['obj1', 'obj2'], keep='last')
)
# Display the heaviest pairs first (the stored frame itself stays unsorted).
weight_graph_df.sort_values(['weight'], ascending=False)

Unnamed: 0,obj1,obj2,weight
455,new,york,6
613,summer,olympics,4
410,london,world,4
366,london,olympics,3
331,london,city,3
...,...,...,...
235,fountain,london,1
236,fountain,rome,1
237,fountain,trevi,1
238,francisco,brand,1


# Network Analysis (with NetworkX)

In [95]:
# Generate a networkx graph from the (obj1, obj2) pairs; no edge attributes
# are attached here, only the topology is used in this section.
G = nx.from_pandas_edgelist(weight_graph_df, 'obj1', 'obj2')

# Give the graph a name
G.name = 'Hotel Interactions Network'

# Check whether graph is directed or undirected (False = undirected)
print(G.is_directed())

# Obtain general information of graph. nx.info() was removed in NetworkX 3.0,
# so the same summary is printed by hand.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
print(f"Name: {G.name}")
print(f"Type: {type(G).__name__}")
print(f"Number of nodes: {n_nodes}")
print(f"Number of edges: {n_edges}")
if n_nodes > 0:
    # Mean of the node degrees; skipped for an empty graph to avoid dividing by zero.
    average_degree = sum(degree for _, degree in G.degree()) / n_nodes
    print(f"Average degree: {average_degree:8.4f}")

# Get graph density
density = nx.density(G)
print("Network density:", density)

False
Name: Hotel Interactions Network
Type: Graph
Number of nodes: 626
Number of edges: 697
Average degree:   2.2268
Network density: 0.0035629392971246007


In [96]:
# Get the most connected node, i.e. the (node, degree) pair with the highest
# degree; on ties the first such pair in iteration order is returned.
max(G.degree(), key=lambda node_degree: node_degree[1])

('london', 129)

# Network Visualization (with Pyvis)

https://towardsdatascience.com/customizing-networkx-graphs-f80b4e69bedf

https://www.cl.cam.ac.uk/teaching/1314/L109/tutorial.pdf


https://www.toptal.com/data-science/graph-data-science-python-networkx



In [97]:
# Define function to generate Pyvis visualization
def generate_network_viz(df, source_col, target_col, weights, 
                         layout='barnes_hut',
                         central_gravity=0.15,
                         node_distance=420,
                         spring_length=100,
                         spring_strength=0.15,
                         damping=0.96,
                         minium_weight: int = 0,
                         ):
    """Build a PyVis network from an edge-list DataFrame.

    :param df: edge-list DataFrame
    :param source_col: name of the column holding the source node
    :param target_col: name of the column holding the target node
    :param weights: forwarded as the fourth positional argument
                    (``edge_attr``) of ``nx.from_pandas_edgelist``
    :param layout: 'repulsion' selects the repulsion solver on a dark
                   background; any other value runs ``barnes_hut()`` with its
                   defaults on a white background
    :param central_gravity, node_distance, spring_length, spring_strength,
           damping: physics settings, applied ONLY when layout == 'repulsion'
    :param minium_weight: accepted but currently not used by this function
    :return: the configured ``pyvis.network.Network`` object
    """
    use_repulsion = layout == 'repulsion'

    # Dark theme for the repulsion layout, light theme otherwise
    bgcolor, font_color = ('#222222', 'white') if use_repulsion else ('white', 'black')

    # Build the networkx graph and translate it to the PyVis graph format
    graph = nx.from_pandas_edgelist(df, source_col, target_col, weights)
    net = Network(
        height='700px',
        width='100%',
        bgcolor=bgcolor,
        font_color=font_color,
        notebook=True,
    )
    net.from_nx(graph)

    if not use_repulsion:
        # Default Barnes Hut solver with PyVis' own default physics settings
        net.barnes_hut()
        return net

    net.repulsion(
        node_distance=node_distance,
        central_gravity=central_gravity,
        spring_length=spring_length,
        spring_strength=spring_strength,
        damping=damping,
    )
    return net

### Barnes Hut Visualization
Barnes-Hut is a quadtree-based gravity model.
It is the fastest, default and recommended solver for non-hierarchical layouts.

In [110]:
# Keep the moderately connected nodes: degree strictly between 5 and 100.
selected_nodes = []
for node, degree in G.degree():
    if 5 < degree < 100:
        print(node, degree)
        selected_nodes.append(node)

mouthfuls 6
airport 7
fan 10
passengers 7
royal 7
boom 6
hotels 7
building 6
capital 6
chances 6
york 11
charles 6
city 14
paris 7
time 6
place 7
craig 6
day 11
trip 10
shopping 6
world 9
eros 7
events 6
olympics 8
top 8
eyes 12
folks 7
best 10
hundreds 6
knightsbridge 8
library 6
list 6
pubs 6
travel 6
neighborhood 11
plan 8
overs 6
stories 9
summer 6
way 10


In [111]:
# Keep only the edges that touch at least one of the selected nodes.
small_nw = weight_graph_df.loc[weight_graph_df['obj1'].isin(selected_nodes) | weight_graph_df['obj2'].isin(selected_nodes)]

# Node colour per coarse POS tag of the node label.
node_color = {'NOUN':'lightblue', 'PRON':'lightblue', 'PROPN':'yellow', 'VERB': 'red', 'ADJ':'red', 'ADV':'red', 'DET':'red', 'X':'grey', 'INTJ':'grey',
              'AUX':'grey','NUM':'grey','SPACE':'grey','PUNCT':'grey','SCONJ':'grey','ADP':'grey'}
# Generate the PyVis network based on the subset data
net_repulsion = generate_network_viz(small_nw, 'obj1','obj2', 'weight', layout='repulsion')

node_list = [node['id'] for node in net_repulsion.nodes]

# Tag each label on its own and take the POS of its first token; a label
# that parses to an empty Doc gets an empty tag instead of an IndexError.
type_node = []
for label in node_list:
    parsed = nlp(label)
    type_node.append(parsed[0].pos_ if len(parsed) > 0 else '')

# Tags missing from `node_color` (e.g. CCONJ, PART, SYM) fall back to grey
# instead of raising a KeyError.
for node, pos in zip(net_repulsion.nodes, type_node):
    node['color'] = node_color.get(pos, 'grey')

net_repulsion.show('hotel_interactions_network_room.html')

# Run the above code chunk in order to display the graph visualization below