In [155]:
import os
import re
import json
import anew
import string
import nltk
import spacy
import pandas as pd
from nltk.corpus import stopwords
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import MinMaxScaler
from gensim.models import Phrases

In [156]:
pd.set_option('display.max_colwidth', -1)
stop_words_eng = set(stopwords.words('english'))
te = TransactionEncoder()
scaler = MinMaxScaler(feature_range=(1,10))
nlp = spacy.load('en_core_web_sm')

In [157]:
#input_doc_path = "./data/tweets_nestle_2016-07-23_2018-07-23.json"
input_doc_path = "./data/tweets_sample.json"

with open(input_doc_path,"r") as f:
    if input_doc_path.endswith(".json"):
        doc = [json.loads(line) for line in f]
    elif input_doc_path.endswith(".csv"):
        doc = pd.read_csv(f, delimiter=',')

In [158]:
doc = pd.DataFrame(doc)

In [159]:
# rename column names to standardize data schema
doc = doc.rename(columns={"id_reference_num":"id","delivery":"source","username":"source","tweet":"text"})

In [160]:
#doc[["tweet","mentions"]].head(10)
# only select key columns and ignore rest of the dataset
doc = doc[["id","source","date","text"]]
doc.head()

Unnamed: 0,id,source,date,text
0,757000564941160448,UrsulaPflug,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216672624t
1,756998556729348096,nhCoco,2016-07-23,"@ConagraFoods @pepsi Tell @Pepsi @GeneralMills @KelloggsUS @ConAgraFoods @Nestle you want on-package GMO labels, not QR codes | @ewg | http://bit.ly/29KHaqC"
2,756997874760749058,samanthashahi,2016-07-23,"Hey @Nestle - California is in a drought and you're using our water, can you stop? Oh, we're on fire too pic.twitter.com/O6jdQFHKUv"
3,756997317086044160,DRCSBoivin,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216671109t
4,756997063855013892,funkyourmother,2016-07-23,I like the notion that everyone in Europe started coping with Brexit by eating @Nestle chocolate pic.twitter.com/7YUYPVIJZK


In [161]:
"""
def extract_hashtag(x):
    ht_ls = re.findall(r"#(\w+)", x)     # find hashtag
    return list(set(ht_ls))

doc["hashtags"] = doc["text"].apply(lambda x: extract_hashtag(x))
"""

'\ndef extract_hashtag(x):\n    ht_ls = re.findall(r"#(\\w+)", x)     # find hashtag\n    return list(set(ht_ls))\n\ndoc["hashtags"] = doc["text"].apply(lambda x: extract_hashtag(x))\n'

In [162]:
exclude_punctuation = set([p for p in string.punctuation if p not in ["!",",",".","?",";"]])
# remove mentions, url, hashtag
def strip_text(t):
    #t = re.sub(r'\.?@\w+\:?',"",t)
    t = re.sub(r'\d+','',t)
    t = re.sub('https?://.+',"",t)             # url
    t = re.sub('pic\.twitter\.com.+',"",t)     # url
    t = re.sub("[#]", " ", t)                  # remove hashtag
    #t = t.lower()                              # lower case
    return ''.join(ch for ch in t if ch not in exclude_punctuation)   # remove unnessary punctuation

doc["text_clean"] = doc["text"].apply(lambda x: strip_text(x))

In [163]:
doc.head()

Unnamed: 0,id,source,date,text,text_clean
0,757000564941160448,UrsulaPflug,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216672624t,.KathleenWynne Stop letting Nestlé suck Ontario dry for . per million litres of water
1,756998556729348096,nhCoco,2016-07-23,"@ConagraFoods @pepsi Tell @Pepsi @GeneralMills @KelloggsUS @ConAgraFoods @Nestle you want on-package GMO labels, not QR codes | @ewg | http://bit.ly/29KHaqC","ConagraFoods pepsi Tell Pepsi GeneralMills KelloggsUS ConAgraFoods Nestle you want onpackage GMO labels, not QR codes ewg"
2,756997874760749058,samanthashahi,2016-07-23,"Hey @Nestle - California is in a drought and you're using our water, can you stop? Oh, we're on fire too pic.twitter.com/O6jdQFHKUv","Hey Nestle California is in a drought and youre using our water, can you stop? Oh, were on fire too"
3,756997317086044160,DRCSBoivin,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216671109t,.KathleenWynne Stop letting Nestlé suck Ontario dry for . per million litres of water
4,756997063855013892,funkyourmother,2016-07-23,I like the notion that everyone in Europe started coping with Brexit by eating @Nestle chocolate pic.twitter.com/7YUYPVIJZK,I like the notion that everyone in Europe started coping with Brexit by eating Nestle chocolate


In [164]:
"""
# SpaCy entity labels
PERSON	People, including fictional.
NORP	Nationalities or religious or political groups.
FAC	Buildings, airports, highways, bridges, etc.
ORG	Companies, agencies, institutions, etc.
GPE	Countries, cities, states.
LOC	Non-GPE locations, mountain ranges, bodies of water.
PRODUCT	Objects, vehicles, foods, etc. (Not services.)
EVENT	Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART	Titles of books, songs, etc.
LAW	Named documents made into laws.
LANGUAGE	Any named language.
DATE	Absolute or relative dates or periods.
TIME	Times smaller than a day.
PERCENT	Percentage, including "%".
MONEY	Monetary values, including unit.
QUANTITY	Measurements, as of weight or distance.
ORDINAL	"first", "second", etc.
CARDINAL	Numerals that do not fall under another type.
"""

'\n# SpaCy entity labels\nPERSON\tPeople, including fictional.\nNORP\tNationalities or religious or political groups.\nFAC\tBuildings, airports, highways, bridges, etc.\nORG\tCompanies, agencies, institutions, etc.\nGPE\tCountries, cities, states.\nLOC\tNon-GPE locations, mountain ranges, bodies of water.\nPRODUCT\tObjects, vehicles, foods, etc. (Not services.)\nEVENT\tNamed hurricanes, battles, wars, sports events, etc.\nWORK_OF_ART\tTitles of books, songs, etc.\nLAW\tNamed documents made into laws.\nLANGUAGE\tAny named language.\nDATE\tAbsolute or relative dates or periods.\nTIME\tTimes smaller than a day.\nPERCENT\tPercentage, including "%".\nMONEY\tMonetary values, including unit.\nQUANTITY\tMeasurements, as of weight or distance.\nORDINAL\t"first", "second", etc.\nCARDINAL\tNumerals that do not fall under another type.\n'

In [165]:
# extract entities
LABEL_INCLUDED = ["PERSON","NORP","ORG","GPE","PRODUCT","EVENT","WORK_OF_ART","LAW","LANGUAGE"]
#LABEL_INCLUDED = ["GPE"]
def extract_NER(x):
    doc = nlp(str(x))
    return [(ent.text,ent.label_) for ent in doc.ents if ent.label_ in LABEL_INCLUDED and ent.text.strip() != ""]

doc["entities"] = doc["text_clean"].apply(lambda x: extract_NER(x))

In [166]:
doc.head()

Unnamed: 0,id,source,date,text,text_clean,entities
0,757000564941160448,UrsulaPflug,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216672624t,.KathleenWynne Stop letting Nestlé suck Ontario dry for . per million litres of water,[]
1,756998556729348096,nhCoco,2016-07-23,"@ConagraFoods @pepsi Tell @Pepsi @GeneralMills @KelloggsUS @ConAgraFoods @Nestle you want on-package GMO labels, not QR codes | @ewg | http://bit.ly/29KHaqC","ConagraFoods pepsi Tell Pepsi GeneralMills KelloggsUS ConAgraFoods Nestle you want onpackage GMO labels, not QR codes ewg","[(ConagraFoods, ORG), (Tell, ORG), (Pepsi, ORG), (GeneralMills, ORG), (Nestle, ORG), (QR, ORG)]"
2,756997874760749058,samanthashahi,2016-07-23,"Hey @Nestle - California is in a drought and you're using our water, can you stop? Oh, we're on fire too pic.twitter.com/O6jdQFHKUv","Hey Nestle California is in a drought and youre using our water, can you stop? Oh, were on fire too","[(California, GPE)]"
3,756997317086044160,DRCSBoivin,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216671109t,.KathleenWynne Stop letting Nestlé suck Ontario dry for . per million litres of water,[]
4,756997063855013892,funkyourmother,2016-07-23,I like the notion that everyone in Europe started coping with Brexit by eating @Nestle chocolate pic.twitter.com/7YUYPVIJZK,I like the notion that everyone in Europe started coping with Brexit by eating Nestle chocolate,"[(Brexit, ORG), (Nestle, ORG)]"


In [167]:
doc.dtypes

id            int64 
source        object
date          object
text          object
text_clean    object
entities      object
dtype: object

In [168]:
"""
from gensim.models import Phrases
documents = ["the mayor of new york was there", "machine learning can be useful sometimes","new york mayor was present"]

sentence_stream = [doc.split(" ") for doc in documents]
bigram = Phrases(sentence_stream, min_count=1, threshold=2)
sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
print(bigram[sent])
"""

'\nfrom gensim.models import Phrases\ndocuments = ["the mayor of new york was there", "machine learning can be useful sometimes","new york mayor was present"]\n\nsentence_stream = [doc.split(" ") for doc in documents]\nbigram = Phrases(sentence_stream, min_count=1, threshold=2)\nsent = [u\'the\', u\'mayor\', u\'of\', u\'new\', u\'york\', u\'was\', u\'there\']\nprint(bigram[sent])\n'

In [169]:
# combine key entities, mentions and hashtag for affinity analysis
def entity_combined(row):
    #ents = ["@"+x.strip() for x in row["mentions"].split(",")]
    #ents = ["#"+x.strip() for x in row["hashtags"]]
    ents = []
    ents.append(row["source"])
    for e in row["entities"]:
        ents.append(e[0].strip())
    return list(set(ents))

doc["entity_combined"] = doc.apply(entity_combined,axis=1)

In [170]:
doc.head()

Unnamed: 0,id,source,date,text,text_clean,entities,entity_combined
0,757000564941160448,UrsulaPflug,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216672624t,.KathleenWynne Stop letting Nestlé suck Ontario dry for . per million litres of water,[],[UrsulaPflug]
1,756998556729348096,nhCoco,2016-07-23,"@ConagraFoods @pepsi Tell @Pepsi @GeneralMills @KelloggsUS @ConAgraFoods @Nestle you want on-package GMO labels, not QR codes | @ewg | http://bit.ly/29KHaqC","ConagraFoods pepsi Tell Pepsi GeneralMills KelloggsUS ConAgraFoods Nestle you want onpackage GMO labels, not QR codes ewg","[(ConagraFoods, ORG), (Tell, ORG), (Pepsi, ORG), (GeneralMills, ORG), (Nestle, ORG), (QR, ORG)]","[Nestle, QR, Tell, Pepsi, nhCoco, GeneralMills, ConagraFoods]"
2,756997874760749058,samanthashahi,2016-07-23,"Hey @Nestle - California is in a drought and you're using our water, can you stop? Oh, we're on fire too pic.twitter.com/O6jdQFHKUv","Hey Nestle California is in a drought and youre using our water, can you stop? Oh, were on fire too","[(California, GPE)]","[California, samanthashahi]"
3,756997317086044160,DRCSBoivin,2016-07-23,.@Kathleen_Wynne: Stop letting @Nestlé suck Ontario dry for $3.71 per million litres of water http://d.shpg.org/216671109t,.KathleenWynne Stop letting Nestlé suck Ontario dry for . per million litres of water,[],[DRCSBoivin]
4,756997063855013892,funkyourmother,2016-07-23,I like the notion that everyone in Europe started coping with Brexit by eating @Nestle chocolate pic.twitter.com/7YUYPVIJZK,I like the notion that everyone in Europe started coping with Brexit by eating Nestle chocolate,"[(Brexit, ORG), (Nestle, ORG)]","[funkyourmother, Nestle, Brexit]"


In [171]:
len(doc) # number of tweets

17107

In [172]:
ent_list = list(doc["entity_combined"])

In [173]:
ent_list

[['UrsulaPflug'],
 ['Nestle', 'QR', 'Tell', 'Pepsi', 'nhCoco', 'GeneralMills', 'ConagraFoods'],
 ['California', 'samanthashahi'],
 ['DRCSBoivin'],
 ['funkyourmother', 'Nestle', 'Brexit'],
 ['louiseberube'],
 ['thewindwas'],
 ['patricklyver'],
 ['billycbolshevik'],
 ['INTERPOL', 'trutherbotwhite', 'Nestle'],
 ['vanessadewson'],
 ['coref64'],
 ['kylacopp'],
 ['KathleenWynne Stop Nestle', 'ONgov', 'CUPE129'],
 ['Nestle', 'MUFAFUK2patWag'],
 ['Nestlé', 'Fred Hahn', 'ape_biggles'],
 ['🍫', 'ChocAero Nestle NestleUKNews', 'Debskat2009'],
 ['ni_is_she', 'Strawberry Banana'],
 ['mnkhan88'],
 ['SOCIAL MEDIA MARKETING', 'Forbes Nestle', 'VERB', 'GOTLUCKIE_COM'],
 ['Nestle', 'QR', 'jdbrownhall77', 'Pepsi', 'ConagraFoods'],
 ['TerryRempelMroz'],
 ['Nestle', 'purdueturfy'],
 ['kiteie'],
 ['MC_Thompson'],
 ['mxdhny66', 'Nestle Nestlees NestleUSA'],
 ['Nestle', 'QR', 'apollomoniz', 'Pepsi', 'ConagraFoods'],
 ['GregCameron2'],
 ['ModernDescartes', 'MaisonCailler'],
 ['Nestle'],
 ['AmyDodington'],
 ['Ne

In [174]:
te_ary = te.fit(ent_list).transform(ent_list)

In [175]:
ent_df = pd.DataFrame(te_ary,columns=te.columns_)
ent_df

Unnamed: 0,.ARCSouthAfrica,.GeneralMills,.KathleenWynne,.Nestle,.Nestle Grand Bars,.Nestle S.A.,.SafewayCanada,.nestle,000_gf,007RrAaYy,...,🙈,🙈😈,🙌,🙌🏻,🙌🙌👏👏👏,🙏,🛇,🤔,🤔 Found,🤔🙈
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [176]:
ent_df.shape

(17107, 19298)

In [177]:
def frange(start, stop, step):
    i = start
    while i > stop:
        yield i
        i -= step

In [184]:
def apriori_w_adjusted_min_support(ent_df):
    full_range = list(frange(0.9,0.1,0.1)) + list(frange(0.09,0.01,0.01)) + list(frange(0.009,0.001,0.001))
    for i in full_range:
        threshold = round(i,3)
        frequent_ents = apriori(ent_df,min_support=threshold,use_colnames=True)
        if len(frequent_ents) > 20:
            print("min_support:{}".format(round(i,3)))
            return frequent_ents
    return frequent_ents

#frequent_ents = apriori(ent_df,min_support=0.7,use_colnames=True)
frequent_ents = apriori_w_adjusted_min_support(ent_df)

min_support:0.004


In [185]:
#frequent_ents = apriori(ent_df,min_support=0.01,use_colnames=True)
frequent_ents

Unnamed: 0,support,itemsets
0,0.06132,(.Nestle)
1,0.012509,(BATPress)
2,0.021629,(BATpress)
3,0.007541,(KITKAT)
4,0.004092,(KathleenWynne Stop)
5,0.144444,(Michigan)
6,0.387035,(Nestle)
7,0.052844,(Nestle M)
8,0.006196,(Nestle ModernDayMoms)
9,0.005144,(Nestles)


In [None]:
as_rules = association_rules(frequent_ents, metric="lift",min_threshold=1)

In [None]:
as_rules

In [None]:
key_rules = as_rules[(as_rules["confidence"]>0.7)&(as_rules["lift"]>=1)]
key_rules

In [None]:
# remove multiple items
def is_single(row):
    if row["antecedents"] is not None and len(row["antecedents"])==1:
        if row["consequents"] is not None and len(row["consequents"])==1:
            return "y"
    return "n"
key_rules["is_single"] = key_rules.apply(is_single,axis=1)

In [None]:
key_rules

In [None]:
#key_rules.drop(["is_single"],axis=1,inplace=True)
key_rules = key_rules[key_rules["is_single"]=="y"]
key_rules

In [None]:
key_rules["source"] = key_rules["antecedents"].apply(lambda x: ','.join(x))
key_rules["target"] = key_rules["consequents"].apply(lambda x: ','.join(x))

In [None]:
key_rules

## Create graph node response

In [None]:
"""
graph.node = [
    { ID:"@apple", group:1, index:0,name:"apple", px:100, py:100, size:40, weight:100, x:41, y:95},
    { ID:"@orange", group:2, index:1,name:"orange", px:50, py:50, size:10, weight:100, x:200, y:300},
    { ID:"@grape", group:1, index:1,name:"grape", px:200, py:200, size:3, weight:100, x:250, y:150}
  ];
  graph.edge = [
    { 
      source: 1, 
      target: 2,
      value:5
    }
  ];
"""  

In [None]:
antecedents_nodes = key_rules[["source","antecedent support"]]
antecedents_nodes.rename(columns={"source":"node","antecedent support":"support"},inplace=True)
consequents_nodes = key_rules[["target","consequent support"]]
consequents_nodes.rename(columns={"target":"node","consequent support":"support"},inplace=True)
graph_nodes = pd.concat([antecedents_nodes,consequents_nodes])
graph_nodes = graph_nodes.reset_index(drop=True)
graph_nodes

In [None]:
graph_nodes_dedup = graph_nodes.drop_duplicates(["node"],keep="first")
graph_nodes_dedup = graph_nodes_dedup.reset_index(drop=True)
graph_nodes_dedup

In [None]:
graph_nodes_dedup

In [None]:
  """
  { ID:"@apple", group:1, index:0,name:"apple", px:100, py:100, size:40, weight:100, x:41, y:95},
  { ID:"@orange", group:2, index:1,name:"orange", px:50, py:50, size:10, weight:100, x:200, y:300},
  { ID:"@grape", group:1, index:1,name:"grape", px:200, py:200, size:3, weight:100, x:250, y:150}
  """  
graph_nodes_dedup["ID"] = graph_nodes_dedup["node"]
graph_nodes_dedup["group"] = 2
graph_nodes_dedup["index"] = graph_nodes_dedup.index
graph_nodes_dedup["px"] = 100
graph_nodes_dedup["py"] = 100
graph_nodes_dedup["x"] = 100
graph_nodes_dedup["y"] = 100
graph_nodes_dedup["support"] = scaler.fit_transform(graph_nodes_dedup[["support"]])
graph_nodes_dedup["weight"] = 10
graph_nodes_dedup.rename(columns={"node":"name","support":"size"},inplace=True)
graph_nodes_dedup

In [None]:
graph_node_json = graph_nodes_dedup.to_json(orient='records')
graph_node_json

## Create graph link response

In [None]:
graph_edges = key_rules[["antecedents","consequents","lift"]]
graph_edges.rename(columns={"antecedents":"source","consequents":"target","lift":"value"},inplace=True)
graph_edges

In [None]:
"""
  source: 1, 
  target: 2,
  value:5
"""
graph_edges["source"] = graph_edges["source"].apply(lambda x: ','.join(x))
graph_edges["target"] = graph_edges["target"].apply(lambda x: ','.join(x))

In [None]:
graph_edges

In [None]:
# replace source name with source index
graph_edges_idx = graph_edges.merge(graph_nodes_dedup[["ID","index"]],how="left",left_on="source",right_on="ID")
graph_edges_idx.drop(["source","ID"],axis=1,inplace=True)
graph_edges_idx.rename(columns={"index":"source"},inplace=True)

In [None]:
# replace target name with target index
graph_edges_idx = graph_edges_idx.merge(graph_nodes_dedup[["ID","index"]],how="left",left_on="target",right_on="ID")
graph_edges_idx.drop(["target","ID"],axis=1,inplace=True)
graph_edges_idx.rename(columns={"index":"target"},inplace=True)
graph_edges_idx

In [None]:
graph_edge_json = graph_edges_idx.to_json(orient='records')
graph_edge_json