In [81]:
#!pip install gensim

In [10]:
import gensim, operator
from scipy import spatial
import numpy as np
from gensim.models import KeyedVectors

# Directory that holds the pre-trained word-vector files. Set the
# WORDVEC_MODEL_DIR environment variable to point somewhere else without
# editing the notebook; when it is unset the original location is used.
import os

# Joining with '' guarantees a trailing separator, so code that builds a file
# path as `model_path + filename` keeps working for any override value.
model_path = os.path.join(
    os.environ.get('WORDVEC_MODEL_DIR', '/Users/rissacao/downloads/'), '')

In [11]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load pre-trained word vectors stored in word2vec format.

    Args:
        modelName: Human-readable label, used only in the progress messages.
        modelFile: File name of the vectors, resolved against the
            module-level ``model_path`` directory.
        flagBin: Passed through as ``binary=`` to
            ``KeyedVectors.load_word2vec_format``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    print('Loading ' + modelName + ' model...')
    # os.path.join instead of string concatenation, so the path stays valid
    # whether or not model_path ends with a separator.
    model = KeyedVectors.load_word2vec_format(
        os.path.join(model_path, modelFile), binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

# Load the GoogleNews vectors file (binary word2vec format).
# Alternative model file tried earlier: 'webhose_skipgram_300.bin'.
model_word2vec = load_wordvec_model(
    modelName='Word2Vec',
    modelFile='GoogleNews-vectors-negative300.bin.gz',
    flagBin=True,
)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [12]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity between two phrases using summed word vectors.

    Each input is split on single spaces; the vector of every token found in
    ``vectors`` is added up to form one vector per phrase, and the cosine
    similarity of the two sums is returned.

    Args:
        input1: First phrase.
        input2: Second phrase.
        vectors: Object supporting ``vectors[token]`` that raises ``KeyError``
            for unknown tokens. Its ``vector_size`` attribute supplies the
            dimensionality when present; otherwise 300 is assumed.

    Returns:
        The cosine similarity, or 0 when it is undefined because one of the
        summed vectors is all zeros (e.g. a phrase with no known token).
    """
    dim = getattr(vectors, 'vector_size', 300)
    term_vectors = [np.zeros(dim), np.zeros(dim)]

    for index, term in enumerate((input1, input2)):
        for token in term.split(' '):
            try:
                term_vectors[index] += vectors[token]
            except KeyError:
                # Out-of-vocabulary token: it contributes nothing to the sum.
                continue

    # The cosine of an all-zero vector is 0/0; answer 0 directly instead of
    # letting scipy produce NaN (and a RuntimeWarning).
    if not term_vectors[0].any() or not term_vectors[1].any():
        return 0

    result = 1 - spatial.distance.cosine(term_vectors[0], term_vectors[1])
    # A float is never identical to the string 'nan', so an `is 'nan'` test
    # can never fire; np.isnan is the real NaN check.
    if np.isnan(result):
        result = 0

    return result

In [13]:
# Keeps only the input words that are present in the model's vocabulary.
def vocab_check(vectors, words):
    """Return the words from ``words`` that the model knows, in input order.

    Gensim 4 removed ``KeyedVectors.vocab`` in favour of ``key_to_index``, so
    the vocabulary is looked up in this order: ``vectors.key_to_index``,
    then ``vectors.vocab``, and finally membership on ``vectors`` itself.

    Args:
        vectors: Word-vector model or any container supporting ``in``.
        words: Iterable of candidate words.

    Returns:
        List of the known words, each passed through ``str.strip``.
        Membership is tested on the unstripped word.
    """
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = getattr(vectors, 'vocab', None)
    if vocabulary is None:
        vocabulary = vectors

    return [word.strip() for word in words if word in vocabulary]

In [14]:
# Similarity of two strings under a given word-vector model.
def calc_similarity(input1, input2, vectors):
    """Compare two strings via the model's ``n_similarity``.

    Both strings are split on whitespace, filtered through ``vocab_check``
    and de-duplicated into sets before being handed to
    ``vectors.n_similarity``. Errors raised by ``n_similarity`` propagate
    unchanged.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity``.

    Returns:
        The value returned by ``vectors.n_similarity``.
    """
    known_words = [
        set(vocab_check(vectors, text.split()))
        for text in (input1, input2)
    ]
    return vectors.n_similarity(known_words[0], known_words[1])

# Loads your previously obtained dataset of Webhose news articles 

In [15]:
import json 
    
# Read the dataset: one JSON document per line. The context manager closes
# the file handle, and the encoding is explicit because the titles contain
# non-ASCII characters.
with open("IBM_english.json", encoding="utf-8") as json_file:
    json_data = json_file.readlines()

# Parse every non-blank line (a blank line would make json.loads raise) and
# report how many articles were loaded.
feeds_read_from_file = [json.loads(line) for line in json_data if line.strip()]
print(len(feeds_read_from_file))

# Article whose title is used as the similarity query further down.
print("Choose an article title: ", feeds_read_from_file[3813]['title'])

# List every article title with its 1-based position.
for i, feed in enumerate(feeds_read_from_file, start=1):
    print(i, feed['title'])
    

6915
Choose an article title:  Impact of AI and Automation in Chemistry
1 Two self-paced OpenShift Workshops for Developers
2 How Finding a Hobby Will Make You A Better Entrepreneur
3 Cognizant: Continued Strength Across Verticals, With 25% Stock Upside Likely In 2019-20 - Cognizant Technology Solutions Corporation (NASDAQ:CTSH)
4 Group Puzzles Out Silicon Specs
5 When AIs go to war: Autonomous cyber weapons ‘inevitable’
6 Tuesday Night
7 ROOMZ expands into the UK and Ireland
8 Digital skills are Africa’s ticket to prosperity
9 LevaData Announces 2019 Cognitive Sourcing Summit
10 Exeter Township woman began career in computer programming in 1962
11 Nutanix appoints new General Manager to oversee UK and Ireland
12 Why We Love Domain promo Code (And You Should, Too!)
13 Hedera Hashgraph Opens Public Network
14 Compute Express Link Consortium (CXL) Officially Incorporates; Announces Expanded Board of Directors
15 How to Personalize Digital Experiences in Finance (Paul Shumsky)
16 IBM DS89

In [16]:
# Score every article title against the chosen query title.
# The query title is looked up once instead of on every iteration.
query_title = feeds_read_from_file[3813]['title']

similarity = []
for i, feed in enumerate(feeds_read_from_file, start=1):
    try:
        output = calc_similarity(query_title, feed['title'], model_word2vec)
    except ZeroDivisionError:
        # Titles for which the similarity cannot be computed are left out of
        # the ranking (presumably titles with no in-vocabulary word - TODO
        # confirm); their position number is still consumed.
        continue
    print(i, feed['title'], output)
    similarity.append([feed['title'], output])


1 Two self-paced OpenShift Workshops for Developers 0.2929383
2 How Finding a Hobby Will Make You A Better Entrepreneur 0.350461
3 Cognizant: Continued Strength Across Verticals, With 25% Stock Upside Likely In 2019-20 - Cognizant Technology Solutions Corporation (NASDAQ:CTSH) 0.44865406
4 Group Puzzles Out Silicon Specs 0.39717126
5 When AIs go to war: Autonomous cyber weapons ‘inevitable’ 0.33342654
6 Tuesday Night 0.1494796
7 ROOMZ expands into the UK and Ireland 0.19404398
8 Digital skills are Africa’s ticket to prosperity 0.27285907
9 LevaData Announces 2019 Cognitive Sourcing Summit 0.41452163
10 Exeter Township woman began career in computer programming in 1962 0.19736603
11 Nutanix appoints new General Manager to oversee UK and Ireland 0.2259116
12 Why We Love Domain promo Code (And You Should, Too!) 0.35553664
13 Hedera Hashgraph Opens Public Network 0.3332153
14 Compute Express Link Consortium (CXL) Officially Incorporates; Announces Expanded Board of Directors 0.40183872
15 

In [17]:
# Order the [title, score] pairs by Word2Vec similarity, highest score first,
# then show how many titles were scored.
similarity.sort(key=operator.itemgetter(1), reverse=True)
len(similarity)

6754

In [18]:
# Collect the 100 most similar titles (fewer if the list is shorter) and
# print them from the highest similarity score downwards.
# `similarity` is already sorted in descending order, so a slice is enough:
# no need to walk the whole list or to hide errors behind a bare except.
Top100_Similar = []
for rank, entry in enumerate(similarity[:100], start=1):
    print(rank, entry)
    Top100_Similar.append([rank, entry])

len(Top100_Similar)
#print(Top100_Similar)

1 ['Impact of AI and Automation in Chemistry', 1.0]
2 ['IBM Reference Architecture for High Performance Data and AI in Healthcare and Life Sciences', 0.6730512]
3 ['IBM Reference Architecture for High Performance Data and AI in Healthcare and Life Sciences', 0.6730512]
4 ['Reverse tabnabbing vulnerability in IBM Business Automation Workflow and IBM Business Process Manager (BPM) (CVE-2019-4425)', 0.6439479]
5 ['Workflow Automation and Optimization Software Market Overview 2019-2025 by Top Manufacturers - Xerox Corporation, Oracle Corporation, IBM, SAS Institute', 0.6424461]
6 ['Marketing Automation Solutions Market Future Challenges Outlook 2024', 0.63867104]
7 ['Convergence of AI and AR: Transforming the Future of Business Applications - Developing AI+AR Solutions that Unlock New Business Opportunities', 0.6359062]
8 ['Microsoft Introduction to AI — Part 4 - Towards Data Science', 0.63587916]
9 ['IT Resilience Orchestration Automation (ITRO) Software Market 2019 Sales Channels, Produc

100

# Part 2: Write a PySpark program

In [19]:
#Configure Spark Context
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from nltk.stem.wordnet import WordNetLemmatizer
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2Vec

# getOrCreate() reuses a context that is already running, so re-running this
# cell in the same kernel no longer fails the way a second bare
# SparkContext() call does.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

print("Using Apache Spark Version", sc.version)

Using Apache Spark Version 2.4.4


In [20]:
import spacy
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline with the parser, tagger and NER
# components disabled.
nlp = en_core_web_sm.load(disable=['parser', 'tagger', 'ner'])

# Plain-text normalisation applied to an article body before tokenisation.
def cleanup_pretokenize(text):
    """Strip URLs and expand or remove a few punctuation patterns.

    Steps, in order:
      1. Remove every run of non-whitespace characters starting with "http".
      2. Apply the literal substring replacements listed below.

    The replacements ``you're`` and ``You're`` are omitted on purpose: the
    generic ``'re`` rule runs first and leaves no ``'re`` behind, so they
    could never match. Output is identical without them.

    Args:
        text: Raw text.

    Returns:
        The cleaned text (whitespace is not collapsed).
    """
    import re  # local import: `re` is not imported by the notebook's visible cells

    text = re.sub(r'http\S+', '', text)

    # (substring, replacement) pairs; order matters and is kept as-is.
    replacements = (
        ("'s", " "),
        ("n't", " not "),
        ("'ve", " have "),
        ("'re", " are "),
        ("I'm", " I am "),
        ("-", " "),
        ("/", " "),
        ("(", " "),
        (")", " "),
        ("%", " percent "),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

# WordNet lemmatizer instance; text_cleanup below does not use it.
lmtzr = WordNetLemmatizer()

def text_cleanup(row):
    """Replace ``row[2]`` with a cleaned, lemmatised version of its text.

    The text is stripped, lower-cased, passed through ``cleanup_pretokenize``
    and tokenised with the module-level spaCy pipeline ``nlp``. Only lemmas
    that are purely alphabetic and longer than three characters are kept,
    joined by single spaces.

    Args:
        row: Mutable sequence whose element at index 2 is the text. It is
            modified in place.

    Returns:
        The same ``row`` object.
    """
    body = cleanup_pretokenize(row[2].strip().lower())
    lemmas = (tok.lemma_ for tok in nlp(body))
    kept = [lemma for lemma in lemmas if lemma.isalpha() and len(lemma) > 3]
    row[2] = ' '.join(kept)
    return row

# Tokenizer reading the 'text' column and writing 'tokens'. The pattern is a
# raw string: '\w' in a normal string literal is an invalid escape sequence
# that Python warns about; the value passed to Spark is unchanged.
regexTokenizer = RegexTokenizer(gaps=False, pattern=r'\w+', inputCol='text', outputCol='tokens')
# Stop-word filter reading 'tokens' and writing 'tokens_sw_removed'.
swr = StopWordsRemover(inputCol='tokens', outputCol='tokens_sw_removed')

Step 1: Load your previously obtained dataset of Webhose news articles into a Spark DataFrame

In [21]:
#Loads your previously obtained dataset of Webhose news articles into a Spark dataframe
import json 
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()

# Raw lines of the dataset file. The context manager closes the handle,
# which the bare open(...).readlines() left open; the Spark load below does
# not use this list.
with open("IBM_english.json", encoding="utf-8") as json_file:
    json_data = json_file.readlines()

# Let Spark read the JSON file itself and expose it to SQL as the "post" view.
# NOTE(review): this absolute path is machine-specific and differs from the
# relative path used just above - confirm both point at the same file.
path = "/Users/rissacao/downloads/5430/IBM_english.json"
sparkDF = spark.read.json(path)
sparkDF.createOrReplaceTempView("post")

# Keep only the three columns used downstream.
articleDF = spark.sql("SELECT url, title, text FROM post")
articleDF.show()

+--------------------+--------------------+--------------------+
|                 url|               title|                text|
+--------------------+--------------------+--------------------+
|http://heidloff.n...|Two self-paced Op...|Two self-paced Op...|
|https://inspirati...|How Finding a Hob...| How Finding a Ho...|
|https://seekingal...|Cognizant: Contin...|We recommend CTSH...|
|https://www.eejou...|Group Puzzles Out...|Someday, a new cl...|
|https://chogyiwrt...|When AIs go to wa...|When AIs go to wa...|
|https://www.overc...|       Tuesday Night|Tuesday Night 23:...|
|https://www.start...|ROOMZ expands int...|IOT Smart home
SE...|
|https://www.itweb...|Digital skills ar...| SAP Africa MD Ca...|
|https://www.tmcne...|LevaData Announce...|LevaData Announce...|
|https://www.readi...|Exeter Township w...|Linda Fister's st...|
|http://www.oenmag...|Nutanix appoints ...| September 2019 1...|
|https://brooklynn...|Why We Love Domai...|What are the prim...|
|https://cryptoves...|Hed

In [22]:
# Turn each article into a plain [url, title, text] list, drop the rows that
# have no text, and rebuild a DataFrame from what is left.
ibm_columns = [0, 1, 2]
ibm_data = articleDF.select('url', 'title', 'text')

ibm_rdd = (
    ibm_data.select('*')
    .rdd
    .map(lambda row: [row[i] for i in ibm_columns])
    .filter(lambda row: row[2] is not None)
)
ibm_df = sqlContext.createDataFrame(ibm_rdd, ['url', 'title', 'text'])
ibm_df.show(5)

+--------------------+--------------------+--------------------+
|                 url|               title|                text|
+--------------------+--------------------+--------------------+
|http://heidloff.n...|Two self-paced Op...|Two self-paced Op...|
|https://inspirati...|How Finding a Hob...| How Finding a Ho...|
|https://seekingal...|Cognizant: Contin...|We recommend CTSH...|
|https://www.eejou...|Group Puzzles Out...|Someday, a new cl...|
|https://chogyiwrt...|When AIs go to wa...|When AIs go to wa...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [23]:
def cossim(v1, v2):
    """Cosine similarity of two vectors.

    Returns ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero
    norm the cosine is undefined and 0.0 is returned, so no ad-hoc constant
    is needed in the denominator to avoid a division by zero (such a constant
    would skew every score, e.g. identical vectors would not score 1).

    Args:
        v1: First vector (anything ``np.dot`` accepts).
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine similarity, or 0.0 for a zero-norm input.
    """
    norm1 = np.sqrt(np.dot(v1, v1))
    norm2 = np.sqrt(np.dot(v2, v2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(v1, v2) / (norm1 * norm2)


Step 2: Clean up and tokenize article bodies using the RegexTokenizer and StopWordsRemover functions

In [25]:
# Step 2: tokenize the article text, then remove stop words from the tokens.
df_tokens = regexTokenizer.transform(ibm_df)
desc_swr = swr.transform(df_tokens)
desc_swr.show(n=3)
#desc_swr_half = desc_swr.limit(50000)
#desc_swr_half.show(3)
#desc_swr.write.saveAsTable('desc_swr', mode = 'overwrite')


+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 url|               title|                text|              tokens|   tokens_sw_removed|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|http://heidloff.n...|Two self-paced Op...|Two self-paced Op...|[two, self, paced...|[two, self, paced...|
|https://inspirati...|How Finding a Hob...| How Finding a Ho...|[how, finding, a,...|[finding, hobby, ...|
|https://seekingal...|Cognizant: Contin...|We recommend CTSH...|[we, recommend, c...|[recommend, ctsh,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



Step 3: Trains a Word2Vec model based on the output column produced in step 2

In [26]:
# Train a Word2Vec model on the stop-word-free tokens produced in step 2 and
# add one 'wordvectors' column to every article.
word2vec = Word2Vec(vectorSize=300, minCount=5, inputCol='tokens_sw_removed', outputCol='wordvectors')
model = word2vec.fit(desc_swr)
wordvectors = model.transform(desc_swr)

# select() already returns a DataFrame, so no .rdd.toDF() round trip through
# Python-side rows is needed.
ibm_desc = wordvectors.select('url', 'title', 'wordvectors')
ibm_desc.show(5)


+--------------------+--------------------+--------------------+
|                 url|               title|         wordvectors|
+--------------------+--------------------+--------------------+
|http://heidloff.n...|Two self-paced Op...|[-0.0167177878320...|
|https://inspirati...|How Finding a Hob...|[-0.0379592568701...|
|https://seekingal...|Cognizant: Contin...|[-0.0309997186893...|
|https://www.eejou...|Group Puzzles Out...|[0.02199625305000...|
|https://chogyiwrt...|When AIs go to wa...|[-0.0195653121947...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [29]:
# Sample query against the trained model: the 20 words closest to "facebook".
synonyms = model.findSynonyms("facebook", 20)
synonyms.show()

# Bring at most 50000 (url, title, wordvectors) rows to the driver for the
# similarity search below.
chunk = ibm_desc.limit(50000).collect()

+--------------+-------------------+
|          word|         similarity|
+--------------+-------------------+
|       twitter| 0.7924723625183105|
|      linkedin| 0.6940572261810303|
|     instagram| 0.6708571314811707|
|       youtube| 0.6688803434371948|
|        follow| 0.6683547496795654|
|     flipboard| 0.6563186645507812|
|stratisticsmrc| 0.6203067898750305|
|     smartnews| 0.6186703443527222|
|     ibi_group|  0.543597400188446|
|   ibmsecurity| 0.5070103406906128|
|         ibmix|  0.501682698726654|
|       netflix|0.49685123562812805|
| industrytoday| 0.4909372925758362|
|           pic|0.48442983627319336|
|        ibm_ix| 0.4804266393184662|
|          mash|0.47040778398513794|
|    recordings|0.46629518270492554|
|          ndtv| 0.4628705084323883|
|         ramon|0.45663002133369446|
|     subscribe| 0.4558887481689453|
+--------------+-------------------+



In [30]:
SEARCH_QUERY = "industrial AI"

# One-row DataFrame holding the query, run through the same tokenizer and
# stop-word remover as the article text.
query_df = sc.parallelize([(1, SEARCH_QUERY)]).toDF(['index', 'text'])
query_tok = regexTokenizer.transform(query_df)
query_swr = swr.transform(query_tok)
query_swr.show()

# Apply the trained model to the query and pull its single 'wordvectors'
# value back to the driver.
query_vec = (
    model.transform(query_swr)
    .select('wordvectors')
    .collect()[0][0]
)
#query_vec

+-----+-------------+----------------+-----------------+
|index|         text|          tokens|tokens_sw_removed|
+-----+-------------+----------------+-----------------+
|    1|industrial AI|[industrial, ai]| [industrial, ai]|
+-----+-------------+----------------+-----------------+



In [31]:
import numpy as np
# Score each collected article against the query vector, then rank the
# (url, title, similarity) rows from most to least similar.
scored_rows = [
    (row[0], row[1], float(cossim(query_vec, row[2])))
    for row in chunk
]
sim_rdd = sc.parallelize(scored_rows)
sim_df = (
    sqlContext.createDataFrame(sim_rdd, ['url', 'title', 'similarity'])
    .orderBy("similarity", ascending=False)
)
sim_df.show(20, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|url                                                                                                                                                                                                |title                                                                                                                                                        |similarity         |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------

In [32]:
from pyspark.ml.feature import Word2VecModel