## Import Libraries

In [21]:
import os
os.listdir('.') 
import string
import operator
from collections import OrderedDict
import nltk, re, pprint
from nltk.corpus import wordnet
from nltk.corpus import stopwords 
from nltk import word_tokenize
from bs4 import BeautifulSoup
from urllib import request
from nltk.stem import PorterStemmer 

## Perform Web Scraping

Using `urllib` and BeautifulSoup, we download the HTML page and extract its visible text into a string.

In [22]:
url = "https://playground.tensorflow.org/"

# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read.
with request.urlopen(url) as response:
    html = response.read().decode('utf8')

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so the extracted text can differ between machines.
web_file = BeautifulSoup(html, 'html.parser').get_text()

In [23]:
# Display the raw text extracted from the page (bare expression -> cell output).
web_file

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nA Neural Network Playground\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nTinker With a Neural Network Right Here in Your Browser.Don’t Worry, You Can’t Break It. We Promise.\n\n\n\n\n\n\nreplay\n\n\nplay_arrow\npause\n\n\nskip_next\n\n\n\nEpoch\n\n\n\nLearning rate\n\n\n0.00001\n0.0001\n0.001\n0.003\n0.01\n0.03\n0.1\n0.3\n1\n3\n10\n\n\n\n\nActivation\n\n\nReLU\nTanh\nSigmoid\nLinear\n\n\n\n\nRegularization\n\n\nNone\nL1\nL2\n\n\n\n\nRegularization rate\n\n\n0\n0.001\n0.003\n0.01\n0.03\n0.1\n0.3\n1\n3\n10\n\n\n\n\nProblem type\n\n\nClassification\nRegression\n\n\n\n\n\n\n\n\n\n\nData\n\n\nWhich dataset do you want to use?\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nRatio of training to test data:\xa0\xa0XX%\n\n\n\n\n\nNoise:\xa0\xa0XX\n\n\n\n\n\nBatch size:\xa0\xa0XX\n\n\n\n\n\n            Regenerate\n          \n\n\n\n\nFeatures\nWhich properties do you want to feed in?\n\n\n\n\n\n\n\n\n\n\nClick anywhere to edit.\nWeight/Bias is 0.2.\n\n\n\n\n\n\n\n\n\n\n

We split the extracted text into word tokens.

In [24]:
# Tokenize the scraped page text with NLTK's word tokenizer.
tokens_web = word_tokenize(web_file)

## Data Preprocessing - 1

Next we strip punctuation, convert the words to lowercase, remove stopwords, and drop purely numeric tokens from the list of words.

In [26]:
# Strip every ASCII punctuation character from each token, then discard the
# tokens that end up empty (tokens that were punctuation only).
punct_table = str.maketrans('', '', string.punctuation)
punc_web = [token.translate(punct_table) for token in tokens_web]
punc_web = [word for word in punc_web if word]

lower_web = [word.lower() for word in punc_web]

# Fetch the stopword list once and keep it as a set, instead of calling
# stopwords.words('english') again for every single token.
english_stopwords = set(stopwords.words('english'))
filtered_web = [word for word in lower_web if word not in english_stopwords]

# '-' belongs to string.punctuation and has already been stripped above, so a
# leading minus sign can no longer occur; isdigit() alone identifies the
# numeric tokens (e.g. "0.001" has become "0001" at this point).
no_digit_web = [word for word in filtered_web if not word.isdigit()]

In [27]:
# Porter stemmer instance; the text-file stemming cell further down uses it too.
ps = PorterStemmer()

# Reduce every cleaned web token to its stem.
final_web = list(map(ps.stem, no_digit_web))

## Read the Text File

In [29]:
# Read the whole comparison document; the with-block closes the file handle
# even if read() raises.
with open('ss2.txt') as f:
    text_file = f.read()

In [30]:
# Tokenize the contents of the text file with NLTK's word tokenizer.
tokens_text = word_tokenize(text_file)

## Data Preprocessing - 2

We apply the same steps as before: strip punctuation, convert the words to lowercase, remove stopwords, and drop purely numeric tokens from the list of words.

In [32]:
# Strip every ASCII punctuation character from each token, then discard the
# tokens that end up empty (tokens that were punctuation only).
punct_table = str.maketrans('', '', string.punctuation)
punc_text = [token.translate(punct_table) for token in tokens_text]
punc_text = [word for word in punc_text if word]

lower_text = [word.lower() for word in punc_text]

# Fetch the stopword list once and keep it as a set, instead of calling
# stopwords.words('english') again for every single token.
english_stopwords = set(stopwords.words('english'))
filtered_text = [word for word in lower_text if word not in english_stopwords]

# '-' belongs to string.punctuation and has already been stripped above, so a
# leading minus sign can no longer occur; isdigit() alone identifies the
# numeric tokens.
no_digit_text = [word for word in filtered_text if not word.isdigit()]

In [33]:
# Reduce every cleaned text-file token to its stem with the stemmer `ps`.
final_text = list(map(ps.stem, no_digit_text))

## Common Words 

In [35]:
# Heading followed by the set of stems that occur in both token lists.
# sep='\n' reproduces the line break that separate print calls would emit.
print(
    '\nCommon words from both the files are : \n',
    set(final_web).intersection(final_text),
    sep='\n',
)


Common words from both the files are : 

{'epoch', 'make', 'none', 'type', 'gener', 'googl', 'tensorflow', 'next', 'batch', 'enabl', 'specif', 'like', 'use', 'train', 'github', 'build', 'also', 'let', 'free', 'addit', 'continu', 'add', 'initi', 'sourc', 'origin', 'overview', 'start', 'deep', 'chang', 'data', 'allow', 'work', 'predict', 'given', 'time', 'click', 'feed', 'output', 'open', 'size', 'one', 'loss', 'learn', 'first'}


## Top 15 Words

In [36]:
# Tally the web stems in a single pass, rather than rescanning final_web with
# list.count() for every token of the text file.
web_counts = {}
for stem in final_web:
    web_counts[stem] = web_counts.get(stem, 0) + 1

# Map each stem of the text file to its number of occurrences on the web page.
# Keys keep the order of first appearance in final_text, so ties are ordered
# the same way by the stable sort below.
results = {stem: web_counts.get(stem, 0) for stem in final_text}

sorted_result = sorted(results.items(), key=operator.itemgetter(1), reverse=True)

print('\nThe top 15 technical, NN or ML related based on their frequency : \n')
# Slicing copes with fewer than 15 distinct stems; indexing 0..14 would raise
# IndexError in that case.
for entry in sorted_result[:15]:
    print(entry)


The top 15 technical, NN or ML related based on their frequency : 

('data', 6)
('output', 6)
('use', 5)
('learn', 5)
('one', 3)
('predict', 2)
('train', 2)
('github', 2)
('origin', 2)
('also', 2)
('work', 2)
('sourc', 2)
('loss', 2)
('deep', 2)
('overview', 1)


## Meanings

In [38]:
print('\nThe words above with their meanings : \n')
# Slice instead of indexing 0..14 so a result list shorter than 15 entries
# does not raise IndexError.
for word, _count in sorted_result[:15]:
    syns = wordnet.synsets(word)
    # Words for which WordNet returns no synsets (e.g. truncated stems) are
    # skipped; otherwise the definition of the first synset is shown.
    if syns:
        print(word.upper() + ' means ' + syns[0].definition())


The words above with their meanings : 

DATA means a collection of facts from which conclusions may be drawn
OUTPUT means final product; the things produced
USE means the act of using
LEARN means gain knowledge or skills
ONE means the smallest whole number or a numeral representing this number
PREDICT means make a prediction about; tell in advance
TRAIN means public transport provided by a line of railway cars coupled together and drawn by a locomotive
ORIGIN means the place where something begins, where it springs into being
ALSO means in addition
WORK means activity directed toward making or doing something
LOSS means something that is lost
DEEP means the central and most intense or profound part
OVERVIEW means a general summary of a subject
