### Reading and writing in natural languages


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter
import pprint


def cleanSentence(sentence: str) -> list[str]:
    """Split a sentence into cleaned words.

    The sentence is split on single spaces and every piece has leading and
    trailing punctuation and whitespace stripped. Pieces that end up empty
    or one character long are dropped, except for the words 'a' and 'i'
    (compared case-insensitively).

    Args:
        sentence: Raw sentence text.

    Returns:
        The remaining words, in their original order.
    """
    stripChars = string.punctuation + string.whitespace
    words = (piece.strip(stripChars) for piece in sentence.split(' '))
    return [
        word for word in words
        if len(word) > 1 or word.lower() in ('a', 'i')
    ]


def cleanInput(content: str) -> list[list[str]]:
    """Normalize raw text and break it into sentences of cleaned words.

    The text is upper-cased, newlines become spaces, and every character
    that cannot be represented in ASCII is dropped. The result is split
    into sentences on '. ' and each sentence is passed to cleanSentence.

    Args:
        content: Raw text to clean.

    Returns:
        One list of words per sentence.
    """
    content = content.upper().replace('\n', ' ')
    # Round-trip through bytes to discard non-ASCII characters.
    content = content.encode('utf-8').decode('ascii', 'ignore')
    return [cleanSentence(sentence) for sentence in content.split('. ')]


def getNgramsFromSentence(content: list[str], n: int) -> list[list[str]]:
    """Return every run of n consecutive words in a sentence.

    Args:
        content: The words of one sentence, in order.
        n: Number of words per n-gram.

    Returns:
        The n-grams in order of appearance, each as a list of n words.
        Empty when the sentence has fewer than n words.
    """
    return [content[i:i + n] for i in range(len(content) - n + 1)]


def getNgrams(content: str, n: int) -> Counter:
    """Count the n-grams that occur in a text.

    The text is cleaned with cleanInput; n-grams are taken within each
    sentence (never across a sentence boundary) and joined with single
    spaces before being counted.

    Args:
        content: Raw text to analyse.
        n: Number of words per n-gram.

    Returns:
        A Counter mapping each space-joined n-gram to its frequency.
    """
    ngrams = Counter()
    for sentence in cleanInput(content):
        # Honour the requested size instead of always building 2-grams.
        ngrams.update(
            ' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n))
    return ngrams


# Download the speech; the context manager closes the HTTP response once
# the body has been read.
with urlopen(
        'http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    content = response.read().decode('utf-8')

# Count the 2-grams of the speech and pretty-print the resulting Counter.
ngrams = getNgrams(content, 2)
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(ngrams)

### Add a filter for relevant words


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import Counter
import pprint


# Common English words, upper-cased. Stored as a frozenset so the
# collection is built once and each membership test is a hash lookup.
_COMMON_WORDS = frozenset({
    'THE', 'BE', 'AND', 'OF', 'A', 'IN', 'TO', 'HAVE', 'IT', 'I', 'THAT',
    'FOR', 'YOU', 'HE', 'WITH', 'ON', 'DO', 'SAY', 'THIS', 'THEY', 'IS',
    'AN', 'AT', 'BUT', 'WE', 'HIS', 'FROM', 'NOT', 'BY', 'SHE',
    'OR', 'AS', 'WHAT', 'GO', 'THEIR', 'CAN', 'WHO', 'GET', 'IF', 'WOULD',
    'HER', 'ALL', 'MY', 'MAKE', 'ABOUT', 'KNOW', 'WILL', 'UP', 'ONE',
    'TIME', 'HAS', 'BEEN', 'THERE', 'YEAR', 'SO', 'THINK', 'WHEN', 'WHICH',
    'THEM', 'SOME', 'ME', 'PEOPLE', 'TAKE', 'OUT', 'INTO', 'JUST', 'SEE',
    'HIM', 'YOUR', 'COME', 'COULD', 'NOW', 'THAN', 'LIKE', 'OTHER', 'HOW',
    'THEN', 'ITS', 'OUR', 'TWO', 'MORE', 'THESE', 'WANT', 'WAY', 'LOOK',
    'FIRST', 'ALSO', 'NEW', 'BECAUSE', 'DAY', 'USE', 'NO', 'MAN',
    'FIND', 'HERE', 'THING', 'GIVE', 'MANY', 'WELL',
})


def isCommon(ngram: list[str]) -> bool:
    """Tell whether an n-gram contains at least one common word.

    The comparison is case-sensitive against an upper-case word list, so
    the words are expected to be upper-cased already.

    Args:
        ngram: The words of one n-gram.

    Returns:
        True if any word is in the common-word list, False otherwise
        (including for an empty n-gram).
    """
    return any(word in _COMMON_WORDS for word in ngram)


def cleanSentence(sentence: str) -> list[str]:
    """Turn one sentence into a list of cleaned words.

    Splits on single spaces, strips punctuation and whitespace from both
    ends of every piece, and keeps a piece only if it is longer than one
    character or is the word 'a' or 'i' (case-insensitive).

    Args:
        sentence: Raw sentence text.

    Returns:
        The kept words, in their original order.
    """
    stripChars = string.punctuation + string.whitespace
    cleaned = (piece.strip(stripChars) for piece in sentence.split(' '))
    return [
        word for word in cleaned
        if len(word) > 1 or word.lower() in ('a', 'i')
    ]


def cleanInput(content: str) -> list[list[str]]:
    """Normalize raw text into sentences of cleaned, upper-case words.

    Upper-cases the text, turns newlines into spaces, drops characters
    that are not ASCII, splits the result on '. ' and hands every
    sentence to cleanSentence.

    Args:
        content: Raw text to clean.

    Returns:
        One list of words per sentence.
    """
    content = content.upper().replace('\n', ' ')
    # Encoding then decoding with errors ignored removes non-ASCII text.
    content = content.encode('utf-8').decode('ascii', 'ignore')
    return [cleanSentence(sentence) for sentence in content.split('. ')]


def getNgramsFromSentence(content: list[str], n: int) -> list[list[str]]:
    """Return the n-grams of a sentence that contain no common word.

    Every run of n consecutive words is considered; runs for which
    isCommon returns True are left out so that only n-grams made of less
    frequent words are kept.

    Args:
        content: The words of one sentence, in order.
        n: Number of words per n-gram.

    Returns:
        The kept n-grams in order of appearance, each as a list of words.
    """
    candidates = (content[i:i + n] for i in range(len(content) - n + 1))
    return [ngram for ngram in candidates if not isCommon(ngram)]


def getNgrams(content: str, n: int) -> Counter:
    """Count the n-grams found in a text.

    The text is cleaned with cleanInput, n-grams are collected sentence by
    sentence with getNgramsFromSentence, and each one is joined with
    single spaces before being counted.

    Args:
        content: Raw text to analyse.
        n: Number of words per n-gram.

    Returns:
        A Counter mapping each space-joined n-gram to its frequency.
    """
    ngrams = Counter()
    for sentence in cleanInput(content):
        # Pass the caller's n through rather than a fixed size of 2.
        ngrams.update(
            ' '.join(ngram) for ngram in getNgramsFromSentence(sentence, n))
    return ngrams


# Fetch the speech text; leaving the with-block closes the HTTP response.
with urlopen(
        'http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    content = response.read().decode('utf-8')

# Count the 2-grams and pretty-print the resulting Counter.
ngrams = getNgrams(content, 2)
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(ngrams)

### Markov Model


In [None]:
from urllib.request import urlopen
from typing import Literal
from random import randint

def wordListSum(wordList: dict[str,dict[str, int]]) -> int:
    if not isinstance(wordList, dict):
        raise ValueError("Input must be a dictionary.")
    
    total_sum = 0
    for values in wordList.values():
        if isinstance(values, dict):
            total_sum += sum(values.values())
        else:
            raise ValueError("Inner values must be dictionaries.")
    return total_sum


def retrieveRandomWord(wordList: dict[str, int]) -> (str | None):
    randIndex = randint(1, wordListSum(wordList))
    for word, value in wordList.items():
        randIndex -= value
        if randIndex <= 0:
            return word


def buildWordDict(text: str) -> dict:
    """Build a table of word-to-next-word counts from a text.

    Line breaks are treated as spaces and double quotes are removed. The
    marks ',', '.', ';' and ':' are padded with spaces so that each one
    becomes a word of its own in the table.

    Args:
        text: The source text.

    Returns:
        A dictionary mapping every word that has a successor to a
        dictionary of ``{followingWord: timesSeen}``.
    """
    # Single translation pass: newline -> space, '"' -> nothing, and each
    # punctuation mark -> the same mark surrounded by spaces.
    replacements = {'\n': ' ', '"': ''}
    replacements.update({mark: ' {} '.format(mark) for mark in ',.;:'})
    normalized = text.translate(str.maketrans(replacements))

    # Splitting on single spaces leaves empty strings behind; drop them.
    tokens = [token for token in normalized.split(' ') if token]

    transitions: dict[str, dict[str, int]] = {}
    for current, following in zip(tokens, tokens[1:]):
        successors = transitions.setdefault(current, {})
        successors[following] = successors.get(following, 0) + 1
    return transitions


# Generates a Markov chain of up to 100 words following the seed word 'I'
with urlopen(
        'http://pythonscraping.com/files/inaugurationSpeech.txt') as response:
    # The context manager closes the HTTP response after the read.
    text = response.read().decode('utf-8')
wordDict = buildWordDict(text)

length = 100
chain = ['I']
for _ in range(length):
    # A word that never has a successor in the text has no entry in
    # wordDict; stop there instead of failing with a KeyError.
    followers = wordDict.get(chain[-1])
    if not followers:
        break
    newWord = retrieveRandomWord(followers)
    chain.append(newWord)

print(' '.join(chain))

### Six Degrees (breadth-first search)


In [None]:
import pymysql

# Connection parameters for the MySQL server on this machine. To connect
# through a Unix socket instead, add e.g. 'unix_socket': '/tmp/mysql.sock'.
connectionSettings = {
    'host': '127.0.0.1',
    'user': '',
    'passwd': '',
    'db': 'mysql',
    'charset': 'utf8',
}
conn = pymysql.connect(**connectionSettings)

# One shared cursor, switched to the wikipedia database for the queries
# issued through it afterwards.
cur = conn.cursor()
cur.execute('USE wikipedia')


def getUrl(pageId: int | str):
    """Return the ``url`` column of the row in ``pages`` with the given id.

    Args:
        pageId: Page id; converted with int() before it is sent.

    Returns:
        The first column of the matching row.

    Raises:
        LookupError: If no row has that id.
    """
    # Query parameters are passed as a sequence: note the trailing comma
    # that makes this a one-element tuple rather than a bare int.
    cur.execute('SELECT url FROM pages WHERE id = %s', (int(pageId),))
    row = cur.fetchone()
    if row is None:
        raise LookupError('No page found with id {}'.format(pageId))
    return row[0]


def getLinks(fromPageId: int | str) -> list:
    """Return the ``toPageId`` of every row in ``links`` leaving a page.

    Args:
        fromPageId: Id of the source page; converted with int() before it
            is sent.

    Returns:
        The first column of each matching row; an empty list when there
        are no matching rows.
    """
    # Query parameters are passed as a sequence: note the trailing comma
    # that makes this a one-element tuple rather than a bare int.
    cur.execute('SELECT toPageId FROM links WHERE fromPageId = %s',
                (int(fromPageId),))
    return [row[0] for row in cur.fetchall()]


def searchBreadth(targetPageId: int, paths=None):
    """Find a shortest chain of links to a target page, breadth first.

    Every path in the current level is extended by the links returned by
    getLinks for its last page. The first extension that reaches the
    target is returned; otherwise the search continues with the next
    level. Pages already reached are not expanded again, so link cycles
    cannot make the search loop forever.

    Args:
        targetPageId: Id of the page to reach.
        paths: Paths (lists of page ids) to start from. Defaults to a
            single path containing page 1.

    Returns:
        The list of page ids from the start page to the target, or None
        when the target cannot be reached.
    """
    frontier = [[1]] if paths is None else paths
    # Only the end of each starting path counts as already reached.
    visited = {path[-1] for path in frontier}

    while frontier:
        nextFrontier = []
        # Expand every path of this level before going one level deeper.
        for path in frontier:
            for link in getLinks(path[-1]):
                if link == targetPageId:
                    return path + [link]
                if link not in visited:
                    visited.add(link)
                    nextFrontier.append(path + [link])
        frontier = nextFrontier
    return None


nodes = getLinks(1)
targetPageId = 28624
pageIds = searchBreadth(targetPageId)

# searchBreadth can return None when it runs out of paths; iterate over
# an empty list in that case instead of raising a TypeError.
for pageId in pageIds or []:
    print(getUrl(pageId))

### Statistical analysis with NLTK


In [None]:
from nltk import word_tokenize
from nltk import Text

# Tokenize a sample sentence, then wrap the tokens in an NLTK Text object.
tokens = word_tokenize('Here is some not very interesting text')
text = Text(tokens)

In [None]:
from nltk.book import text6
from nltk import ngrams

# Build the 4-grams of text6 and print those whose first token is
# exactly 'coconut'.
fourgrams = ngrams(text6, 4)

for fourgram in fourgrams:
    if fourgram[0] != 'coconut':
        continue
    print(fourgram)

In [None]:
import nltk

# Download the 'punkt' and 'averaged_perceptron_tagger' NLTK resources.
# NOTE(review): presumably these back word_tokenize / pos_tag used in the
# following cells — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
""" 
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 
"""
from nltk.tokenize import word_tokenize

from nltk import pos_tag

# Adjacent string literals are joined with nothing in between, so the
# first piece must end with a space to keep 'swords' and 'is' apart.
text = word_tokenize('Strange women lying in ponds distributing swords '
                     'is no basis for a system of government.')

# Tag each token with its part of speech (shown as the cell's result).
pos_tag(text)

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

# Two sample sentences: 'Google' as a proper noun and 'google' as a verb.
sentences = sent_tokenize(
    'Google is one of the best companies in the world.'
    ' I constantly google myself to see what I\'m up to.')

# Part-of-speech tags that count as nouns here.
nouns = ['NN', 'NNS', 'NNP', 'NNPS']

# Print a sentence once for every token 'google' in it that is tagged as
# a noun; sentences that never mention the word are skipped untagged.
for sentence in sentences:
    if 'google' not in sentence.lower():
        continue
    taggedWords = pos_tag(word_tokenize(sentence))
    for token, tag in taggedWords:
        if tag in nouns and token.lower() == 'google':
            print(sentence)