<a href="https://colab.research.google.com/github/mohamedyosef101/harvardx-python-for-research/blob/main/Language%20Processing/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Processing

In [1]:
# word counting

text = "Hello, my name is Mohamed and this is me trying to make NLP easier!"

def count_words(text):
  """
  count the number of times each word appears in a given text.
  """
  word_counts = {}
  for word in text.split(" "):
    if word in word_counts:
      word_counts[word] += 1
    else:
      word_counts[word] = 1
  return word_counts

count_words(text)

{'Hello,': 1,
 'my': 1,
 'name': 1,
 'is': 2,
 'Mohamed': 1,
 'and': 1,
 'this': 1,
 'me': 1,
 'trying': 1,
 'to': 1,
 'make': 1,
 'NLP': 1,
 'easier!': 1}

In [2]:
# remove punctuation and charcter case affect
def count_words(text):
  """
  count the number of times each word appears in a given text.
  """
  word_counts = {}
  text = text.lower()
  skips = {".", ",", ";", ":", "'", '"', "!", "?"}
  for skip in skips:
    text = text.replace(skip, "")
  for word in text.split(" "):
    if word in word_counts:
      word_counts[word] += 1
    else:
      word_counts[word] = 1
  return word_counts

count_words(text)

{'hello': 1,
 'my': 1,
 'name': 1,
 'is': 2,
 'mohamed': 1,
 'and': 1,
 'this': 1,
 'me': 1,
 'trying': 1,
 'to': 1,
 'make': 1,
 'nlp': 1,
 'easier': 1}

In [3]:
# using the counter object for fast code

from collections import Counter

def fast_count(text):
  """
  count the number of times each word appears in a given text using a
  Counter object.
  """
  word_counts = {}
  text = text.lower()

  skips = {".", ",", ";", ":", "'", '"', "!", "?"}
  for skip in skips:
    text = text.replace(skip, "")

  word_counts = Counter(text.split(" "))

  return word_counts

fast_count(text)

Counter({'hello': 1,
         'my': 1,
         'name': 1,
         'is': 2,
         'mohamed': 1,
         'and': 1,
         'this': 1,
         'me': 1,
         'trying': 1,
         'to': 1,
         'make': 1,
         'nlp': 1,
         'easier': 1})

In [4]:
# Are the two functions give the same output?
print(count_words(text) == fast_count(text))

True


# Reading in a Book
We will use utf8 encoding to encode characters in our files to help computer read them.

In [5]:
# get the books

import requests
import zipfile
from pathlib import Path

# Setup path to data folder
data_path = Path("data/")
books_path = data_path / "txt-books"

# If the books folder doesn't exist, download it and prepare it...
if books_path.is_dir():
    print(f"{books_path} directory exists.")
else:
    print(f"Did not find {books_path} directory, creating one...")
    books_path.mkdir(parents=True, exist_ok=True)

    # Download pizza, steak, sushi data
    with open(data_path / "txt-books.zip", "wb") as f:
        request = requests.get("https://github.com/mohamedyosef101/harvardx-python-for-research/raw/main/Language%20Processing/txt-books.zip")
        print("Downloading books...")
        f.write(request.content)

    # Unzip txt-books
    with zipfile.ZipFile(data_path / "txt-books.zip", "r") as zip_ref:
        print("Unzipping books...")
        zip_ref.extractall(books_path)


Did not find data/txt-books directory, creating one...
Downloading books...
Unzipping books...


In [14]:
def read_book(book_path):
  """
  Read a book and return it as a string.
  """
  with open(book_path, "r", encoding="utf8") as f:
    book = f.read()
    book = book.replace("\n", "").replace("\r", "").replace("  ", " ")
  return book

In [15]:
rj_book_path = "./data/txt-books/English/shakespeare/Romeo and Juliet.txt"
rj_book = read_book(rj_book_path)
len(rj_book)

161572

In [16]:
# search for line
rj_book.find("What's in a name?")

40797

In [17]:
# see the big context
idx = rj_book.find("What's in a name?")
big_context = rj_book[idx-100: idx+1000]
big_context

"hand, nor foot,  Nor arm, nor face, nor any other part  Belonging to a man. O, be some other name!  What's in a name? That which we call a rose  By any other name would smell as sweet.  So Romeo would, were he not Romeo call'd,  Retain that dear perfection which he owes  Without that title. Romeo, doff thy name;  And for that name, which is no part of thee,  Take all myself. Rom. I take thee at thy word.  Call me but love, and I'll be new baptiz'd;  Henceforth I never will be Romeo. Jul. What man art thou that, thus bescreen'd in night,  So stumblest on my counsel? Rom. By a name  I know not how to tell thee who I am.  My name, dear saint, is hateful to myself,  Because it is an enemy to thee.  Had I it written, I would tear the word. Jul. My ears have yet not drunk a hundred words  Of that tongue's utterance, yet I know the sound.  Art thou not Romeo, and a Montague? Rom. Neither, fair saint, if either thee dislike. Jul. How cam'st thou hither, tell me, and wherefore?  The orchard wa

In [72]:
def word_stats(word_counts, prints=False):
  "Return number of unique words and word frequencies."
  num_unique = len(word_counts)
  counts = word_counts.values() # every dic. has keys and values
  most_freq, rep_times = max(word_counts), max(counts)
  if prints:
    print(f"There are {num_unique} unique words in the book" +
          f"\nThe most frequent word is: {most_freq}" +
          f" (repeated {rep_times} times)")
  return num_unique, most_freq.replace("\ufeff", ""), rep_times

In [73]:
num_unique, most_freq, rep_times = word_stats(count_words(rj_book))

In [74]:
word_stats(count_words(rj_book), prints=True)

There are 4671 unique words in the book
The most frequent word is: ﻿the (repeated 4480 times)


(4671, 'the', 4480)

# Reading Multiple files

If you take a look at the structure of the book directory, you will find that it looks like the following:

```
txt-books/
  language/
    author/
      book title
```

In [75]:
import os
books_dir = 'data/txt-books'

for language in os.listdir(books_dir):
  for author in os.listdir(books_dir + "/" + language):
    for title in os.listdir(books_dir + "/" + language + "/" + author):
      inputfile = books_dir + "/" + language + "/" + author + "/" + title
      print(f" Reading {inputfile}...")
      book = read_book(inputfile)
      num_unique, most_freq, rep_times = word_stats(count_words(book))

 Reading data/txt-books/French/de Maupassant/Contes de la Becasse.txt...
 Reading data/txt-books/French/de Maupassant/Le Horla.txt...
 Reading data/txt-books/French/de Maupassant/Boule de Suif.txt...
 Reading data/txt-books/French/de Maupassant/L'inutile beautÇ.txt...
 Reading data/txt-books/French/de Maupassant/La Main Gauche.txt...
 Reading data/txt-books/French/de Maupassant/Claire de Lune.txt...
 Reading data/txt-books/French/de Maupassant/Œuvres complètes de Guy de Maupassant.txt...
 Reading data/txt-books/French/de Maupassant/La Maison Tellier.txt...
 Reading data/txt-books/French/de Maupassant/La petite roque.txt...
 Reading data/txt-books/French/diderot/Entretien d'un päre avec ses enfants.txt...
 Reading data/txt-books/French/diderot/Regrets sur ma vieille robe de chambre.txt...
 Reading data/txt-books/French/diderot/Les deux amis de Bourbonne.txt...
 Reading data/txt-books/French/diderot/Ceci n'est pas un conte.txt...
 Reading data/txt-books/French/diderot/L'oiseau blanc.txt.

# Using pandas