In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=de2c759d9309c13392cf137a2de8de4e30b907dcba0859f1a7ef8337c8b92633
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [None]:
from pyspark import SparkConf, SparkContext

# Single-threaded local master; the app name is what shows up in the Spark UI.
conf = SparkConf().setMaster("local").setAppName("word-counts")
# getOrCreate() hands back the already-running context when this cell is
# executed again, whereas constructing SparkContext(conf=conf) a second time
# in the same kernel fails because only one context may be active.
sc = SparkContext.getOrCreate(conf=conf)

In [None]:
# Reference the book as an RDD with one element per line of text.
book = sc.textFile("/content/millions_from_waste.txt")
# take(10) brings only the first ten lines to the driver instead of
# collecting the entire file just to slice it.
book.take(10)

['The Project Gutenberg eBook of Millions from Waste, by Frederick A.',
 'Talbot',
 '',
 'This eBook is for the use of anyone anywhere in the United States and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. You may copy it, give it away or re-use it under the terms',
 'of the Project Gutenberg License included with this eBook or online at',
 'www.gutenberg.org. If you are not located in the United States, you',
 'will have to check the laws of the country where you are located before',
 'using this eBook.']

In [None]:
# Naive tokenisation: split each line on whitespace and tally identical tokens.
# countByValue() returns the tally to the driver as a dict-like mapping.
word_counts = book.flatMap(lambda line: line.split()).countByValue()

# Preview at most the first 15 (token, count) entries.
for word, count in list(word_counts.items())[:15]:
    print(word, count)

The 995
Project 79
Gutenberg 22
eBook 6
of 4980
Millions 3
from 923
Waste, 1
by 529
Frederick 2
A. 7
Talbot 2
This 185
is 1828
for 1049


In [None]:
import re


def preprocess_word(word: str) -> str:
    """Normalise a single token.

    The token is lower-cased and every run of characters other than ASCII
    letters and digits is removed. A token made up only of such characters
    therefore becomes the empty string.
    """
    lowered = word.lower()
    non_alphanumeric = re.compile("[^A-Za-z0-9]+")
    return non_alphanumeric.sub("", lowered)

def preprocess_words(words: str):
    """Split a line of text on whitespace and normalise every token.

    Each token is passed through ``preprocess_word``. Tokens that consist
    solely of punctuation normalise to the empty string; those are dropped so
    that "" is never emitted (and subsequently counted) as a word.

    Returns a list of the remaining normalised tokens, in their original order.
    """
    normalized = (preprocess_word(token) for token in words.split())
    return [token for token in normalized if token != ""]


preprocess_words("The Project Gutenberg eBook of Millions from Waste, by Frederick A.")

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'millions',
 'from',
 'waste',
 'by',
 'frederick',
 'a']

In [None]:
# Re-read the source text so this cell can be run independently of the cells above.
book = sc.textFile("/content/millions_from_waste.txt")

# Same tally as before, but on tokens normalised by preprocess_words.
cleaned_tokens = book.flatMap(preprocess_words)
word_counts = cleaned_tokens.countByValue()

# Show at most the first 15 (word, count) entries.
preview = list(word_counts.items())[:15]
for word, count in preview:
    print(word, count)

the 10099
project 91
gutenberg 31
ebook 13
of 5032
millions 22
from 974
waste 435
by 571
frederick 4
a 2090
talbot 4
this 1105
is 1834
for 1093


In [None]:
book = sc.textFile("/content/millions_from_waste.txt")

# Keep the counting distributed: pair every word with 1 and sum per word,
# producing an RDD of (word, count) rather than a driver-side dictionary.
words = book.flatMap(preprocess_words)
word_counts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
# take(10) fetches only the first ten pairs instead of collecting every
# distinct word to the driver and slicing afterwards.
word_counts.take(10)

[('the', 10099),
 ('project', 91),
 ('gutenberg', 31),
 ('ebook', 13),
 ('of', 5032),
 ('millions', 22),
 ('from', 974),
 ('waste', 435),
 ('by', 571),
 ('frederick', 4)]

In [None]:
# Flip each (word, count) pair to (count, word) so that the count becomes the key.
# Despite the variable name, nothing has been sorted at this point.
word_counts_sorted = word_counts.map(lambda x: (x[1], x[0]))
# Preview ten pairs without collecting the whole RDD to the driver.
word_counts_sorted.take(10)

[(10099, 'the'),
 (91, 'project'),
 (31, 'gutenberg'),
 (13, 'ebook'),
 (5032, 'of'),
 (22, 'millions'),
 (974, 'from'),
 (435, 'waste'),
 (571, 'by'),
 (4, 'frederick')]

In [None]:
# Key by count and sort ascending, so the rarest words come first.
word_counts_sorted = word_counts.map(lambda x: (x[1], x[0])).sortByKey()
# take(10) returns the first ten elements in sorted order without
# materialising the full result on the driver.
word_counts_sorted.take(10)

[(1, 'april'),
 (1, '2022'),
 (1, '67837'),
 (1, 'english'),
 (1, 'deaurider'),
 (1, 'proofreading'),
 (1, 'team'),
 (1, 'httpswwwpgdpnet'),
 (1, 'images'),
 (1, 'generously')]

In [None]:
# Key by count and sort descending, so the most frequent words come first.
# The keyword makes the sort direction explicit instead of a bare positional False.
word_counts_sorted = word_counts.map(lambda x: (x[1], x[0])).sortByKey(ascending=False)
# Only the top ten pairs are brought back to the driver.
word_counts_sorted.take(10)

[(10099, 'the'),
 (5032, 'of'),
 (4296, 'to'),
 (2490, 'and'),
 (2225, 'in'),
 (2090, 'a'),
 (1834, 'is'),
 (1405, 'be'),
 (1364, 'it'),
 (1207, 'as')]

In [None]:
!pip install nltk

import nltk
nltk.download("stopwords")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

# Load NLTK's English stop-word list and preview its first ten entries.
# Note that some entries contain apostrophes (e.g. "you're").
stop_words = stopwords.words("english")
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
def preprocess_word(word: str) -> str:
    """Lower-case a token and strip everything except ASCII letters and digits.

    This repeats the definition given earlier in the notebook. A token with no
    letters or digits comes back as the empty string.
    """
    pattern, replacement = "[^A-Za-z0-9]+", ""
    cleaned = re.sub(pattern, replacement, word.lower())
    return cleaned

# The stop-word list contains entries with apostrophes (e.g. "you're"), but the
# text tokens have their punctuation stripped before the comparison ("youre").
# Normalise the stop words the same way so contractions are actually filtered,
# and keep them in a frozenset for constant-time membership tests.
_normalized_stop_words = frozenset(preprocess_word(entry) for entry in stop_words)


def preprocess_words(words: str):
    """Split a line on whitespace, normalise each token and drop noise.

    Every token is passed through ``preprocess_word``. Tokens that normalise to
    the empty string, and tokens that match a (normalised) stop word, are
    removed.

    Returns a list of the remaining tokens in their original order.
    """
    preprocessed = (preprocess_word(word) for word in words.split())
    return [
        word
        for word in preprocessed
        if word != "" and word not in _normalized_stop_words
    ]


preprocess_words("The Project Gutenberg eBook of Millions from Waste, by Frederick A.")

['project', 'gutenberg', 'ebook', 'millions', 'waste', 'frederick']

In [None]:
book = sc.textFile("/content/millions_from_waste.txt")

# Full pipeline with stop words removed: tokenise, count per word, then key by
# count and sort descending so the most frequent words come first.
words = book.flatMap(preprocess_words)
word_counts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
word_counts_sorted = word_counts.map(lambda x: (x[1], x[0])).sortByKey(ascending=False)
# take(10) returns just the top ten pairs rather than collecting every
# distinct word to the driver and slicing the list afterwards.
word_counts_sorted.take(10)

[(435, 'waste'),
 (346, 'one'),
 (337, 'upon'),
 (303, 'would'),
 (286, 'may'),
 (265, 'material'),
 (218, 'process'),
 (200, 'per'),
 (196, 'tons'),
 (181, 'fat')]