# Hello! This project demonstrates Apache Spark MapReduce for word count and text processing, using built-in Python 3 functions and the following libraries:
###1. PySpark
###2. findspark
###3. nltk
###By 6CIT-3:
###1. Rehan Ashraf
###2. Abdul Aman Khan
###3. Mihir Suman
###4. Amal V
###5. Burhan Pasha


###Installing PySpark ( Api for Apache Spark )


In [None]:
  %pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     317.0/317.0 MB 2.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=75f89d01d36a1b0f800d3795957fd54675549ed1d05ac8588391e59e902b47ed
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


###Installing findspark library
 It is used to initialise and configure Apache Spark for the local Python environment.

In [None]:
%pip install findspark
# Initialise findspark so the local Python environment is configured for
# Apache Spark before pyspark is imported in the following cells.
import findspark
findspark.init()

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


##### Importing SparkSession and SparkConf, which are needed to work with RDDs (Resilient Distributed Datasets)
They are used below to create the SparkSession and its SparkContext.

In [None]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf


#### Since we write local[*] as the master, Spark will use all cores of our machine. If we wrote local[4], it would work with 4 cores. Spark can run in local / YARN / Mesos / Kubernetes mode.

#### getOrCreate is used to create a SparkSession if not present.


In [None]:
# Build the session for this notebook, or reuse it if one already exists:
# the master is "local[*]" and the application is named "WordCount".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("WordCount")
    .getOrCreate()
)

###SparkContext will create the RDD from txt file

In [None]:
# Keep a handle on the session's SparkContext; it is used below to create the RDD.
sc=spark.sparkContext

### Read Datafile - romeojuliet.txt file for wordcount

In [None]:
# Path of the input text file.
# NOTE(review): the /content/ prefix looks like a Google Colab path - adjust when running elsewhere.
file="/content/romeojuliet.txt"

###Read 'file' into rdd 'shakespeare_rdd' and display first 100 lines.

In [None]:
# Load the text file as an RDD of strings, one element per line of the file.
shakespeare_rdd=sc.textFile(file)

In [None]:
# Preview: fetch the first 100 lines of the RDD as a Python list.
shakespeare_rdd.take(100)

['',
 "                    WILLIAM SHAKESPEARE'S",
 '',
 '                       ROMEO & JULIET',
 '',
 '   ADAPTED FOR THE SCREEN BY CRAIG PEARCE AND BAZ LUHRMANN',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '                                       FINAL SHOOTING SCRIPT',
 '',
 '                                             October 6, 1995',
 '',
 '',
 '',
 '',
 'EXT.  HIGHWAY.  AFTERNOON.',
 '',
 'A ribbon of freeway stretching into a blue and pink late',
 'afternoon sky. A huge dark sedan, windows tinted gold,',
 'headlights blazing, powers directly for us.',
 '',
 'CUT TO: A heavy, low-slung, pickup truck traveling toward',
 'the sedan.',
 '',
 'WIDE SHOT: Sky, freeway, the cars closing.',
 '',
 'TIGHT ON: The sedan.',
 '',
 'TIGHT ON: The pickup.',
 '',
 'Like thunderous, jousting opponents, the cars pass in a',
 'deafening cacophony of noise.',
 '',
 'INT.  TRUCK.  AFTERNOON.',
 '',
 'TIGHT ON: 

###To display number of lines in txt file

In [None]:
# Number of elements in the RDD, i.e. the number of lines read from the file.
shakespeare_rdd.count()

6247

### Cleaning RDD dataset for any redundancy

### Defining function 'lower_clean_str' for converting lines to lowercase and excluding punctuation marks

#### Remove Punctuation and Transform All Words to Lowercase.

#### To exclude all punctuation marks  and convert all words to lowercase, we wrote a function like the one below.

In [None]:
# Translation table that deletes every punctuation character. It is built
# once here so that each call performs a single pass over the string
# instead of one str.replace() pass per punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-')


def lower_clean_str(x: str) -> str:
    """Return *x* lowercased with all ASCII punctuation characters removed.

    Punctuation is deleted, not replaced by a space, so "low-slung" becomes
    "lowslung" and "shakespeare's" becomes "shakespeares". Whitespace is
    left untouched.

    Args:
        x: A line of text.

    Returns:
        The cleaned, lowercased line.
    """
    return x.lower().translate(_PUNCTUATION_TABLE)

In [None]:
# Apply lower_clean_str to every line; the cleaned RDD replaces the raw one.
shakespeare_rdd = shakespeare_rdd.map(lower_clean_str)

In [None]:
# Preview the first 40 cleaned lines.
shakespeare_rdd.take(40)

['',
 '                    william shakespeares',
 '',
 '                       romeo  juliet',
 '',
 '   adapted for the screen by craig pearce and baz luhrmann',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '                                       final shooting script',
 '',
 '                                             october 6 1995',
 '',
 '',
 '',
 '',
 'ext  highway  afternoon']

### Using split function to separate the words in all lines .

In [None]:
# Break every line on single spaces and flatten the pieces into one RDD of
# tokens. Runs of spaces and blank lines produce empty-string tokens.
shakespeare_rdd = shakespeare_rdd.flatMap(lambda line: line.split(" "))

In [None]:
# Preview the first 5 tokens (empty strings are still present at this point).
shakespeare_rdd.take(5)

['', '', '', '', '']

### Using the 'filter()' function to remove the empty strings left over by the split from the RDD dataset.

In [None]:
# Keep only the tokens that are not the empty string.
shakespeare_rdd = shakespeare_rdd.filter(lambda token: token != "")

In [None]:
# Preview the first 4 words after the empty strings were removed.
shakespeare_rdd.take(4)

['william', 'shakespeares', 'romeo', 'juliet']

### Count how many times each word occurs.
#### To make this calculation we can apply the 'reduceByKey' function, which works on (key,val) pair RDDs. Hence, we first convert “shakespeare_rdd” to a (key,val) pair RDD.

#### In this new (key,val) pair RDD (shakespeare_count), key is the word and val is 1 for each word in RDD (1 represents the number for the each word in “shakespeare_rdd”).


In [None]:
# Pair every word with an initial count of 1: (word, 1).
shakespeare_count = shakespeare_rdd.map(lambda token: (token, 1))

In [None]:
# Preview the first 100 (word, 1) pairs.
shakespeare_count.take(100)

[('william', 1),
 ('shakespeares', 1),
 ('romeo', 1),
 ('juliet', 1),
 ('adapted', 1),
 ('for', 1),
 ('the', 1),
 ('screen', 1),
 ('by', 1),
 ('craig', 1),
 ('pearce', 1),
 ('and', 1),
 ('baz', 1),
 ('luhrmann', 1),
 ('final', 1),
 ('shooting', 1),
 ('script', 1),
 ('october', 1),
 ('6', 1),
 ('1995', 1),
 ('ext', 1),
 ('highway', 1),
 ('afternoon', 1),
 ('a', 1),
 ('ribbon', 1),
 ('of', 1),
 ('freeway', 1),
 ('stretching', 1),
 ('into', 1),
 ('a', 1),
 ('blue', 1),
 ('and', 1),
 ('pink', 1),
 ('late', 1),
 ('afternoon', 1),
 ('sky', 1),
 ('a', 1),
 ('huge', 1),
 ('dark', 1),
 ('sedan', 1),
 ('windows', 1),
 ('tinted', 1),
 ('gold', 1),
 ('headlights', 1),
 ('blazing', 1),
 ('powers', 1),
 ('directly', 1),
 ('for', 1),
 ('us', 1),
 ('cut', 1),
 ('to', 1),
 ('a', 1),
 ('heavy', 1),
 ('lowslung', 1),
 ('pickup', 1),
 ('truck', 1),
 ('traveling', 1),
 ('toward', 1),
 ('the', 1),
 ('sedan', 1),
 ('wide', 1),
 ('shot', 1),
 ('sky', 1),
 ('freeway', 1),
 ('the', 1),
 ('cars', 1),
 ('closing'

### Using the 'reduceByKey' function to find frequent words in the dataset.

In [None]:
# Add up the 1s of identical words to get (word, total) pairs ...
word_totals = shakespeare_count.reduceByKey(lambda left, right: left + right)
# ... then order the pairs by their key, the word.
shakespeare_count_RBK = word_totals.sortByKey()

In [None]:
# Preview the first 40 (word, count) pairs, ordered by word.
shakespeare_count_RBK.take(40)

[('1995', 1),
 ('21', 1),
 ('6', 1),
 ('60', 2),
 ('9mm', 2),
 ('a', 563),
 ('abandoned', 1),
 ('able', 1),
 ('about', 3),
 ('above', 12),
 ('abra', 24),
 ('abras', 3),
 ('abroad', 1),
 ('abrupt', 1),
 ('abruptly', 5),
 ('absolved', 1),
 ('abuse', 2),
 ('abuses', 1),
 ('accidentally', 1),
 ('accompanied', 1),
 ('according', 1),
 ('accusation', 1),
 ('accustomed', 2),
 ('ache', 1),
 ('aches', 1),
 ('achingly', 2),
 ('acoustic', 1),
 ('across', 24),
 ('actually', 1),
 ('adagio', 1),
 ('adapted', 1),
 ('address', 1),
 ('addressed', 1),
 ('addresses', 1),
 ('adept', 1),
 ('adieu', 4),
 ('adjacent', 1),
 ('adjoining', 1),
 ('adjust', 1),
 ('admired', 1)]

####Sorting the most frequent words in descending order. As the first step, we switch (key,val) pairs as (val,key).

In [None]:
# Flip each (word, count) pair to (count, word) so the count becomes the key.
shakespeare_count_RBK = shakespeare_count_RBK.map(lambda pair: (pair[1], pair[0]))

In [None]:
# Preview the first 30 (count, word) pairs; they are still in word order.
shakespeare_count_RBK.take(30)

[(1, '1995'),
 (1, '21'),
 (1, '6'),
 (2, '60'),
 (2, '9mm'),
 (563, 'a'),
 (1, 'abandoned'),
 (1, 'able'),
 (3, 'about'),
 (12, 'above'),
 (24, 'abra'),
 (3, 'abras'),
 (1, 'abroad'),
 (1, 'abrupt'),
 (5, 'abruptly'),
 (1, 'absolved'),
 (2, 'abuse'),
 (1, 'abuses'),
 (1, 'accidentally'),
 (1, 'accompanied'),
 (1, 'according'),
 (1, 'accusation'),
 (2, 'accustomed'),
 (1, 'ache'),
 (1, 'aches'),
 (2, 'achingly'),
 (1, 'acoustic'),
 (24, 'across'),
 (1, 'actually'),
 (1, 'adagio')]

### The most common word is "the". However, words like this are stopwords, which bring no value to our analysis.

In [None]:
# Order by count, largest first, and show the ten most frequent words.
shakespeare_count_RBK.sortByKey(ascending=False).take(10)

[(1372, 'the'),
 (563, 'a'),
 (506, 'to'),
 (469, 'of'),
 (464, 'romeo'),
 (461, 'and'),
 (258, 'in'),
 (251, 'juliet'),
 (246, 'is'),
 (224, 'i')]

### Importing the nltk lib to get the list of English stopwords and exclude them.
####The 'nltk' (Natural Language Toolkit) is used for natural language processing.

In [None]:
import nltk

In [None]:
# Download the NLTK 'stopwords' corpus that is read from nltk.corpus in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Import the corpus reader under an alias so the name `stopwords` is not
# rebound from the reader to a plain list within the same cell.
from nltk.corpus import stopwords as nltk_stopwords

# List of English stopwords; later cells refer to it as `stopwords`.
stopwords = nltk_stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### After excluding stopwords, the most common word is "romeo".

In [None]:
# `stopwords` is a list, so `not in stopwords` would scan it for every
# record. Build a frozenset once so each membership test is a hash lookup.
stopword_set = frozenset(stopwords)

# Records are (count, word) pairs, so the word sits at index 1. Drop the
# stopwords, then order the remaining pairs by count, largest first.
shakespeare_count_RBK = (
    shakespeare_count_RBK
    .filter(lambda pair: pair[1] not in stopword_set)
    .sortByKey(False)
)

In [None]:
# shakespeare_count_RBK was already sorted by count in descending order
# when it was filtered in the previous cell, so the first 50 elements can
# be taken directly; sorting a second time would only repeat that work.
shakespeare_count_RBK.take(50)

[(464, 'romeo'),
 (251, 'juliet'),
 (143, 'mercutio'),
 (133, 'capulet'),
 (114, 'thou'),
 (111, 'benvolio'),
 (111, 'night'),
 (98, 'father'),
 (97, 'ext'),
 (96, 'close'),
 (96, 'nurse'),
 (92, 'cont'),
 (88, 'int'),
 (87, 'cut'),
 (84, 'car'),
 (82, 'love'),
 (81, 'laurence'),
 (79, 'tybalt'),
 (71, 'gloria'),
 (66, 'day'),
 (62, 'thy'),
 (58, 'back'),
 (56, 'thee'),
 (53, 'eyes'),
 (52, 'toward'),
 (48, 'dave'),
 (48, 'shall'),
 (47, 'face'),
 (44, 'balthasar'),
 (43, 'beach'),
 (43, 'prince'),
 (43, 'romeos'),
 (42, 'afternoon'),
 (42, 'juliets'),
 (42, 'montague'),
 (41, 'away'),
 (39, 'come'),
 (39, 'good'),
 (39, 'hand'),
 (38, 'captain'),
 (38, 'sampson'),
 (38, 'turns'),
 (37, 'gun'),
 (36, 'boys'),
 (34, 'man'),
 (33, 'door'),
 (33, 'ill'),
 (31, 'looks'),
 (30, 'black'),
 (29, 'gregory')]

###Thank You!!
 😄😄