In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install Spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz

# Unzip the Spark file to the current folder
!tar xf spark-3.0.3-bin-hadoop3.2.tgz

# Install findspark
!pip install -q findspark

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop3.2"

In [2]:
# Start a SparkSession
import findspark
findspark.init()

# Import SparkSession
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()#Create a SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Datacamp Pyspark Tutorial").master("local[*]").config("spark.memory.offHeap.enabled","true").config("spark.memory.offHeap.size","10g").getOrCreate()


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
rdd2 = spark.sparkContext.textFile(r"D:\4.2 Last\CSE 4262 LAB\Lab 1\text1.txt")
df1=spark.read.text("/content/drive/MyDrive/practice/2010-Movies-Plot.txt")

In [5]:
# Extract the text content from the DataFrame
text_rdd = df1.rdd.map(lambda row: row.value)

# Split each line into words and flatten the list of words
word_rdd = text_rdd.flatMap(lambda line: line.split())

# Map each word to a key-value pair (word, 1)
word_count_rdd = word_rdd.map(lambda word: (word, 1))

# Reduce by key to aggregate the counts of each word
word_freq_rdd = word_count_rdd.reduceByKey(lambda a, b: a + b)

# Collect the results into a Python dictionary
word_freq_dict = word_freq_rdd.collectAsMap()

# Print the word frequency dictionary
for word, freq in word_freq_dict.items():
    print(f"{word}: {freq}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Kashawk,: 1
flock: 2
phoners.: 1
illusion:: 1
Jet": 1
Joyner: 22
Halfway: 1
Olson: 3
morbidly: 2
Weirdicht: 1
room): 1
gymnasium: 1
assembly.: 1
Weirdicht;: 1
varsity: 2
"Bob: 1
Stone",: 1
Weirdicht.: 1
bidders: 1
countries,: 1
Joyner's: 4
vanished.: 1
Stanton.: 1
Badger: 1
counselor's: 1
Badger.: 3
Olson,: 1
deal's: 1
fabricates: 1
Badger,: 1
Joyner,: 1
grazing: 1
Stanton: 3
Stone;: 1
Weirdicht,: 1
relives: 2
stripping.: 1
Darla: 1
McGuchian,: 1
Angarano): 1
Pace).: 1
filmmaker's: 1
Thurman),: 1
nuptials.: 1
unravel,: 2
Livingston,: 2
Montana,: 1
Fuller,: 2
Unemployed: 1
Fuller: 18
Fuller's: 4
fault: 2
feuds: 1
employers.: 1
prepped: 1
Fuller.: 1
directive,: 1
go;: 1
sandstone: 4
sandstone,: 1
unfocused: 1
negotiation: 1
sandstone.: 1
horses: 2
Belfry.: 1
stumbled: 1
four-hour: 1
Spending: 1
didn't,: 1
dejected.: 1
Cesar: 3
Chavez's: 2
50,000: 1
braceros—temporary: 1
agriculture,: 1
farmworkers,: 1
brutality: 1
Californi

In [6]:
common_words = ["us", "has", "all", "they", "from", "who", "what", "on", "by", "more", "as", "not", "their", "can", "new", "it", "but", "be",
                "are", "--", "i", "have", "this", "will", "for", "with", "is", "that", "in", "our", "we", "a", "of",
                 "to", "and", "the", "that's", "or", "make", "do", "you", "at", "it's", "than", "if", "know", "last",
                 "about", "no", "just", "now", "an", "because", "<p>we", "why", "we'll", "how", "two", "also", "every", "come",
                 "we've", "year", "over", "get", "take", "one", "them", "we're", "need", "want", "when", "like", "most",
                 "-", "been", "first", "where", "so", "these", "they're", "good", "would", "there", "should", "-->", "<!--",
                 "up", "i'm", "his", "their", "which", "may", "were", "such", "some", "those", "was", "here",
                "she", "he", "its", "her", "his", "don't", "i've", "what's", "didn't", "shouldn't", "(applause.)", "let's", "doesn't"]

In [7]:
# Filter out common words from the word frequency vector using a single filter operation
filtered_word_freq_rdd = word_freq_rdd.filter(lambda word_freq: word_freq[0] not in common_words)

filtered_word_freq_dict = filtered_word_freq_rdd.collectAsMap()

# Print the filtered word frequency dictionary
for word, freq in filtered_word_freq_dict.items():
    print(f"{word}: {freq}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Kashawk,: 1
flock: 2
phoners.: 1
illusion:: 1
Jet": 1
Joyner: 22
Halfway: 1
Olson: 3
morbidly: 2
Weirdicht: 1
room): 1
gymnasium: 1
assembly.: 1
Weirdicht;: 1
varsity: 2
"Bob: 1
Stone",: 1
Weirdicht.: 1
bidders: 1
countries,: 1
Joyner's: 4
vanished.: 1
Stanton.: 1
Badger: 1
counselor's: 1
Badger.: 3
Olson,: 1
deal's: 1
fabricates: 1
Badger,: 1
Joyner,: 1
grazing: 1
Stanton: 3
Stone;: 1
Weirdicht,: 1
relives: 2
stripping.: 1
Darla: 1
McGuchian,: 1
Angarano): 1
Pace).: 1
filmmaker's: 1
Thurman),: 1
nuptials.: 1
unravel,: 2
Livingston,: 2
Montana,: 1
Fuller,: 2
Unemployed: 1
Fuller: 18
Fuller's: 4
fault: 2
feuds: 1
employers.: 1
prepped: 1
Fuller.: 1
directive,: 1
go;: 1
sandstone: 4
sandstone,: 1
unfocused: 1
negotiation: 1
sandstone.: 1
horses: 2
Belfry.: 1
stumbled: 1
four-hour: 1
Spending: 1
didn't,: 1
dejected.: 1
Cesar: 3
Chavez's: 2
50,000: 1
braceros—temporary: 1
agriculture,: 1
farmworkers,: 1
brutality: 1
Californi

In [11]:
sorted_word_freq_rdd = filtered_word_freq_rdd.sortBy(lambda word_freq: word_freq[1], ascending=False)
sorted_word_freq_list = sorted_word_freq_rdd.collect()
top_30_words_rdd = sorted_word_freq_rdd.take(30)

for word, freq in top_30_words_rdd:
    print(f"{word}: {freq}")

him: 2892
The: 2426
into: 1544
out: 1390
He: 1290
after: 1068
She: 963
tells: 946
then: 861
After: 849
while: 836
In: 787
before: 763
back: 737
had: 723
They: 710
When: 669
finds: 656
being: 654
him.: 629
find: 624
As: 622
only: 540
other: 540
off: 539
go: 529
takes: 522
goes: 501
her.: 498
next: 455
