# Pyspark Install

In [8]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Task 1

In [18]:
import numpy as np
from pyspark import SparkContext

# Initialize SparkContext
spark_context = SparkContext.getOrCreate()
np.random.seed(3)
n = np.random.randint(0, 11, 100)

# Create RDD using parallelize function
rdd = spark_context.parallelize(n)
function = rdd.countByValue()

for n, count in function.items():
    print(f"{n} come into view {count} times.")

10 come into view 10 times.
8 come into view 13 times.
9 come into view 5 times.
3 come into view 4 times.
0 come into view 15 times.
5 come into view 11 times.
7 come into view 10 times.
6 come into view 4 times.
4 come into view 8 times.
1 come into view 13 times.
2 come into view 7 times.


# Task 2

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
from pyspark import SparkContext
import re

# Initialize SparkContext
spark_context = SparkContext.getOrCreate()
text = spark_context.textFile('/content/drive/MyDrive/Assigmnet/text8/text8')

# Split each line into words 
words_rdd = text.flatMap(lambda line: re.findall(r'\w+', line.lower()))
freq = words_rdd.countByValue()

# Filter words containing the letter 'a'. We can choose any letter instead of a
words_freq = words_rdd.filter(lambda word: 'a' in word).countByValue()

for w, c in freq.items(): # where w for word and c for count
    print(f"'{w}' appears {c} times.")
    
for w, c in words_freq.items():
    print(f"'{w}' letter 'a' appears {c} times.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
'arml' letter 'a' appears 1 times.
'amatyc' letter 'a' appears 1 times.
'orowan' letter 'a' appears 1 times.
'erraticimpact' letter 'a' appears 1 times.
'polanyiana' letter 'a' appears 1 times.
'denaturant' letter 'a' appears 2 times.
'autothermal' letter 'a' appears 1 times.
'bioalcohol' letter 'a' appears 2 times.
'denitrification' letter 'a' appears 2 times.
'butterfat' letter 'a' appears 5 times.
'indigestability' letter 'a' appears 1 times.
'pantothenic' letter 'a' appears 1 times.
'counterclaim' letter 'a' appears 1 times.
'kradjian' letter 'a' appears 1 times.
'ladled' letter 'a' appears 1 times.
'resealable' letter 'a' appears 1 times.
'paks' letter 'a' appears 1 times.
'unrefrigerated' letter 'a' appears 2 times.
'somatotropin' letter 'a' appears 1 times.
'pagaent' letter 'a' appears 1 times.
'strippergates' letter 'a' appears 1 times.
'biomagnetism' letter 'a' appears 1 times.
'magnetoreceptor' letter 'a' appear

# Data Frame Task 

In [20]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("IrisData").getOrCreate()
data = spark.read.json("/content/drive/MyDrive/Assigmnet/iris.json")

In [21]:
from pyspark.sql.functions import corr
data.select(corr("petalLength", "petalWidth")).show()

+-----------------------------+
|corr(petalLength, petalWidth)|
+-----------------------------+
|            0.962865431402796|
+-----------------------------+



In [22]:
data.filter(data.petalLength >= 1.4).select("sepalLength", "sepalWidth", "species").show()

+-----------+----------+-------+
|sepalLength|sepalWidth|species|
+-----------+----------+-------+
|        5.1|       3.5| setosa|
|        4.9|       3.0| setosa|
|        4.6|       3.1| setosa|
|        5.0|       3.6| setosa|
|        5.4|       3.9| setosa|
|        4.6|       3.4| setosa|
|        5.0|       3.4| setosa|
|        4.4|       2.9| setosa|
|        4.9|       3.1| setosa|
|        5.4|       3.7| setosa|
|        4.8|       3.4| setosa|
|        4.8|       3.0| setosa|
|        5.7|       4.4| setosa|
|        5.1|       3.5| setosa|
|        5.7|       3.8| setosa|
|        5.1|       3.8| setosa|
|        5.4|       3.4| setosa|
|        5.1|       3.7| setosa|
|        5.1|       3.3| setosa|
|        4.8|       3.4| setosa|
+-----------+----------+-------+
only showing top 20 rows

