# Pyspark

## Exporing worldle game results dataset 
https://www.powerlanguage.co.uk/wordle/

In [1]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/home/pkeskin/spark-3.3.1-bin-hadoop3"

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) #  This will format our output tables a bit nicer when not using the show() method
spark

22/12/05 17:56:15 WARN Utils: Your hostname, DESKTOP-1IULVBG resolves to a loopback address: 127.0.1.1; using 172.18.194.20 instead (on interface eth0)
22/12/05 17:56:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/05 17:56:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
import multiprocessing
print(multiprocessing.cpu_count())

8


In [5]:
# Creating our SparkContext:
sc = spark.sparkContext

checking what input looks like

In [6]:
! tail -n 5 tweets.tsv

234	1491254933810728970	2022-02-09 03:37:37+00:00	ScoutsScop3	Wordle 234 6/6  ⬛🟩🟨🟨⬛ 🟨🟩⬛⬛🟩 ⬛🟩🟩⬛🟩 ⬛🟩🟩⬛🟩 ⬛🟩🟩⬛🟩 🟩🟩🟩🟩🟩  Almost took the L today 😂 but thanks to le gf, we stayed in the game
234	1491254936881045507	2022-02-09 03:37:37+00:00	Tempeteenjuin	Wordle 234 5/6  ⬜🟩🟨🟨⬜ ⬜🟩🟩⬜🟩 ⬜🟩🟩⬜🟩 ⬜🟩🟩⬜🟩 🟩🟩🟩🟩🟩
234	1491254937153593345	2022-02-09 03:37:37+00:00	minv_twt	Wordle 234 4/6  🟩⬛⬛⬛⬛ 🟩🟩⬛⬛⬛ 🟩🟩🟨🟨⬛ 🟩🟩🟩🟩🟩
234	1491254941083705345	2022-02-09 03:37:38+00:00	machucki_d	Wordle 234 2/6  ⬛⬛⬛⬛⬛ 🟩🟩🟩🟩🟩
234	1491254942639788034	2022-02-09 03:37:39+00:00	zilla_jones	Wordle 234 4/6  ⬜🟩🟩⬜🟩 ⬜🟩🟩⬜🟩 ⬜🟩🟩⬜🟩 🟩🟩🟩🟩🟩Booyah! (There are two many possibilities for those two letters.)


### 1. Launch Spark and create your RDD/DataFrame from the input file.

In [7]:
df = spark.read.options(delimiter="\t", header=True, inferSchema=True).csv("tweets.tsv")

                                                                                

In [8]:
df.printSchema()

root
 |-- wordle_id: integer (nullable = true)
 |-- tweet_id: long (nullable = true)
 |-- tweet_date: timestamp (nullable = true)
 |-- tweet_username: string (nullable = true)
 |-- tweet_text: string (nullable = true)



In [9]:
df.select("*").show(5)

+---------+-------------------+-------------------+--------------+--------------------+
|wordle_id|           tweet_id|         tweet_date|tweet_username|          tweet_text|
+---------+-------------------+-------------------+--------------+--------------------+
|      210|1482553374591660037|2022-01-16 03:20:43|      bpszebes|Wordle 210 4/6  ⬛...|
|      210|1482553387937898499|2022-01-16 03:20:46|    cruisecoup|Wordle 210 4/6  ⬜...|
|      210|1482553422276698113|2022-01-16 03:20:55|    DestroVega|Wordle 210 4/6  ⬜...|
|      210|1482553436910628866|2022-01-16 03:20:58|   brenmardash|Wordle 210 3/6  ⬜...|
|      210|1482553445726908420|2022-01-16 03:21:00|   KatieHowse2|Wordle 210 3/6  ⬛...|
+---------+-------------------+-------------------+--------------+--------------------+
only showing top 5 rows



### 2. Which Wordle puzzle in the dataset was the most tweeted about?

In [10]:
df.groupBy("wordle_id").count().orderBy('count', ascending=0).show(1)



+---------+-----+
|wordle_id|count|
+---------+-----+
|      223|15776|
+---------+-----+
only showing top 1 row



                                                                                

### 3. How many times do the words ”play”, ”the” and ”wordle” occur in the tweet text column? Return the results in descending order of count. Note: Your answer should be case-insensitive e.g. ”wordle”, ”Wordle” and ”WORDLE” could all be counted as ”wordle”).

In [11]:
import re
b = r'(\s|^|$)' 
df.select("tweet_text").rdd.flatMap(lambda x: x.tweet_text.split(" "))\
.filter(lambda word: len(re.findall(b +"play"+b,word.lower())+re.findall(b +"the"+b,word.lower())+re.findall(b +"wordle"+b,word.lower()))>0)\
.map(lambda word: (word, 1))\
.reduceByKey(lambda x, y: x + y).sortBy(lambda t: t[1], False).collect()

                                                                                

[('Wordle', 137362),
 ('the', 3372),
 ('wordle', 720),
 ('The', 241),
 ('play', 182),
 ('THE', 81),
 ('WORDLE', 37),
 ('Play', 3),
 ('PLAY', 1)]

### 4. On what day of the week did people tweet the most games? Hint: Use the date format method and Datetime Patterns.

In [12]:
from pyspark.sql.functions import dayofweek, count

In [15]:
(df.groupBy(dayofweek("tweet_date").alias("dayofweek"))
.agg(count(dayofweek("tweet_date")).alias("count"))
.orderBy('count', ascending=0).show(1))

+---------+-----+
|dayofweek|count|
+---------+-----+
|        6|28737|
+---------+-----+
only showing top 1 row

