In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import lit

# Spark configuration: application name plus a local master with 4 threads.
session_conf = SparkConf().setAppName("Parquet app").setMaster("local[4]")

# Reuse an existing session if there is one, otherwise create it from the config above.
spark = SparkSession.builder.config(conf=session_conf).getOrCreate()

In [2]:
# Download data from https://huggingface.co/datasets/Splend1dchan/phone-squad-parquet/tree/main
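# -> download train.parquet; it is used here only as example data to manipulate.
# NOTE: the load cell below reads ../data/mydata2/texts.parquet -- presumably the downloaded file saved under that name.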

## About the dataset 
The Stanford Question Answering Dataset (SQuAD) is a reading-comprehension dataset consisting of questions posed by crowdworkers on a set of Wikipedia articles. The answer to every question is a segment of text, or span, from the corresponding reading passage; alternatively, the question may be unanswerable.

In [9]:
# Read the parquet file and keep only the five columns used in this notebook.
df = (
    spark.read
    .parquet("../data/mydata2/texts.parquet")
    .select("id", "title", "context", "question", "answers")
)

In [10]:
# Total number of rows in the loaded DataFrame.
df.count()

87599

In [11]:
# Take a look at the schema of our current DataFrame (column names, types, nullability).
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- context: string (nullable = true)
 |-- question: string (nullable = true)
 |-- answers: struct (nullable = true)
 |    |-- text: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- answer_start: array (nullable = true)
 |    |    |-- element: integer (containsNull = true)



In [12]:
# The same schema as a StructType object instead of a printed tree.
df.schema

StructType([StructField('id', StringType(), True), StructField('title', StringType(), True), StructField('context', StringType(), True), StructField('question', StringType(), True), StructField('answers', StructType([StructField('text', ArrayType(StringType(), True), True), StructField('answer_start', ArrayType(IntegerType(), True), True)]), True)])

In [13]:
# Column names as a plain Python list.
df.columns

['id', 'title', 'context', 'question', 'answers']

In [14]:
# Create a Column object that refers to "title" by name; it is not tied to any DataFrame here.
F.col("title")

Column<'title'>

In [15]:
# column() produces the same Column as col() for this name.
F.column("title")

Column<'title'>

In [16]:
# Build a Column from a SQL expression string; nothing is evaluated against data here.
# NOTE(review): the unquoted `A` is presumably parsed as a column name, not the string 'A' -- confirm.
F.expr("title + A")

Column<'(title + A)'>

In [17]:
# Apply Python's `+` to a Column and a str; the resulting Column displays the same way as the expr() form above.
F.expr("title") + "A"

Column<'(title + A)'>

In [18]:
# refer to a specific DataFrame’s column
df.col("id")

AttributeError: 'DataFrame' object has no attribute 'col'

In [19]:
# Fetch the first row as a Row object (the nested `answers` struct is itself a Row).
df.first()

Row(id='5733be284776f41900661182', title='University_of_Notre_Dame', context='Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', question='To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', answers=Row(text=['Saint Bernadette Soubirous'], answer_start=[515]))

In [21]:
# Select columns by name and print one row without truncating long values.
df.select("id", "title").show(1, truncate=False)

+------------------------+------------------------+
|id                      |title                   |
+------------------------+------------------------+
|5733be284776f41900661182|University_of_Notre_Dame|
+------------------------+------------------------+
only showing top 1 row



In [22]:
# expr(), col() and column() can be mixed freely inside a single select().
(
    df
    .select(F.expr("id"), F.col("title"), F.column("question"))
    .show(2, truncate=False)
)

+------------------------+------------------------+-----------------------------------------------------------------------+
|id                      |title                   |question                                                               |
+------------------------+------------------------+-----------------------------------------------------------------------+
|5733be284776f41900661182|University_of_Notre_Dame|To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?|
|5733be284776f4190066117f|University_of_Notre_Dame|What is in front of the Notre Dame Main Building?                      |
+------------------------+------------------------+-----------------------------------------------------------------------+
only showing top 2 rows



In [23]:
# Concatenate id and title with a " --- " separator. concat_ws is used because `+`
# inside a SQL expression is arithmetic addition, not string concatenation.
df.select(
      F.concat_ws(" --- ", F.col("id"), F.col("title")).alias("id_title"),
      F.column("context"))\
.show(2, False)

In [24]:
# Rename the returned column inside the SQL expression itself.
df.select(F.expr("id as row_id")).show(n=1, truncate=False)

+------------------------+
|row_id                  |
+------------------------+
|5733be284776f41900661182|
+------------------------+
only showing top 1 row



In [25]:
# Rename the column in the SQL expression, then restore the original name with alias().
df.select(
    F.expr("id as row_id").alias("id")
).show(1, truncate=False)

+------------------------+
|id                      |
+------------------------+
|5733be284776f41900661182|
+------------------------+
only showing top 1 row



In [26]:
# selectExpr() takes SQL expression strings directly: the renamed id next to the original id.
df.selectExpr("id as row_id", "id").show(1, truncate=False)

+------------------------+------------------------+
|row_id                  |id                      |
+------------------------+------------------------+
|5733be284776f41900661182|5733be284776f41900661182|
+------------------------+------------------------+
only showing top 1 row



In [27]:
# Keep every original column ("*") and append a boolean column comparing title with itself.
(
    df
    .selectExpr("*", "(title = title) as new_col")
    .show(2)
)

+--------------------+--------------------+--------------------+--------------------+--------------------+-------+
|                  id|               title|             context|            question|             answers|new_col|
+--------------------+--------------------+--------------------+--------------------+--------------------+-------+
|5733be284776f4190...|University_of_Not...|Architecturally, ...|To whom did the V...|{[Saint Bernadett...|   true|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What is in front ...|{[a copper statue...|   true|
+--------------------+--------------------+--------------------+--------------------+--------------------+-------+
only showing top 2 rows



### Get unique rows in the dataset

####  select count of unique titles

In [37]:
# Count the distinct titles with a SQL aggregate expression (the result is a single row).
df.selectExpr("count(distinct(title))").show(2)

+---------------------+
|count(DISTINCT title)|
+---------------------+
|                  442|
+---------------------+



In [38]:
# The same distinct-title count using DataFrame methods instead of a SQL expression.
df.select("title").distinct().count()

442

In [44]:
# All titles: row count with only the title column selected, duplicates included.
df.select("title").count()

87599

So we have 442 unique titles across 87,599 rows.

In [42]:
# Display 5 distinct titles without truncating them.
df.select("title").distinct().show(5, truncate=False)

+----------------------------------------------+
|title                                         |
+----------------------------------------------+
|Hanover                                       |
|Sino-Tibetan_relations_during_the_Ming_dynasty|
|Alps                                          |
|Gymnastics                                    |
|Saint_Barth%C3%A9lemy                         |
+----------------------------------------------+
only showing top 5 rows



In [47]:
# Preview the first 5 rows; long values are truncated in this display.
df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|             context|            question|             answers|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|5733be284776f4190...|University_of_Not...|Architecturally, ...|To whom did the V...|{[Saint Bernadett...|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What is in front ...|{[a copper statue...|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|The Basilica of t...|{[the Main Buildi...|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What is the Grott...|{[a Marian place ...|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What sits on top ...|{[a golden statue...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



#### Select unique title, questions

In [48]:
# Number of distinct (title, question) pairs.
df.select("title", "question").distinct().count()

87370

In [49]:
# Rows that repeat an existing (title, question) pair: total rows minus distinct pairs.
# Computed from the data rather than from hard-coded counts so it cannot go stale.
df.count() - df.select("title", "question").distinct().count()

229

Out of 87,599 rows, 229 repeat a (title, question) pair that already appears in another row. Let's see whether (title, question, answers) is duplicated too, or whether the same question is answered differently.

In [50]:
# Number of distinct (title, question, answers) combinations.
df.select("title", "question", "answers").distinct().count()

87507

In [51]:
# Rows that repeat an existing (title, question, answers) combination.
# Computed from the data rather than from hard-coded counts so it cannot go stale.
df.count() - df.select("title", "question", "answers").distinct().count()

92

So 92 rows repeat a (title, question, answers) combination that already appears in another row.

In [52]:
# Number of distinct rows when all columns are compared.
df.distinct().count()

87599

However, there are no fully duplicated rows, so one of the columns not compared above (id or context) must be what differs.

In [54]:
# Distinct count once id is added to the compared columns.
df.select("id", "title", "question", "answers").distinct().count()

87599

So it is the id that differs.

In [55]:
# Distinct count with context in place of id among the compared columns.
df.select("title", "context", "question", "answers").distinct().count()

87507

These 92 duplicated question–answer rows also share the same context.
<br>
Next, let's remove these rows.

### Remove duplicate question answers

In [62]:
df = df.dropDuplicates(["title", "context", "question", "answers"])

In [63]:
df.count()

87507

### Adding columns, filtering, sampling and aggregating

In [29]:
# Append a constant column to every row (`lit` is imported in the top import cell).
# SQL equivalent: SELECT *, 1 as One FROM dfTable LIMIT 2
df.select(F.expr("*"), lit(1).alias("One")).show(2)

+--------------------+--------------------+--------------------+--------------------+--------------------+---+
|                  id|               title|             context|            question|             answers|One|
+--------------------+--------------------+--------------------+--------------------+--------------------+---+
|5733be284776f4190...|University_of_Not...|Architecturally, ...|To whom did the V...|{[Saint Bernadett...|  1|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What is in front ...|{[a copper statue...|  1|
+--------------------+--------------------+--------------------+--------------------+--------------------+---+
only showing top 2 rows



In [30]:
# Append a boolean column computed from a SQL expression.
df.withColumn("withintitle", F.expr("title == title")).show(2)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|                  id|               title|             context|            question|             answers|withintitle|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
|5733be284776f4190...|University_of_Not...|Architecturally, ...|To whom did the V...|{[Saint Bernadett...|       true|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What is in front ...|{[a copper statue...|       true|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+
only showing top 2 rows



In [31]:
# Keep only the rows for one title, using filter().
df.filter(F.col("title") == "University_of_Notre_Dame").show(n=2)


+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|             context|            question|             answers|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|5733be284776f4190...|University_of_Not...|Architecturally, ...|To whom did the V...|{[Saint Bernadett...|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What is in front ...|{[a copper statue...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [32]:
# The same row selection written with where().
(
    df
    .where(F.col("title") == "University_of_Notre_Dame")
    .show(2)
)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  id|               title|             context|            question|             answers|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|5733be284776f4190...|University_of_Not...|Architecturally, ...|To whom did the V...|{[Saint Bernadett...|
|5733be284776f4190...|University_of_Not...|Architecturally, ...|What is in front ...|{[a copper statue...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [33]:
# Chained where() calls must all hold; these two conditions contradict each other,
# so the result is empty.
(
    df
    .where(F.col("title") == "University_of_Notre_Dame")
    .where(F.col("title") != "University_of_Notre_Dame")
    .show(2)
)

+---+-----+-------+--------+-------+
| id|title|context|question|answers|
+---+-----+-------+--------+-------+
+---+-----+-------+--------+-------+



In [34]:
# Number of distinct (title, id) pairs.
df.select("title", "id").distinct().count()

87599

In [35]:
# Draw a seeded sample of roughly half the rows, without replacement, and count it.
withReplacement = False
fraction = 0.5
seed = 5
df.sample(withReplacement=withReplacement, fraction=fraction, seed=seed).count()


43954

In [None]:
# Per-title aggregate: how many ids each title has, plus the list of those ids.
df2 = (
    df
    .groupby("title")
    .agg(
        F.count("id").alias("count"),
        F.collect_list("id").alias("ids"),
    )
)

In [None]:
# Mark the aggregated DataFrame for caching; the following cells query it more than once.
# NOTE(review): no matching unpersist() appears in the visible cells.
df2.persist()

In [None]:
# Fetch one aggregated row whose count is below 25.
df2.filter(F.col("count") < 25).head()

In [None]:
# Bring every aggregated row to the driver as a Python list of Row objects.
# NOTE(review): `x` is not used by any of the visible cells.
x = df2.collect()

In [None]:
# One-shot version of the aggregation, collecting ids as a set, limited to titles
# with fewer than 25 ids; returns up to 10 rows.
(
    df
    .groupby("title")
    .agg(F.count("id").alias("count"), F.collect_set("id").alias("ids"))
    .filter(F.col("count") < 25)
    .head(10)
)


In [None]:
# Register df as a temp view so it can be queried with SQL.
# createOrReplaceTempView keeps this cell re-runnable: plain createTempView raises
# an error when a view with the same name already exists.
df.createOrReplaceTempView("table")

In [None]:
# SQL form of the per-title aggregation, filtered with HAVING.
# `title` is selected as well so that each result row can be attributed to its
# group, matching the columns produced by the DataFrame groupby version.
spark.sql(
"""
SELECT title, count(id) as count, collect_list(id) as ids
FROM table
GROUP BY title
HAVING count < 25
"""
).head(10)

In [None]:
# The same query written with a CTE: aggregate first, then filter the aggregated rows
# with WHERE. `title` is selected in the CTE so each result row can be attributed to
# its group, matching the columns produced by the DataFrame groupby version.
spark.sql(
"""
WITH temp AS
(SELECT title, count(id) as count, collect_list(id) as ids
FROM table
GROUP BY title
)
SELECT *
FROM temp
WHERE count < 25
"""
).head(10)