In [8]:
# Set the PySpark environment variables
import os
os.environ['SPARK_HOME'] = "/home/rajesh/CSL7100/PySpark/spark-3.4.2-bin-hadoop3"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [10]:
spark = SparkSession.builder \
    .appName("DataFrame-Gutenberg-Network") \
    .master("local[6]") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.shuffle.partitions", "24") \
    .getOrCreate()

### Using RDDs

In [18]:
from pyspark.sql.functions import regexp_extract

books_df = (
    spark.sparkContext
         .wholeTextFiles("/home/rajesh/CSL7100/Assignment1/data/D184MB/*.txt") #reads the whole text files from specified path
         .toDF(["file_path", "text"]) #convert RDD to data frame
         .withColumn(
             "file_name",  # add a new column calle file_name and extract it using regular expression
             regexp_extract("file_path", r"([^/]+$)", 1)
         )
         .select("file_name", "text") #keep file_name and text column
)



                                                                                

In [19]:
from pyspark.sql.functions import regexp_extract, col

#create a new data frame with columns
books_df_simple = (
    books_df
    .withColumn("Author",         #create
        regexp_extract("text", r"(?i)Author:\s*(.*)", 1)
    )
    .withColumn("release_date",
        regexp_extract("text", r"(?i)Release Date:\s*(.*)", 1)
    )
    .select("file_name", "Author", "release_date")
)

books_df_simple.show(truncate=False)

[Stage 8:>                                                          (0 + 1) / 1]

+---------+-------------------------------------+----------------------------+
|file_name|Author                               |release_date                |
+---------+-------------------------------------+----------------------------+
|10.txt   |                                     |March 2, 2011 [EBook #10]   |
|101.txt  |Bruce Sterling                       |January, 1994               |
|102.txt  |Mark Twain (Samuel Clemens)          |January, 1994               |
|103.txt  |Jules Verne                          |May 15, 2008 [EBook #103]   |
|104.txt  |Franklin Delano Roosevelt            |May 14, 2008 [EBook #104]   |
|105.txt  |Jane Austen                          |June 5, 2008 [EBook #105]   |
|106.txt  |Edgar Rice Burroughs                 |June 5, 2008 [EBook #106]   |
|107.txt  |Thomas Hardy                         |February, 1994  [eBook #107]|
|108.txt  |Arthur Conan Doyle                   |July 8, 2007 [EBook #108]   |
|109.txt  |Edna St. Vincent Millay              |Jun

26/02/13 18:01:02 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 8 (TID 8): Attempting to kill Python Worker
                                                                                

In [20]:
books_df_simple.unpersist(blocking=True) if books_df_simple.is_cached else None
del books_df_simple

In [21]:
from pyspark.sql.functions import (
    regexp_extract, regexp_replace,
    col, trim
)



books_df = (
    books_df
    
    
    .withColumn(
        "Author_raw",
        regexp_extract(
            col("text"),
            r"(?im)^Author:\s*([^\n\r]+)",  # First extract Authors loosely
            1
        )
    )
    
    
    .withColumn(
        "Author",
        trim(
            regexp_replace(
                col("Author_raw"),
                r",\s*AKA.*|--Pseudonym.*",   # Remove alias if present, this was seen when we inspected the data
                ""
            )
        )
    )
    
    .withColumn(
        "release_year",
        regexp_extract(          # Extract 4-digit release year only
            col("text"),
            r"(?im)^Release Date:\s*.*?(\d{4})",
            1
        ).cast("int")
    )
    
    # Remove rows with missing or empty authors
    .filter(col("Author").isNotNull() & (col("Author") != ""))
    
    # Select required columns
    .select("file_name", "Author", "release_year")
)

books_df.show(truncate=False)



[Stage 9:>                                                          (0 + 1) / 1]

+---------+-------------------------------------+------------+
|file_name|Author                               |release_year|
+---------+-------------------------------------+------------+
|101.txt  |Bruce Sterling                       |1994        |
|102.txt  |Mark Twain (Samuel Clemens)          |1994        |
|103.txt  |Jules Verne                          |2008        |
|104.txt  |Franklin Delano Roosevelt            |2008        |
|105.txt  |Jane Austen                          |2008        |
|106.txt  |Edgar Rice Burroughs                 |2008        |
|107.txt  |Thomas Hardy                         |1994        |
|108.txt  |Arthur Conan Doyle                   |2007        |
|109.txt  |Edna St. Vincent Millay              |2008        |
|11.txt   |Lewis Carroll                        |1994        |
|110.txt  |Thomas Hardy                         |1994        |
|111.txt  |Gene Stratton-Porter                 |2006        |
|112.txt  |Richard McGowan                      |1994  

26/02/13 18:01:16 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 9 (TID 9): Attempting to kill Python Worker
                                                                                

In [23]:
from pyspark.sql.functions import min, col

# Get first publication year
author_year_df = (
    books_df
    .groupBy("Author")                         # group by author
    .agg(min("release_year").alias("first_year"))  # earliest year
)
author_year_df.show(truncate=False)

[Stage 10:>                                                         (0 + 2) / 2]

+-----------------------------------+----------+
|Author                             |first_year|
+-----------------------------------+----------+
|Jane Austen                        |1994      |
|Rene Doumic                        |2006      |
|Sax Rohmer                         |2008      |
|Amy Lowell                         |2008      |
|John Gower                         |2008      |
|Kenneth Grahame                    |1995      |
|Lewis Carroll                      |1992      |
|Virginia Woolf                     |2006      |
|Robert Service                     |1995      |
|Joseph Conrad                      |1995      |
|J. Stark Munro                     |1995      |
|Henry Lawson                       |2008      |
|Cicero                             |2008      |
|William Wells Brown                |2008      |
|Coalition for Networked Information|2008      |
|Edna St. Vincent Millay            |2008      |
|Robert Louis Stevenson             |1992      |
|William Morris     

                                                                                

In [24]:
X = 5   # influence range threshold


In [25]:
from pyspark.sql.functions import col

influence_df = (
    author_year_df.alias("a")                          # first author
    .join(author_year_df.alias("b"))                   # second author
    .filter(col("a.Author") != col("b.Author"))        # exclude same author
    .filter(
        (col("b.first_year") - col("a.first_year") > 0) &   # later publication
        (col("b.first_year") - col("a.first_year") <= X)    # within X years
    )
    .select(
        col("a.Author").alias("author1"),              # influencing author
        col("b.Author").alias("author2")               # influenced author
    )
    .distinct()                                        # remove duplicates
)

influence_df.show(truncate=False)   # display edges
#author1 influenced author 2

[Stage 13:>                                                         (0 + 2) / 2]

+-----------+--------------------+
|author1    |author2             |
+-----------+--------------------+
|Jane Austen|Kenneth Grahame     |
|Jane Austen|Robert Service      |
|Jane Austen|Joseph Conrad       |
|Jane Austen|J. Stark Munro      |
|Jane Austen|Henry James         |
|Jane Austen|Project Gutenberg   |
|Jane Austen|Omar Khayyam        |
|Jane Austen|Martin Luther       |
|Jane Austen|Unknown             |
|Jane Austen|Robert Harris       |
|Jane Austen|Lao-Tze             |
|Jane Austen|Rupert Brooke       |
|Jane Austen|Geoffrey Chaucer    |
|Jane Austen|Joyce Kilmer        |
|Jane Austen|Franz Josef Haydn   |
|Jane Austen|Michael Hart        |
|Jane Austen|United States Army  |
|Jane Austen|Gene Stratton Porter|
|Jane Austen|Edith Wharton       |
|Jane Austen|Jean Clottes        |
+-----------+--------------------+
only showing top 20 rows



                                                                                

In [26]:
author_year_df.unpersist(blocking=True) if author_year_df.is_cached else None
del author_year_df

In [27]:
from pyspark.sql.functions import count

out_degree_df = (
    influence_df
    .groupBy("author1")                 # group by source
    .agg(count("*").alias("out_degree"))  # count edges
)

out_degree_df.show()


                                                                                

+--------------------+----------+
|             author1|out_degree|
+--------------------+----------+
|         Jane Austen|        82|
|         Rene Doumic|        73|
|          Sax Rohmer|         4|
|          Amy Lowell|         4|
|          John Gower|         4|
|     Kenneth Grahame|        34|
|       Lewis Carroll|       102|
|      Virginia Woolf|        73|
|      Robert Service|        34|
|       Joseph Conrad|        34|
|      J. Stark Munro|        34|
|        Henry Lawson|         4|
|              Cicero|         4|
| William Wells Brown|         4|
|Coalition for Net...|         4|
|Edna St. Vincent ...|         4|
|Robert Louis Stev...|       102|
|      William Morris|         4|
|         Henry James|        34|
| Henry W. Longfellow|        67|
+--------------------+----------+
only showing top 20 rows



In [31]:
top5_out = (
    out_degree_df
    .orderBy("out_degree", ascending=False)  # sort descending
    .limit(5)                                # top 5
)

top5_out.show(truncate=False)


[Stage 39:>                                                         (0 + 2) / 2]

+------------------+----------+
|author1           |out_degree|
+------------------+----------+
|Charles Dodgson   |116       |
|Alexander Hamilton|116       |
|Peter Mark Roget  |116       |
|Herman Melville   |116       |
|James M. Barrie   |116       |
+------------------+----------+



                                                                                

In [32]:
out_degree_df.unpersist(blocking=True) if out_degree_df.is_cached else None
del out_degree_df


In [30]:
in_degree_df = (
    influence_df
    .groupBy("author2")                 # group by target
    .agg(count("*").alias("in_degree"))   # count edges
)

in_degree_df.show()


[Stage 31:>                                                         (0 + 2) / 2]

+--------------------+---------+
|             author2|in_degree|
+--------------------+---------+
|     Kenneth Grahame|       41|
|          Sax Rohmer|       36|
|          Amy Lowell|       36|
|          John Gower|       36|
|    Homer and Hesiod|       36|
|         Jane Austen|       30|
|        Walter Scott|       21|
|         Rene Doumic|        6|
|      Robert Service|       41|
|       Joseph Conrad|       41|
|      J. Stark Munro|       41|
|Gilbert K. Cheste...|       89|
| Charles W. Chesnutt|       89|
|        John Fox Jr.|       36|
|  Thomas Nelson Page|       36|
|       Lewis Carroll|        7|
|      Virginia Woolf|        6|
|      Torquato Tasso|       41|
|       John Fox, Jr.|       89|
|        Henry Lawson|       36|
+--------------------+---------+
only showing top 20 rows



                                                                                

In [33]:
top5_in = (
    in_degree_df
    .orderBy("in_degree", ascending=False)  # sort descending
    .limit(5)                               # top 5
)

top5_in.show(truncate=False)


[Stage 44:>                                                         (0 + 2) / 2]

+--------------------+---------+
|author2             |in_degree|
+--------------------+---------+
|Jerome K. Jerome    |97       |
|J. M. Barrie        |97       |
|Kate Douglas Wiggin |89       |
|Stewart Edward White|89       |
|John Fox, Jr.       |89       |
+--------------------+---------+



                                                                                

In [34]:
in_degree_df.unpersist(blocking=True) if in_degree_df.is_cached else None
del in_degree_df

In [43]:
spark.stop()