In [23]:
import os
import nltk
import spacy
import math
import pandas as pd
from IPython.display import Markdown, display
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Never truncate the column list when pandas displays a DataFrame.
pd.set_option('display.max_columns', None)

# Load spaCy's small English pipeline once; its `.ents` output is used by
# `calculate_entity_density` to count the entities in a summary.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Switch the working directory to the parent of the current one so that the
# relative './data/...' path used later resolves from there.
# NOTE: not idempotent - every execution of this cell moves up one more level.
os.chdir(os.path.dirname(os.getcwd()))

In [4]:
def calculate_entity_density(sentence:str):
    """Measure how entity-dense a piece of text is.

    Tokens are counted with ``nltk.word_tokenize`` and entities with the
    ``.ents`` of the module-level spaCy pipeline ``nlp``.

    Args:
        sentence: Text to analyse (may span several sentences).

    Returns:
        A ``(token_count, entity_count, entity_density)`` tuple, where the
        density is ``entity_count / token_count`` rounded to 3 decimal
        places, or ``0.0`` when the text contains no tokens.
    """
    token_count = len(nltk.word_tokenize(sentence))
    entity_count = len(nlp(sentence).ents)

    # Empty or whitespace-only text produces zero tokens; report a density of
    # 0.0 instead of raising ZeroDivisionError.
    if token_count == 0:
        return token_count, entity_count, 0.0

    return token_count, entity_count, round(entity_count / token_count, 3)

In [5]:
from pydantic import BaseModel,Field,field_validator
from typing import List

In [6]:
class InitialSummary(BaseModel):
    """
    This is an initial summary which should be long ( 4-5 sentences, ~80 words)
    yet highly non-specific, containing little information beyond the entities marked as missing.
    Use overly verbose languages and fillers (Eg. This article discusses) to reach ~80 words.
    """

    # NOTE(review): pydantic exposes the class docstring and the field
    # description in the model's JSON schema, which presumably reaches the LLM
    # through `response_model` - treat both as prompt text, not just comments.

    # The verbose, low-information starting summary. The description is built
    # with implicit string concatenation so that source indentation does not
    # leak into the text (a backslash continuation inside the literal would
    # embed the next line's leading spaces).
    summary: str = Field(
        ...,
        description=(
            "This is a summary of the article provided which is overly verbose and uses fillers. "
            "It should be roughly 80 words in length"
        ),
    )

In [11]:
class RewrittenSummary(BaseModel):
    """
    This is a new, denser summary of identical length which covers every entity
    and detail from the previous summary plus the Missing Entities.

    Guidelines
    - Make every word count : Rewrite the previous summary to improve flow and make space for additional entities
    - Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
    - The new summary should be highly dense and concise yet self-contained, eg., easily understood without the Article.
    - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses"
    - Missing entities can appear anywhere in the new summary

    An Entity is a real-world object that's assigned a name - for example, a person, country a product or a book title.
    """

    # NOTE(review): pydantic exposes the class docstring and the field
    # descriptions in the model's JSON schema, which presumably reaches the LLM
    # through `response_model` - treat them as prompt text, not just comments.

    # The rewritten, denser summary text (required).
    summary: str = Field(
        ...,
        description="This is a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities. It should have the same length ( ~ 80 words ) as the previous summary and should be easily understood without the Article",
    )
    # Entities that were dropped relative to the previous summary. Optional:
    # defaults to an empty list. Only `default_factory` is given - combining it
    # with the required marker `...` is contradictory - which also keeps this
    # declaration consistent with `missing` below.
    absent: List[str] = Field(
        default_factory=list,
        description="this is a list of Entities found absent from the new summary that were present in the previous summary",
    )
    # Entities the model suggests adding in the next rewrite. Optional:
    # defaults to an empty list.
    missing: List[str] = Field(
        default_factory=list,
        description="This is a list of 1-3 informative Entities from the Article that are missing from the new summary which should be included in the next generated summary.",
    )
        
    
    # @field_validator("summary")
    # def min_length(cls, v: str):
    #     tokens = nltk.word_tokenize(v) 
    #     num_tokens = len(tokens)
    #     if num_tokens < 60:
    #         raise ValueError(
    #             "The current summary is too short. Please make sure that you generate a new summary that is around 80 words long."
    #         )
    #     return v
    
    # @field_validator("missing")
    # def has_missing_entities(cls, missing_entities: List[str]):
    #     if len(missing_entities) == 0:
    #         raise ValueError(
    #             "You must identify 1-3 informative Entities from the Article which are missing from the previously generated summary to be used in a new summary"
    #         )
    #     return missing_entities
    
    # @field_validator("absent")
    # def has_no_absent_entities(cls, absent_entities: List[str]):
    #     absent_entity_string = ",".join(absent_entities)
    #     if len(absent_entities) > 0:
    #         print(f"Detected absent entities of {absent_entity_string}")
    #         raise ValueError(
    #             f"Do not omit the following Entities {absent_entity_string} from the new summary"
    #         )
    #     return absent_entities
    
    # @field_validator("summary")
    # def min_entity_density(cls, v: str):
    #     tokens = nltk.word_tokenize(v)
    #     num_tokens = len(tokens)
    
    #     # Extract Entities
    #     doc = nlp(v) 
    #     num_entities = len(doc.ents)
    
    #     density = num_entities / num_tokens
    #     if density < 0.08: 
    #         raise ValueError(
    #             f"The summary of {v} has too few entities. Please regenerate a new summary with more new entities added to it. Remember that new entities can be added at any point of the summary."
    #         )
    
    #     return v

In [12]:
from openai import OpenAI
import instructor

# Shared OpenAI client wrapped by instructor; presumably this wrapping is what
# lets `summarize_article` pass `response_model` / `max_retries` to
# `client.chat.completions.create` and get pydantic objects back.
# OpenAI() is built without arguments, so credentials must come from the
# environment/default configuration.
client = instructor.patch(OpenAI()) 

def summarize_article(
    article: str,
    summary_steps: int = 3,
    model: str = "gpt-4-1106-preview",
) -> List[str]:
    """Build a chain of increasingly entity-dense summaries of ``article``.

    A deliberately verbose, non-specific summary is generated first. It is
    then rewritten ``summary_steps`` times; each rewrite receives the article,
    the latest summary and, from the second rewrite onward, the entities the
    previous rewrite reported as still missing.

    Args:
        article: Full text of the article to summarize.
        summary_steps: Number of rewrite passes after the initial summary.
        model: Name of the chat model used for every request.

    Returns:
        The summary texts in generation order: the initial summary followed
        by one entry per rewrite (``summary_steps + 1`` items in total).
    """
    # Step 0: the verbose, entity-sparse starting point.
    initial_summary: InitialSummary = client.chat.completions.create(
        model=model,
        response_model=InitialSummary,
        messages=[
            {
                "role": "system",
                "content": "Write a summary about the article that is long (4-5 sentences) yet highly non-specific. Use overly, verbose language and fillers(eg.,'this article discusses') to reach ~80 words",
            },
            {"role": "user", "content": f"Here is the Article: {article}"},
            {
                "role": "user",
                "content": "The generated summary should be about 80 words.",
            },
        ],
        max_retries=4,
    )
    summary_chain = [initial_summary.summary]

    # Holds the previous RewrittenSummary; None until the first rewrite exists.
    prev_summary = None
    for _ in range(summary_steps):
        # Only ask for specific entities when the previous rewrite actually
        # reported some; otherwise the message would end in an empty list.
        missing_entity_message = []
        if prev_summary is not None and prev_summary.missing:
            missing_entity_message.append(
                {
                    "role": "user",
                    "content": f"Please include these Missing Entities: {','.join(prev_summary.missing)}",
                }
            )

        new_summary: RewrittenSummary = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": """
                You are going to generate an increasingly concise,entity-dense summary of the following article.

                Perform the following two tasks
                - Identify 1-3 informative entities from the following article which is missing from the previous summary
                - Write a new denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities

                Guidelines
                - Make every word count: re-write the previous summary to improve flow and make space for additional entities
                - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
                - The summaries should become highly dense and concise yet self-contained, e.g., easily understood without the Article.
                - Missing entities can appear anywhere in the new summary
                - Never drop entities from the previous summary. If space cannot be made, add fewer new entities.
                """,
                },
                {"role": "user", "content": f"Here is the Article: {article}"},
                {
                    "role": "user",
                    "content": f"Here is the previous summary: {summary_chain[-1]}",
                },
                *missing_entity_message,
            ],
            max_retries=4,
            max_tokens=1000,
            response_model=RewrittenSummary,
        )
        summary_chain.append(new_summary.summary)
        prev_summary = new_summary

    return summary_chain

In [18]:
# Load the dataset from a path relative to the current working directory.
df = pd.read_parquet('./data/forward_citations.parquet')

In [19]:
# Use the 'Complete Text' value of the first row as the article to summarize.
text = df['Complete Text'].head(1).tolist()[0]

In [20]:
# Generate the summary chain for the article with the default number of
# rewrite steps; this goes through the OpenAI-backed `client`.
summaries = summarize_article(text)

In [21]:
# Report token count, entity count and entity density for each summary in the
# chain, numbered from 1 in generation order.
for index, summary in enumerate(summaries):
    tokens, entity, density = calculate_entity_density(summary)
    print(
        f"Article {index+1} -> Results "
        f"(Tokens: {tokens}, Entity Count: {entity}, Density: {density})"
    )

Article 1 -> Results (Tokens: 78, Entity Count: 2, Density: 0.026)
Article 2 -> Results (Tokens: 87, Entity Count: 10, Density: 0.115)
Article 3 -> Results (Tokens: 151, Entity Count: 15, Density: 0.099)
Article 4 -> Results (Tokens: 129, Entity Count: 12, Density: 0.093)


In [24]:
# Render every summary in the chain as Markdown, in generation order; the
# surrounding newlines are part of the text handed to Markdown.
for summary in summaries:
    display(Markdown(f"\n{summary}\n"))


In an exposition marked by an interstitial navigational framework, the article meticulously articulates the proceedings surrounding a legal dispute entailing the Recording Industry Association of America, Inc. and Verizon Internet Services, Inc., contesting the use of certain legislative mandates. This discourse delineates the intricacies accompanying the issuance of subpoenas within the context of internet-enabled copyright infringements, culminating in a judicial pronouncement that reverberates through the substratum of digital information exchange jurisprudence.



The article examines a U.S. Court of Appeals case, RIAA v. Verizon Internet Services, where the RIAA's subpoenas for user information from Verizon under the Digital Millennium Copyright Act's (DMCA) provision were disputed. The Court decided the DMCA does not permit subpoenas to an ISP simply acting as a conduit for user communications, thus Verizon was not obliged to reveal its subscribers' identities. The decision has broader implications for copyright law enforcement in the digital age.



The U.S. Court of Appeals reviewed RIAA v. Verizon Internet Services, adjudicating whether the RIAA could subpoena Verizon for customer data under DMCA's 17 U.S.C. § 512(h) relating to online copyright infringement. The Court held that DMCA's provision does not authorize subpoenas to ISPs like Verizon acting merely as conduits, thus negating Verizon's obligation to reveal subscriber identities. This interpretation is reinforced by the statute's structure and definitions in Section 512(k)(1)(B), and the scope of safe harbor provisions. The adjudication took into account rules such as Federal Rule of Civil Procedure 45(c)(2)(B), implicating constitutional considerations for anonymous speech and association. The verdict shapes the enforcement of copyright law in the context of peer-to-peer file-sharing and internet privacy.



The U.S. Court of Appeals in RIAA v. Verizon Internet Services examined the DMCA's subpoena power under 17 U.S.C. § 512(h) and concluded ISPs acting as mere conduits like Verizon aren't obliged to disclose user identities. The Court's decision relied on the statutory construction of Section 512, specific limitations within Section 512(k)(1)(B), and the parameters of safe harbor provisions. Additionally, the ruling took into account Federal Rule of Civil Procedure 45(c)(2)(B), with implications for constitutional rights to anonymous speech and association. The outcome affects copyright law application and internet user privacy in the scenario of peer-to-peer file-sharing.
