In [2]:
#library for pdf files only
import PyPDF2
#library for docx files only
import docx
# importing re module
import re

In [3]:
# Function to extract text from a DOCX file
def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs])

In [4]:
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        return "\n".join([page.extract_text() for page in reader.pages])

In [5]:
# Function to clean the extracted text but keep commas and full stops
def clean_text(text):
    # Remove special characters except commas and full stops, and reduce extra spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    text = re.sub(r'[^\w\s,.]', '', text)  # Keep only alphanumeric characters, commas, and full stops
    return text.strip().lower()  # Convert to lowercase and strip leading/trailing spaces

In [6]:
# Function to handle both file types
def extract_and_clean_text(file_path):
    if file_path.endswith('.docx'):
        extracted_text = extract_text_from_docx(file_path)
    elif file_path.endswith('.pdf'):
        extracted_text = extract_text_from_pdf(file_path)
    else:
        return "Unsupported file format."
    
    # Clean the extracted text
    return clean_text(extracted_text)

In [7]:
# Example usage
file_path =  'C:\\Users\\farhan\\Downloads\\AI AVATAR- MOVIE SCRIPT DATASET PDF\\AI AVATAR- MOVIE SCRIPT DATASET PDF\\AI AVATAR PROJECT MOVIE SCRIPT PDFS\\batman-begins-2005.pdf'
data = extract_and_clean_text(file_path)
print(data)

batman begins by david goyer black. a low keening which becomes screeching that builds and builds until red flickers through black as the screen bursts into life clouds of reeling bats silhouetted against a blood red sky, bolting away from camera, massing in the sky... forming a density the shape of an enormous batlike symbol. more bats mass, swamping the symbol, darkening the screen to black. distant childrens laughter which comes closer as sunlight flickers through black. sunlight through trees running through a summer garden. a boy. chasing a girl. the boy reaches a victorian greenhouse. stands in the doorway catching his breath. this is bruce wayne, aged 8, and we are 1 ext. gardens, wayne manor  day 1 young bruce peers down rows of plants on long trestle tables. young bruce rachel no response. sunlight streams through wrought iron and glass. young bruce advances, cautious. he is grabbed from behind and pulled under a table by a young girl, aged 10. this is rachel. she puts her han

Task 1. Entity Recognition (NER: 

Identify key entities such as characters, locations, dates, organizations, etc., from the movie stories.

In [8]:
import re
import nltk
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize

In [9]:
# Download NLTK resources if needed (run this once)
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farhan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\farhan\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\farhan\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\farhan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\farhan\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


True

In [10]:
# Function to extract person names using NER
def extract_person_names(data):
    # Tokenize the text and tag parts of speech
    words = word_tokenize(data)
    tagged = pos_tag(words)
    
    # Perform Named Entity Recognition to identify person names
    named_entities = ne_chunk(tagged, binary=False)
    
    # Extract person names from named entities
    person_names = set()
    for chunk in named_entities:
        if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
            name = ' '.join(c[0] for c in chunk)  # Join multi-word names
            person_names.add(name)
    
    return person_names

In [11]:
# Function to extract locations, dates, and organizations
def extract_locations_dates_orgs(data):
    # Define regex patterns for locations, dates, and organizations
    location_pattern = re.compile(r"\b(Castle|Village|Cottage|Woods|France|Villeneuve)\b", re.IGNORECASE)
    date_pattern = re.compile(r"\b(\d{1,2}(?:st|nd|rd|th)? \w+ \d{4}|\d{4})\b")  # Dates like "18th century" or specific years
    org_pattern = re.compile(r"\b(Company|Corporation|Guild|Society|Royal Guard|Army|Institute|Organization|Association)\b", re.IGNORECASE)

    # Find all locations, dates, and organizations
    locations = set(location_pattern.findall(data))
    dates = set(date_pattern.findall(data))
    organizations = set(org_pattern.findall(data))

    return {
        'locations': locations,
        'dates': dates,
        'organizations': organizations
    }

In [12]:
# Function to summarize extracted entities
def summarize_entities(person_names, entities):
    characters = ', '.join(person_names)
    locations = ', '.join(entities['locations'])
    dates = ', '.join(entities['dates'])
    organizations = ', '.join(entities['organizations'])
    
    summary = f"The story revolves around {characters} in {locations}. The events take place in {dates}. Key organizations include {organizations}."
    return summary

In [13]:
# Extract person names (characters)
person_names = extract_person_names(data)

In [14]:
# Extract locations, dates, and organizations
entities = extract_locations_dates_orgs(data)

In [15]:
# Generate summary
summary = summarize_entities(person_names, entities)
print(summary)

The story revolves around mr.wayne, mr.fox in . The events take place in . Key organizations include corporation, army, society, company.


Task 2. Dependency Parsing:

Analyze the structure of sentences to understand the subject, verb, and object relationships.

In [16]:
import spacy

In [17]:
# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

In [18]:
def simplify_sentence(sentence):
    doc = nlp(sentence)
    simple_sentences = []
    
    # Split sentence based on main subject-verb-object clauses
    for token in doc:
        if token.dep_ == "ROOT":
            root_verb = token
            subject = [w for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
            objects = [w for w in token.rights if w.dep_ in ("dobj", "attr", "prep", "pobj")]
            
            # Construct simple sentences with subject-verb-object
            if subject and objects:
                subject_text = ' '.join([w.text for w in subject])
                object_text = ' '.join([w.text for w in objects])
                simple_sentences.append(f"{subject_text} {root_verb.text} {object_text}.")
            elif subject:
                subject_text = ' '.join([w.text for w in subject])
                simple_sentences.append(f"{subject_text} {root_verb.text}.")
            else:
                simple_sentences.append(f"{root_verb.text}.")
    
    return simple_sentences


In [19]:
# Function to break down complex text into simpler sentences
def simplify_text(data):
    doc = nlp(data)
    simple_sentences = []
    
    # Process each sentence individually
    for sentence in doc.sents:
        simplified = simplify_sentence(sentence.text)
        simple_sentences.extend(simplified)
    
    return ' '.join(simple_sentences)


In [20]:

# Simplify the text
simplified_text = simplify_text(data)
print(simplified_text)

batman begins by. keening screeching. bats mass. laughter. sunlight. boy. chasing. boy reaches greenhouse. stands. this is wayne. gardens bruce peers. rachel. sunlight streams through. advances. he grabbed from. this is rachel. she puts hand over. bruce strides past. whispering. they pay lot. stares wideeyed. she smiles. he relaxes. bolts. tears. ext. disused. i see you. suppresses. bruce drops lands. ext. garden manor runs to. bruce. old bruce lifts head from. ext. garden. mister alfred int. bruce groans. he hears. bats explode from. he screamscurls against. jolt flick. darkness. eyes belong to. wayne aged. man sits. man wayne nightmare. man cell is box. light seeps through in. shouts. shrugs. courtyard line for. prisoners scattered in. eyes. man. i fought them. man. they kill you. wayne holds plate. gruel dribbled onto. wayne they kill me. path blocked. man smashes plate. he punches wayne. man contd. wayne picks himself. dust. continued continued. you re devil. boots. you re practice

Task 3. Summarization:

Extract key points or sentences from the movie story for concise narration


In [21]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [22]:
def sumy_summarize_text(data):
    parser = PlaintextParser.from_string(data, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    
    # Generate a summary of 5 sentences (you can adjust this number)
    summary = summarizer(parser.document, 5)
    
    # Combine the summary sentences into a single string
    summary_sentences = [str(sentence) for sentence in summary]
    return " ".join(summary_sentences)

In [23]:
# Summarize the movie plot
summary = sumy_summarize_text(data)
print("Summary of the plot (Sumy):\n", summary)

Summary of the plot (Sumy):
 peers inside... bats explode from the box, filling the air wayne dives away from the box, staring up at the squawking bats flinching... ducard leaps at wayne, who rolls sideways, blocking wayne turns to face ducard, but he is lost in the ninjas, bats filling the air, wayne flinching with their attacks... wayne stays low, slashes the arm of the ninja nearest him the man does not move. wayne strikes ducard in the head with the butt of his sword, douses his mask in the liquid fire and tosses it back into the mezzanine where the explosive powders are stored... ras leaps from his throne, striking at wayne with his sword wayne parries explosions roar from the balcony, shooting flame across the ceiling ras and wayne fight as explosions surround them... wayne leaps clear as flaming debris collapses onto ras, crushing him... the flames are rising, ninja bodies are strewn around, fresh explosions rip across the hall as ninjas flee and wayne spots ducard lying unconsc

4. Topic Modeling:

Identify the main themes or topics present in the story using techniques like keyword extraction.

Expected Output: Themes or keywords associated with different sections of the story.

In [24]:
import spacy
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from pprint import pprint

In [25]:
# Download NLTK stopwords if you haven't already
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farhan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

In [27]:
# Preprocessing the text (Tokenize, remove stopwords, and lemmatize)
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    doc = nlp(text.lower())  # Convert to lowercase and tokenize
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    return tokens

In [28]:
# Preprocess the input text
tokens = preprocess_text(data)

In [29]:
# Create a dictionary and corpus for LDA
dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(tokens)]

In [30]:
# Train the LDA model (you can adjust the number of topics)
lda_model = gensim.models.LdaModel(corpus=corpus, num_topics=3, id2word=dictionary, passes=10)

In [31]:
# Function to format topics for better readability
def format_topics(lda_model):
    topics = lda_model.print_topics(num_words=5)
    formatted_topics = []
    
    for topic_num, topic in topics:
        # Extract the keywords from the topic string
        keywords = [word.split('*')[1].replace('"', '').strip() for word in topic.split('+')]
        topic_description = f"Topic {topic_num + 1}: This section of the story focuses on themes related to {', '.join(keywords)}."
        formatted_topics.append(topic_description)
    
    return formatted_topics

In [32]:
# Get and print the formatted topics
formatted_topics = format_topics(lda_model)
for topic in formatted_topics:
    print(topic)

Topic 1: This section of the story focuses on themes related to wayne, , continuous, bruce, gordon.
Topic 2: This section of the story focuses on themes related to wayne, , bruce, rachel, int.
Topic 3: This section of the story focuses on themes related to wayne, , rachel, bruce, batman.


5. Sentiment Analysis:

Gauge the emotions or sentiment expressed in different parts of the movie story.

Expected Output: Annotated text sections indicating positive, negative, or neutral sentiments.

Example: “John was happy to reunite with his sister. However, the tragedy of war loomed over them.”


In [33]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [34]:
# Load the small English model for spaCy
nlp = spacy.load('en_core_web_sm')

In [35]:
# Function to perform sentiment analysis using VADER
def sentiment_analysis(text):
    analyzer = SentimentIntensityAnalyzer()
    doc = nlp(data)  # Process text with spaCy
    results = []

    # Analyze sentiment sentence by sentence
    for sentence in doc.sents:
        sentiment = analyzer.polarity_scores(sentence.text)
        if sentiment['compound'] >= 0.05:
            label = 'Positive'
        elif sentiment['compound'] <= -0.05:
            label = 'Negative'
        else:
            label = 'Neutral'
        
        # Store the sentence with its sentiment label
        results.append((sentence.text, label))
    
    return results

In [36]:
# Analyze sentiment in the movie story
sentiments = sentiment_analysis(data)

In [37]:
# Print annotated text with sentiment labels
print("Sentiment Analysis of the story:")
for sentence, sentiment in sentiments:
    print(f"{sentence} --> {sentiment}")

Sentiment Analysis of the story:
batman begins by david goyer black. --> Neutral
a low keening which becomes screeching that builds and builds until red flickers through black as the screen bursts into life clouds of reeling bats silhouetted against a blood red sky, bolting away from camera, massing in the sky... forming a density the shape of an enormous batlike symbol. --> Negative
more bats mass, swamping the symbol, darkening the screen to black. --> Neutral
distant childrens laughter which comes closer as sunlight flickers through black. --> Positive
sunlight through trees running through a summer garden. --> Neutral
a boy. --> Neutral
chasing a girl. --> Neutral
the boy reaches a victorian greenhouse. --> Positive
stands in the doorway catching his breath. --> Neutral
this is bruce wayne, aged 8, and we are 1 ext. --> Neutral
gardens, wayne manor  day 1 young bruce peers down rows of plants on long trestle tables. --> Neutral
young bruce rachel no response. --> Negative
sunlight 

6. Character/Plot Relationship Analysis:

Identify the relationships between characters and how they interact throughout the story.

Expected Output: Visual or text representation of character interactions and dynamics.

Example: “John and Sara share a complex relationship, moving from rivalry to friendship


In [45]:
import spacy
from collections import defaultdict

In [46]:
# Load spaCy's English model with dependency parsing capabilities
nlp = spacy.load('en_core_web_sm')


In [47]:
# Function to extract character names (PERSON entities) using spaCy's NER
def extract_characters(data):
    doc = nlp(data)
    characters = set()

    # Find all PERSON entities (assumed to be characters)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            characters.add(ent.text)

    return list(characters)

In [51]:
# Function to find character interactions based on subject-verb-object relationships
def character_relationships(data, characters):
    doc = nlp(data)
    relationships = defaultdict(list)

    # Iterate over each sentence and check for subject-verb-object involving characters
    for sentence in doc.sents:
        subject = None
        verb = None
        obj = None
        
        for token in sentence:
            # If the token is a character and it's a subject (nsubj), set it as the subject
            if token.dep_ == "nsubj" and token.text in characters:
                subject = token.text

            # If the token is a verb, set it as the action
            if token.pos_ == "VERB":
                verb = token.lemma_

            # If the token is a character and it's an object (dobj), set it as the object
            if token.dep_ == "dobj" and token.text in characters:
                obj = token.text
        
        # If both subject and object are characters, we have a relationship
        if subject and verb and obj:
            relationships[(subject, obj)].append(verb)

    return relationships

In [52]:
# Function to print character relationships with inferred actions
def print_relationships(relationships):
    for (subject, obj), actions in relationships.items():
        unique_actions = set(actions)  # Get unique actions for clarity
        action_description = ', '.join(unique_actions)
        print(f"{subject} and {obj} have the following interactions: {action_description}")

In [59]:
# Step 1: Extract character names
characters = extract_characters(data)

In [60]:
# Step 2: Analyze character relationships (based on subject-verb-object structure)
relationships = character_relationships(data, characters)

In [61]:
# Step 3: Print the relationships between characters with inferred actions
print("\nCharacter Relationships and Interactions:")
print_relationships(relationships)


Character Relationships and Interactions:
wayne and wayne have the following interactions: grab, s
bruce and alfred have the following interactions: alfre
alfred and bruce have the following interactions: look
chill and rachel have the following interactions: share
wayne and wall have the following interactions: reveal
wayne and rachel have the following interactions: watch, remember, walk
earle and wayne have the following interactions: stare, walk
gotham and wayne have the following interactions: watch
wayne and earle have the following interactions: approach
wayne and mr.earle have the following interactions: buy
bruce and cloak have the following interactions: drop
wayne and gordon have the following interactions: have
rachel and bruce have the following interactions: say
corridor and rachel have the following interactions: billow
gordon and rachel have the following interactions: see, carry
corridor and cloak have the following interactions: open
roof and wall have the following 

7. Coreference Resolution:

Resolve pronouns to ensure consistent narration, making the text easier to follow.
Expected Output: More readable text by replacing pronouns with the appropriate nouns.

Example: Instead of “He went there,” it becomes “John went to the market.”


In [62]:
import spacy

In [63]:
# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

In [64]:
# Function to resolve pronouns manually based on the preceding entities
def resolve_coreferences(data):
    doc = nlp(data)
    resolved_text = data

    # Store the last seen named entity (assumed to be a character)
    last_person = None

    # Iterate over tokens and replace pronouns with the last seen character
    for token in doc:
        if token.ent_type_ == "PERSON":
            last_person = token.text  # Update the last seen character
        elif token.pos_ == "PRON" and token.text.lower() in ["he", "she", "him", "her"]:
            if last_person:
                resolved_text = resolved_text.replace(token.text, last_person)

    return resolved_text

In [65]:
# Resolve coreferences in the sample script
resolved_text = resolve_coreferences(data)

In [66]:
print("\nResolved Text:")
print(resolved_text)


Resolved Text:
batman begins by david goyer black. a low keening which becomes screeching that builds and builds until red flickers through black as tadvances screen bursts into life clouds of reeling bats silhouetted against a blood red sky, bolting away from camera, massing in tadvances sky... forming a density tadvances shape of an enormous batlike symbol. more bats mass, swamping tadvances symbol, darkening tadvances screen to black. distant childrens laughter which comes closer as sunlight flickers through black. sunlight through trees running through a summer garden. a boy. chasing a girl. tadvances boy reacadvancess a victorian greenhouse. stands in tadvances doorway catching his breath. this is bruce wayne, aged 8, and we are 1 ext. gardens, wayne manor  day 1 young bruce peers down rows of plants on long trestle tables. young bruce racadvancesl no response. sunlight streams through wrought iron and glass. young bruce advances, cautious. advances is grabbed from behind and pul