In [1]:
import pandas as pd

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('paragraphs.csv')

# Drop empty rows from the DataFrame
df = df.dropna()

# Print the resulting DataFrame
print(df)

                                                      0
0                                         FIRE  & BLOOD
2                                   GEORGE R. R. MARTIN
25                                                  -�\
26                                       Banta1n Boc)ks
27                                             New York
...                                                 ...
4368                             \nPenguin Ranaom House
4370                  What’s next on your reading list?
4371                     Discover your next great read!
4374  Get personalized book picks and up-to-date new...
4375                                       Sign up now.

[3186 rows x 1 columns]


In [2]:
df = df.rename(columns={'0': 'text'})
df.sample(5)

Unnamed: 0,text
560,Maegor Targaryen and Tyanna of the Tower were ...
3793,Gedmund Great-Axe (who had been so seasick dur...
1561,“I thought that was the most hideous thing tha...
2381,"After the loss of his fingers, Viserys I never..."
3412,The two armies came together two days from the...


In [3]:
# Define a regular expression to match URLs
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Define a regular expression to match symbols
symbol_regex = r'[^a-zA-Z0-9\s]'

# Remove URLs and symbols from the "text" column
df['clean'] = df['text'].str.replace(url_regex, '')
df['clean_tex'] = df['clean'].str.replace(symbol_regex, '')

  df['clean'] = df['text'].str.replace(url_regex, '')
  df['clean_tex'] = df['clean'].str.replace(symbol_regex, '')


In [4]:
df

Unnamed: 0,text,clean,clean_tex
0,FIRE & BLOOD,FIRE & BLOOD,FIRE BLOOD
2,GEORGE R. R. MARTIN,GEORGE R. R. MARTIN,GEORGE R R MARTIN
25,-�\,-�\,
26,Banta1n Boc)ks,Banta1n Boc)ks,Banta1n Bocks
27,New York,New York,New York
...,...,...,...
4368,\nPenguin Ranaom House,\nPenguin Ranaom House,\nPenguin Ranaom House
4370,What’s next on your reading list?,What’s next on your reading list?,Whats next on your reading list
4371,Discover your next great read!,Discover your next great read!,Discover your next great read
4374,Get personalized book picks and up-to-date new...,Get personalized book picks and up-to-date new...,Get personalized book picks and uptodate news ...


In [5]:
import nltk

from nltk.corpus import PlaintextCorpusReader

from nltk.tokenize import sent_tokenize



In [6]:
relations = []

for text in df['clean_tex']:

    sentences = sent_tokenize(text)

    for sentence in sentences:

        # Identify named entities using the pre-trained machine learning model

        ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))



        # Extract entity relations

        for subtree in ne_tree.subtrees():

            if subtree.label() in ['ORGANIZATION', 'PERSON', 'GPE']:

                entities = [leaf[0] for leaf in subtree.leaves()]

                if len(entities) > 1:

                    # Get the label of the subtree

                    label = subtree.label()

                    relations.append((text, label, tuple(entities)))

In [8]:
# Convert the relations list to a DataFrame
relations_df = pd.DataFrame(relations)

# Print the resulting DataFrame
print(relations_df)


                                                      0             1  \
0                                         Banta1n Bocks  ORGANIZATION   
1                                              New York           GPE   
2     Copyright  2018 by George R R Martin Illustrat...        PERSON   
3     Copyright  2018 by George R R Martin Illustrat...        PERSON   
4     Copyright  2018 by George R R Martin Illustrat...        PERSON   
...                                                 ...           ...   
7736  DOUG WHEATLEY is a comic book artist concept d...        PERSON   
7737  DOUG WHEATLEY is a comic book artist concept d...  ORGANIZATION   
7738  DOUG WHEATLEY is a comic book artist concept d...        PERSON   
7739  DOUG WHEATLEY is a comic book artist concept d...  ORGANIZATION   
7740                             \nPenguin Ranaom House        PERSON   

                             2  
0             (Banta1n, Bocks)  
1                  (New, York)  
2                  (Geor

In [9]:
relations_df.sample(5)

Unnamed: 0,0,1,2
7076,In this as in the matter of Aegons betrothal t...,PERSON,"(Lady, Baela)"
3815,The cruelty of children is known to all Prince...,ORGANIZATION,"(Princess, Helaena)"
1195,The royal clemency did not extend to all Maego...,PERSON,"(Her, Grace)"
3991,Lord Larys Strong master of whisperers then sp...,PERSON,"(Larys, Strong)"
280,In the Bite the lords of the Three Sisters had...,PERSON,"(Queen, Visenya)"


In [15]:
relations_df[0][7076]

'In this as in the matter of Aegons betrothal to Myrielle Peake Lord Unwin found himself overruled by the other regents Over his strenuous objections King Aegon and Queen Daenera descended from the castle in their litter accompanied by Lady Baela and her newborn daughter her sister Lady Rhaena with her lord husband Corwyn Corbray Grand Maester Munkun Septon Bernard the regents Manfryd Mooton and Thaddeus Rowan the knights of the Kingsguard and many other notables eager to meet Lady Baela at the docks'

In [17]:
relations_df = relations_df.rename(columns={0: 'context',1:'label',2:'entity'})
relations_df.sample(5)

Unnamed: 0,context,label,entity
5008,All told the forces gathered under Queen Rhaen...,PERSON,"(Tumbleton, Little)"
877,Thousands fled Oldtown that night streaming fr...,PERSON,"(Starry, Sept)"
7009,Lord Dalton Greyjoy did indeed possess twoandt...,ORGANIZATION,"(Casterly, Rock)"
5612,Yet neither Waters nor any of the other knight...,PERSON,"(Rhaenyra, Targaryen)"
2186,weak about him nothing indecisive as his siste...,PERSON,"(Yi, Ti)"


In [24]:
relations_df['entity'][5008]


('Tumbleton', 'Little')

In [51]:
def question_generation(row):
    entities = []
    #for index, row in ner_content.iterrows():
        #entity_type = row['label']
        #entity_values = (row['entity'])
    entities.append((row.iloc[-2], row.iloc[-1]))

    
    # Parse entities from the content
    #entities = [eval(line.strip()) for line in ner_content.strip().split('\n')]

    # Organize entities into a dictionary
    entity_dict = {}
    for entity_type, entity_value in entities:
        if entity_type not in entity_dict:
            entity_dict[entity_type] = []
        entity_dict[entity_type].append(" ".join(entity_value))
        # Define question templates for different entity types
    question_templates = {
    

        "ORG": [
            "What is the history and significance of {}?",
            "What is the purpose of {} in the story?",
            "Can you tell me more about the organization {}?",
            "What role does {} play in the story?"
        ],

        "PERSON": [
            "Who is {} and what is their role in the story?",
            "What is {} known for in the story?",
            "Can you tell me more about the character {}?",
            "What motivates {} in the story?"
        ],
        "DATE": [
            "What significant events happened on {}?",
            "What happened during the time period of {}?",
            "How does {} impact the story?"
        ],
        "GPE": [
            "What is the significance of {} in the story?",
            "What role does {} play in the story?",
            "What cultural aspects of {} are present in the story?",
            "What impact does {} have on the story?"
        ],
        "CARDINAL": [
            "How many {} are mentioned in the story?",
            "What significance does the number {} have in the story?",
            "What is the importance of {} in the story?"
        ],
        "ORDINAL": [
            "What is the significance of the {} event in the story?",
            "What happens in the story during the {} event?",
            "What impact does the {} event have on the story?"
        ],
        "PRODUCT": [
            "What is the importance of {} in the story?",
            "What role does {} play in the story?",
            "What impact does {} have on the story?"
        ]
    }
    # Generate questions based on the question templates and the extracted entities
    generated_questions = []
    for entity_type, entity_values in entity_dict.items():
        for entity_value in entity_values:
            if entity_type in question_templates:
                for template in question_templates[entity_type]:
                    question = template.format(entity_value)
                    generated_questions.append(question)
    return generated_questions
    

In [48]:
q=question_generation(relations_df['label'][20],relations_df['entity'][20]) 

In [49]:
q

['Who is George R and what is their role in the story?',
 'What is George R known for in the story?',
 'Can you tell me more about the character George R?',
 'What motivates George R in the story?']

In [52]:
# Apply the function to each row of the DataFrame and save the resulting values in a new column
relations_df['questions'] = relations_df.iloc[:, -2:].apply(question_generation, axis=1)

In [56]:
relations_df.sample(1)

Unnamed: 0,context,label,entity,questions
1734,Rhaena Targaryen was the blood of the dragon a...,PERSON,"(Casterly, Rock)",[Who is Casterly Rock and what is their role i...


In [50]:
relations_df['context'][20]

'The Princess and the Queen published in Dangerous Women edited by George R R Martin and Gardner Dozois copyright  2013 by George R R Martin and Gardner Dozois'

In [57]:
pd.set_option('display.max_colwidth', None)

In [60]:
relations_df.sample(1)

Unnamed: 0,context,label,entity,questions
2933,When it was done Lord Rogar seemed weary It seems I will not die with axe in hand after all he told the king sadly Nor did he Rogar Baratheon Lord of Storms End and onetime Hand of the King and Lord Protector of the Realm died at Storms End half a year later in the presence of his maester his septon his brother Ser Garon and his son and heir Boremund,PERSON,"(Lord, Rogar)","[Who is Lord Rogar and what is their role in the story?, What is Lord Rogar known for in the story?, Can you tell me more about the character Lord Rogar?, What motivates Lord Rogar in the story?]"


In [61]:
relations_df.to_csv("data.csv", index=False)