In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

In [None]:
# Fetch the NLTK corpora used by the preprocessing step: 'wordnet' backs
# WordNetLemmatizer and 'stopwords' backs stopwords.words("english").
# nltk is already imported in the imports cell at the top of the notebook,
# so it is not imported a second time here.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# CSS class string of the element that wraps the post body on the scraped site.
POST_CONTENT_CLASS = 'elementor-element elementor-element-74af9a5b elementor-widget elementor-widget-theme-post-content'


def url_script(url, content_class=POST_CONTENT_CLASS, timeout=30):
    """Download a page and return the text of every <p> inside its content container.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    content_class : str, optional
        Value passed to ``soup.find(class_=...)`` to locate the container
        whose paragraphs are extracted. Defaults to the class string that
        was previously hard-coded.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        The ``.text`` of each ``<p>`` found inside the container, in
        document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If no element with ``content_class`` exists on the page.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    container = soup.find(class_=content_class)
    if container is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError(f"No element with class {content_class!r} found at {url}")

    text = [p.text for p in container.find_all('p')]
    print(url)
    return text

In [None]:
# Scrape the article's paragraphs into a list of strings (url_script also prints the URL it fetched).
data_text= url_script('https://scrapsfromtheloft.com/2020/08/23/umberto-eco-narrative-structure-ian-fleming/')

https://scrapsfromtheloft.com/2020/08/23/umberto-eco-narrative-structure-ian-fleming/


In [None]:
# Show the third scraped paragraph (index 2) as a quick check of the scrape.
print(data_text[2])

In 1953 Ian Fleming published the first novel in the 007 series, Casino Royale. Being a first work, it is subject to the then current literary influence, and in the fifties, which had abandoned the traditional detective whodunit trail in favour of violent action, it was impossible to ignore the presence of Spillane.


In [None]:
# Start from an empty DataFrame; the 'raw-data' column is attached in the next cell.
df_scrap= pd.DataFrame()

In [None]:
# One row per scraped paragraph, stored under the 'raw-data' column.
df_scrap['raw-data']= np.array(data_text)

In [None]:
# Preview the first five raw paragraphs.
df_scrap.head()

Unnamed: 0,raw-data
0,"In the following excerpt, originally published..."
1,by Umberto Eco
2,In 1953 Ian Fleming published the first novel ...
3,"To Spillane, Casino Royale owes, beyond doubt,..."
4,In the second place Bond is obsessed by an ima...


In [None]:
# Persist the raw paragraphs; index=True also writes the row index as the first CSV column.
df_scrap.to_csv("scrap_raw_data.csv", index=True)

# Preprocessing

In [None]:
# Number of paragraphs returned by the scrape.
len(data_text)

124

In [None]:
def _clean_document(text, lemmatizer, stop_words):
    """Normalise one paragraph into a space-separated string of lemmatized tokens.

    Parameters
    ----------
    text : object
        Raw paragraph; it is converted with ``str()`` before cleaning.
    lemmatizer : object
        Anything exposing ``lemmatize(word) -> str`` (here a WordNetLemmatizer).
    stop_words : collection of str
        Lower-case words to drop; a set/frozenset gives constant-time lookups.

    Returns
    -------
    str
        Lower-cased, lemmatized text with punctuation, isolated single
        letters and stop words removed.
    """
    # Replace every non-word character (punctuation, quotes, symbols) with a space.
    doc = re.sub(r'\W', ' ', str(text))

    # \W leaves the underscore in place, so strip remaining ASCII punctuation.
    # The escaped punctuation must sit inside a character class; without the
    # brackets the pattern only matches the whole punctuation string verbatim.
    doc = re.sub('[%s]' % re.escape(string.punctuation), '', doc)

    # Remove single letters that are surrounded by whitespace.
    doc = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc)

    # Remove a single letter at the very start ('^' is an anchor here; the
    # previous '\^' matched a literal caret and therefore never fired).
    doc = re.sub(r'^[a-zA-Z]\s+', ' ', doc)

    # Collapse runs of whitespace into one space.
    doc = re.sub(r'\s+', ' ', doc)

    # Drop a leading 'b' token (left over when text comes from a bytes repr).
    doc = re.sub(r'^b\s+', '', doc)

    doc = doc.lower()

    tokens = []
    for word in doc.split():
        # Filter stop words BEFORE lemmatizing: the lemmatizer turns e.g.
        # "was" into "wa" and "has" into "ha", which would otherwise slip
        # past the stop-word list.
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Filter again afterwards so words whose lemma is a stop word are
        # still removed, as they were before.
        if lemma not in stop_words:
            tokens.append(lemma)
    return ' '.join(tokens)


# NOTE: despite the name this is a lemmatizer, not a stemmer; the name is kept
# so any later cell that refers to `stemmer` keeps working.
stemmer = WordNetLemmatizer()

# Load the stop-word list once; calling stopwords.words() per word re-reads
# the corpus for every token.
english_stop_words = frozenset(stopwords.words("english"))

documents = [
    _clean_document(paragraph, stemmer, english_stop_words)
    for paragraph in data_text
]


In [None]:
# Inspect the cleaned version of the first paragraph.
documents[0]

'following excerpt originally published italian 1965 eco offer detailed examination narrative formula fleming employed bond novel strategy eco regard basis success 007 saga'

In [None]:
# Put each raw paragraph next to its cleaned counterpart, one row per paragraph.
df = pd.DataFrame({'original': data_text, "Document": documents})

In [None]:
# Save the raw/cleaned pairs; index=False leaves the row index out of the file.
df.to_csv("clean_data.csv", index=False)

In [None]:
# Reload the saved pairs. keep_default_na=False keeps every cell as text:
# with the default NA parsing, a paragraph whose cleaned form is an empty
# string (or a literal token such as "NA"/"null") would come back as float
# NaN and break the string/regex operations in the cells below.
df = pd.read_csv('clean_data.csv', keep_default_na=False)

In [None]:
# Preview the reloaded frame: raw paragraph in 'original', cleaned text in 'Document'.
df.head()

Unnamed: 0,original,Document
0,"In the following excerpt, originally published...",following excerpt originally published italian...
1,by Umberto Eco,umberto eco
2,In 1953 Ian Fleming published the first novel ...,1953 ian fleming published first novel 007 ser...
3,"To Spillane, Casino Royale owes, beyond doubt,...",spillane casino royale owes beyond doubt least...
4,In the second place Bond is obsessed by an ima...,second place bond obsessed image japanese expe...


In [None]:
# Replace only the first two white-space characters with "//".
# The pattern is a raw string so the backslash reaches the regex engine
# without an invalid-escape warning, and count is passed by keyword because
# passing it positionally is deprecated in newer Python versions.
x = re.sub(r"\s", "//", df["Document"][0], count=2)
print(x)

following//excerpt//originally published italian 1965 eco offer detailed examination narrative formula fleming employed bond novel strategy eco regard basis success 007 saga


In [None]:
# Minimal re.search demo on a fixed sentence: locate the first "ai".
txt = "The rain in Spain"
x = re.compile("ai").search(txt)
print(x)  # shows the Match object itself (span and matched text), not a plain string

<re.Match object; span=(5, 7), match='ai'>


In [None]:
# Check whether the text contains "su" followed by exactly two "c" characters.
x = re.findall("suc{2}", df["Document"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")


['succ']
Yes, there is at least one match!


In [None]:
# Check whether the text contains any of "bond", "saga" or "007" (| means "or").
x = re.findall("bond|saga|007", df["Document"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")

['bond', '007', 'saga']
Yes, there is at least one match!


In [None]:
# Check whether the text contains "su" followed by one or more "c" characters.
# (Swapping + for *, i.e. "suc*", would accept zero or more "c" characters instead.)
x = re.findall("suc+", df["Document"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")

['succ']
Yes, there is at least one match!


In [None]:
# Check whether the text ends with "saga" ($ anchors the pattern at the end).
x = re.findall("saga$", df["Document"][0])

# The f-string embeds the list of matches, so the message shows ['saga'].
print(f"Yes, the string ends with {x}" if x else "No match")

Yes, the string ends with ['saga']


In [None]:
#Check if the string starts with 'following' (^ anchors the pattern at the start):

x = re.findall("^following", df["Document"][0])
if x:
  # The message now names the word that is actually tested; it previously
  # said 'hello', a leftover from the example this cell was copied from.
  print("Yes, the string starts with 'following'")
else:
  print("No match")

Yes, the string starts with 'hello'


In [None]:
#Search for a sequence that starts with "it", followed by four (any) characters, and an "n" -- e.g. "italian":

x = re.findall("it....n", df["Document"][0])
print(x)

['italian']


In [None]:
#Find all digit characters (one match per digit):

# Raw string: "\d" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\d", df["Document"][0])
print(x)


['1', '9', '6', '5', '0', '0', '7']


In [None]:
#Check if the string ends with "saga" (\Z matches only at the very end of the string):

# Raw string: "\Z" in a normal string literal is an invalid escape sequence.
x = re.findall(r"saga\Z", df["Document"][0])

print(x)

if x:
  print("Yes, there is a match!")
else:
  print("No match")

['saga']
Yes, there is a match!


In [None]:
#Return every run of NON-word characters (anything other than letters, digits and the underscore, e.g. spaces, commas, quotes):

# Raw string: "\W" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\W+",  df["original"][0])

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

[' ', ' ', ' ', ', ', ' ', ' ', ' ', ' ', ' ', ', ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ', ', ' ', ' ', ' ', ' ', ' “', ' ', ' ', ' ', ' ', ' ', ' ', ' ‘', '’ ', '.”']
Yes, there is at least one match!


In [None]:
#Return runs of word characters (letters, digits, and the underscore _ character).
#Because * also accepts zero characters, findall additionally returns an empty
#string wherever no word character starts, which is why the joined output
#contains extra spaces.

# Raw string: "\w" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\w*", df["original"][0])

print(" ".join(x))

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")


In  the  following  excerpt   originally  published  in  Italian  in  1965   Eco  offers  a  detailed  examination  of  the  narrative  formula  that  Fleming  employed  in  all  the  Bond  novels   a  strategy  Eco  regards  as   the  basis  of  the  success  of  the   007   saga   
Yes, there is at least one match!


In [None]:
#Return a match at every NON white-space character (one list item per character):

# Raw string: "\S" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\S", df["original"][0])

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

['I', 'n', 't', 'h', 'e', 'f', 'o', 'l', 'l', 'o', 'w', 'i', 'n', 'g', 'e', 'x', 'c', 'e', 'r', 'p', 't', ',', 'o', 'r', 'i', 'g', 'i', 'n', 'a', 'l', 'l', 'y', 'p', 'u', 'b', 'l', 'i', 's', 'h', 'e', 'd', 'i', 'n', 'I', 't', 'a', 'l', 'i', 'a', 'n', 'i', 'n', '1', '9', '6', '5', ',', 'E', 'c', 'o', 'o', 'f', 'f', 'e', 'r', 's', 'a', 'd', 'e', 't', 'a', 'i', 'l', 'e', 'd', 'e', 'x', 'a', 'm', 'i', 'n', 'a', 't', 'i', 'o', 'n', 'o', 'f', 't', 'h', 'e', 'n', 'a', 'r', 'r', 'a', 't', 'i', 'v', 'e', 'f', 'o', 'r', 'm', 'u', 'l', 'a', 't', 'h', 'a', 't', 'F', 'l', 'e', 'm', 'i', 'n', 'g', 'e', 'm', 'p', 'l', 'o', 'y', 'e', 'd', 'i', 'n', 'a', 'l', 'l', 't', 'h', 'e', 'B', 'o', 'n', 'd', 'n', 'o', 'v', 'e', 'l', 's', ',', 'a', 's', 't', 'r', 'a', 't', 'e', 'g', 'y', 'E', 'c', 'o', 'r', 'e', 'g', 'a', 'r', 'd', 's', 'a', 's', '“', 't', 'h', 'e', 'b', 'a', 's', 'i', 's', 'o', 'f', 't', 'h', 'e', 's', 'u', 'c', 'c', 'e', 's', 's', 'o', 'f', 't', 'h', 'e', '‘', '0', '0', '7', '’', 's', 'a', 'g',

In [None]:
#Return a match at every white-space character:

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\s", df["original"][0])

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
Yes, there is at least one match!


In [None]:
#Return a match at every non-digit character (joined back together, this is the text with its digits removed):

# Raw string: "\D" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\D", df["original"][0])

print("".join(x))

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

In the following excerpt, originally published in Italian in , Eco offers a detailed examination of the narrative formula that Fleming employed in all the Bond novels, a strategy Eco regards as “the basis of the success of the ‘’ saga.”
Yes, there is at least one match!


In [None]:
#Check if the string contains any digits (numbers from 0-9), using * (zero or more).
#Because \d* also matches the empty string, findall returns '' at every
#position that is not the start of a digit run, so the list is never empty.

# Raw string: "\d" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\d*", df["original"][0])

print(x)

# any() ignores the empty-string matches; a plain `if x` would always be true
# here and could never report "No match".
if any(x):
  print("Yes, there is at least one match!")
else:
  print("No match")

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1965', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '007', '', '', '', '', '', '', '', '', '']
Yes, there is at least one match!


In [None]:
#Check if the string contains any digits (numbers from 0-9); + groups each run of consecutive digits into one match:

# Raw string: "\d" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\d+", df["original"][0])

print(x)

if x:
  print("Yes, there is at least one match!")
else:
  print("No match")

['1965', '007']
Yes, there is at least one match!


In [None]:
# Look for "ally" anywhere except at the start of a word (\B = not a word boundary).
x = re.findall(r"\Bally", df["original"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")

['ally']
Yes, there is at least one match!


In [None]:
# Look for "sa" at the beginning of a word, e.g. "saga" (\b = word boundary).
x = re.findall(r"\bsa", df["original"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")

['sa']
Yes, there is at least one match!


In [None]:
# Look for "ga" at the end of a word, e.g. "saga" (\b = word boundary).
x = re.findall(r"ga\b", df["original"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")

['ga']
Yes, there is at least one match!


In [None]:
# Show the raw first paragraph that the surrounding regex demos run against.
df["original"][0]

'In the following excerpt, originally published in Italian in 1965, Eco offers a detailed examination of the narrative formula that Fleming employed in all the Bond novels, a strategy Eco regards as “the basis of the success of the ‘007’ saga.”'

In [None]:
#Check if the string starts with "In" (\A matches only at the very start of the string):

# Raw string: "\A" in a normal string literal is an invalid escape sequence.
x = re.findall(r"\AIn", df["original"][0])

print(x)

if x:
  print("Yes, there is a match!")
else:
  print("No match")

['In']
Yes, there is a match!


In [None]:
# Find every run of one or more characters drawn from the set {o, f}.
# This is a character set, not the word "of": "fo", "off" and "o" all match.
x = re.findall("[of]+", df["original"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")


['fo', 'o', 'o', 'o', 'off', 'o', 'of', 'fo', 'o', 'o', 'o', 'o', 'of', 'of']
Yes, there is at least one match!


In [None]:
# Find every run of lower-case letters in the range a through n.
x = re.findall("[a-n]+", df["original"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")

['n', 'he', 'f', 'll', 'ing', 'e', 'ce', 'iginall', 'bli', 'hed', 'in', 'alian', 'in', 'c', 'ffe', 'a', 'de', 'ailed', 'e', 'amina', 'i', 'n', 'f', 'he', 'na', 'a', 'i', 'e', 'f', 'm', 'la', 'ha', 'leming', 'em', 'l', 'ed', 'in', 'all', 'he', 'nd', 'n', 'el', 'a', 'a', 'eg', 'c', 'ega', 'd', 'a', 'he', 'ba', 'i', 'f', 'he', 'cce', 'f', 'he', 'aga']
Yes, there is at least one match!


In [None]:
# Find every run of characters other than "o", "n" or "d"
# (a leading ^ inside the brackets negates the set).
x = re.findall("[^ond]+", df["original"][0])
print(x)

# An empty list is falsy, so this reports whether findall found anything.
print("Yes, there is at least one match!" if x else "No match")

['I', ' the f', 'll', 'wi', 'g excerpt, ', 'rigi', 'ally publishe', ' i', ' Italia', ' i', ' 1965, Ec', ' ', 'ffers a ', 'etaile', ' exami', 'ati', ' ', 'f the ', 'arrative f', 'rmula that Flemi', 'g empl', 'ye', ' i', ' all the B', ' ', 'vels, a strategy Ec', ' regar', 's as “the basis ', 'f the success ', 'f the ‘007’ saga.”']
Yes, there is at least one match!
