In [1]:
import sys,re

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy

nlp = spacy.load('en_core_web_md')

## Parent
### Parent pattern 1
`father,{father_name} * mother,{mother_name}`

Ryan Reynolds https://www.imdb.com/name/nm0005351/bio?ref_=nm_ov_bio_sm
> **His father**, James Chester Reynolds, was a food wholesaler, and **his mother**, Tamara Lee "Tammy" (Stewart), worked as a retail-store saleswoman. He has Irish and Scottish ancestry. Between 1991-93, Ryan appeared in Fifteen (1990), a Nickleodeon series taped in 

In [9]:
displacy.render(nlp('His father, James Chester Reynolds, was a food wholesaler, and his mother, Tamara Lee "Tammy" (Stewart), worked as a retail-store saleswoman. '), style="ent")

In [8]:
parent_ryan = """
His father, James Chester Reynolds, was a food wholesaler, and his mother, Tamara Lee "Tammy" (Stewart), worked as a retail-store saleswoman. He has Irish and Scottish ancestry. Between 1991-93, Ryan appeared in Fifteen (1990), a Nickleodeon series taped in
"""
parent_ryan_doc = nlp(parent_ryan)

father_pattern = [{'LEMMA': 'father'},
           {'IS_PUNCT': True},
           {'ENT_TYPE': 'PERSON', 'OP': '+'}]    

father_matcher = Matcher(nlp.vocab) 
father_matcher.add("father", None, father_pattern) 

father_matches = father_matcher(parent_ryan_doc)

for match_id, start, end in father_matches:
    string_id = nlp.vocab.strings[match_id]  
    span = parent_ryan_doc[start:end]   
    print(match_id, string_id, start, end, span.text)

17071697760115891398 father 2 5 father, James
17071697760115891398 father 2 6 father, James Chester
17071697760115891398 father 2 7 father, James Chester Reynolds


In [17]:
mather_pattern = [{'LEMMA': 'mother'},
           {'IS_PUNCT': True},
           {'ENT_TYPE': 'PERSON', 'OP': '+'}]    

mother_matcher = Matcher(nlp.vocab) 
mother_matcher.add("mother", None, mather_pattern) 

mother_matches = mother_matcher(parent_ryan_doc)

for match_id, start, end in mother_matches:
    string_id = nlp.vocab.strings[match_id]  
    span = parent_ryan_doc[start:end]   
    print(match_id, string_id, start, end, span.text)

7963322251145911254 mother 15 18 mother, Tamara
7963322251145911254 mother 15 19 mother, Tamara Lee
7963322251145911254 mother 15 20 mother, Tamara Lee "
7963322251145911254 mother 15 21 mother, Tamara Lee "Tammy


need second step to extract entity:Person

In [5]:
displacy.render(nlp('mother, Tamara Lee "Tammy'), style="ent")


### Parent pattern 2

`{person_name} * born * to {parent_name} * and {parent_name}`

https://www.imdb.com/name/nm1500155/bio?ref_=nm_ov_bio_sm
>Robert Douglas Thomas Pattinson was born May 13, 1986 in London, England, to Richard Pattinson, a car dealer importing vintage cars, and Clare Pattinson (née Charlton), who worked as a booker at a model agency. He grew up in Barnes, southwest London

In [58]:
displacy.render(nlp('Robert Douglas Thomas Pattinson was born May 13, 1986 in London, England, to Richard Pattinson, a car dealer importing vintage cars, and Clare Pattinson (née Charlton), who worked as a booker at a model agency. '), style="ent")


**First use my regex:**

In [135]:
parent_robert = """
Robert Douglas Thomas Pattinson was born May 13, 1986 in London, England, to Richard Pattinson, a car dealer importing vintage cars, and Clare Pattinson (née Charlton), who worked as a booker at a model agency. He grew up in Barnes, southwest London with two older sisters. Robert discovered his love for music long before acting and started lear
"""
parent_to_pattern = r"born.+?to ([A-Z][\w \(\)]+)\b.+?and ([A-Z][\w \(\)]+)"

# parent_to_re_result = re.search(debut_pattern,text)

case2_re_matches = [(match.start(),match.end()) for match in re.finditer(parent_to_pattern,parent_robert)]
case2_re_matches

[(37, 168)]

In [137]:
# expand a little bit

a = [(start-5,end+5) if start-5>=0 else (0,end+5) for start,end in case2_re_matches]
a

[(32, 173)]

In [140]:
parent_robert[32:173]

' was born May 13, 1986 in London, England, to Richard Pattinson, a car dealer importing vintage cars, and Clare Pattinson (née Charlton), who'

In [145]:
new = """
Robert Douglas Thomas Pattinson was born May 13, 1986 in London, England, to Richard Pattinson, a car dealer importing vintage cars, and Clare Pattinson (née Charlton), who worked as a booker at a model agency. He grew up in Barnes, southwest London with two older sisters. Robert discovered his love for music long before acting and started lear
"""
# testcase = parent_syntactic(text)
testcase = nlp(new)[32:173]
print(testcase)

Charlton), who worked as a booker at a model agency. He grew up in Barnes, southwest London with two older sisters. Robert discovered his love for music long before acting and started lear



In [10]:
# extract Person Entity
def person_ent_extract(text):
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)
    person_pattern = [{'ENT_TYPE': 'PERSON', 'OP': '+'}]
    matcher.add("person_entity", None, person_pattern)
    matches = matcher(doc)
    match_list = sorted([doc[s:e].text for _,s,e in matches],key = len)
    
    l = match_list
    # resolve sub string:
    export = [j for i,j in enumerate(l) if all(j == k or (j not in k) for k in l[i+1:])]
    return export

person_ent_extract(parent_robert[27:178])        

NameError: name 'parent_robert' is not defined


### Parent pattern 3

`son|daughter * of {parent_name} * {parent_name}`

https://www.imdb.com/name/nm0000129/bio?ref_=nm_ov_bio_sm
>Tom is the only son (among four children) of nomadic parents, Mary Lee (Pfeiffer), a special education teacher, and Thomas Cruise Mapother III, an electrical engineer. His parents were both from Louisville, Kentucky, and he has German, Irish, and English ancestry. Young Tom spent his boyhood always on the move, and by the time he was 14 he had attended 15 different schools in the U.S. and Canada. He finally settled in Glen Ridge, New Jersey with his mother and her new husband. While in high school, Tom wanted to become a priest but pretty soon he developed an interest in acting and abandoned his plans of becoming a priest, dropped out of school, and at age 18 headed for New York and a possible acting career. The next 15 years of his life are the stuff of legends. He made his film debut with a small part in Endless Love (1981) and from the outset exhibited an undeniable box office appeal to both male and female audiences.


In [95]:
parent_son_pattern = r"(?: son| daughter).*?of.*?([A-Z][\w \(\)]+).*?([A-Z][\w \(\)]+)"

parent_tom = """
Tom is the only son (among four children) of nomadic parents, Mary Lee (Pfeiffer), a special education teacher, and Thomas Cruise Mapother III, an electrical engineer. His parents were both from Louisville, Kentucky, and he has German, Irish, and English ancestry. Young Tom spent his boyhood always on the move, and by the time he was 14 he had attended 15 different schools in the U.S. and Canada. He finally settled in Glen Ridge, New Jersey with his mother and her new husba
"""

parent_re_matches = []
for match in re.finditer(parent_son_pattern,parent_tom):
    parent_re_matches.append((match.start(),match.end()))

s,e = parent_re_matches[0]

person_ent_extract(parent_tom[s:e])        

['Mary Lee', 'Pfeiffer', 'Thomas Cruise Mapother III']

In [104]:
m = re.search(parent_son_pattern,parent_tom)
for i in [0,1,2]:
    print(i,":\t",m.group(i))

0 :	  son (among four children) of nomadic parents, Mary Lee (Pfeiffer), a special education teacher, and Thomas Cruise Mapother III
1 :	 Mary Lee (Pfeiffer)
2 :	 Thomas Cruise Mapother III


## spouse

### case 1
`married|engagement|wife * {spouse_name}`

Ryan Reynolds https://www.imdb.com/name/nm0005351/bio?ref_=nm_ov_bio_sm

>He has been married to Blake Lively since September 9, 2012. They have two children. He was previously married to Scarlett Johansson.

https://www.imdb.com/name/nm0000437/bio?ref_=nm_ov_bio_sm
>Harrelson was briefly married to Nancy Simon in the 80s, and later married his former assistant, Laura Louie, with whom he has three daughters.

>Cumberbatch's engagement to theatre and opera director Sophie Hunter, whom he has known for 17 years, was announced in the "Forthcoming Marriages" section of The Times newspaper on November 5, 2014. On February 14, 2015, the couple married at the 12th century Church of St. Peter and St. Paul on the Isle of Wight followed by a reception at Mottistone Manor. They have two sons, Christopher Carlton (born 2015) and Hal Auden

In [179]:
benedict_text = """
Cumberbatch's engagement to theatre and opera director Sophie Hunter, whom he has known for 17 years, was announced in the "Forthcoming Marriages" section of The Times newspaper on November 5, 2014. On February 14, 2015, the couple married at the 12th century Church of St. Peter and St. Paul on the Isle of Wight followed by a reception at Mottistone Manor. They have two sons, Christopher Carlton (born 2015) and Hal Auden
"""

spouse_pattern = [{'LEMMA': 'engagement'},
           {'IS_ASCII': True, 'OP': '*'},
           {'ENT_TYPE': 'PERSON', 'OP': '+'},
                  {'IS_PUNCT': True}]   

spouse_matcher = Matcher(nlp.vocab) 
spouse_matcher.add("spouse", None, spouse_pattern) 

spouse_matches = spouse_matcher(nlp(benedict_text))

for match_id, start, end in spouse_matches:
    string_id = nlp.vocab.strings[match_id]  
    span = nlp(benedict_text)[start:end]   
    print(match_id, string_id, start, end, span.text)

matched_text = """
engagement to theatre and opera director Sophie Hunter, whom he has known for 17 years, was announced in the "Forthcoming Marriages" section of The Times newspaper on November 5, 2014. On February 14, 2015, the couple married at the 12th century Church of St. Peter and St. Paul on the Isle of Wight followed by a reception at Mottistone Manor. They have two sons, Christopher Carlton (
"""

print(person_ent_extract(' His parents, Wanda Ventham and Timothy Carlton (born Timothy Carlton Congdon Cumberbatch), are'))

16640336794147003635 spouse 3 12 engagement to theatre and opera director Sophie Hunter,
16640336794147003635 spouse 3 80 engagement to theatre and opera director Sophie Hunter, whom he has known for 17 years, was announced in the "Forthcoming Marriages" section of The Times newspaper on November 5, 2014. On February 14, 2015, the couple married at the 12th century Church of St. Peter and St. Paul on the Isle of Wight followed by a reception at Mottistone Manor. They have two sons, Christopher Carlton (
['Wanda Ventham', 'Timothy Carlton Congdon Cumberbatch']


In [176]:
spouse_re_pattern = r"(?:married|engagement|wife).*?([A-Z]\w+ *[A-Z]*\w* *[A-Z]*\w*)"
spouse_re_pattern_loose = r"married.*?([A-Z][\w \(\)]+)"

pouse_re_matches = [match for match in re.finditer(spouse_re_pattern,benedict_text)]
pouse_re_matches[0].group(1)

'Sophie Hunter'

In [18]:
pitt = """
"Henry William Dalgliesh Cavill was born on the Bailiwick of Jersey, a British Crown dependency in the Channel Islands. His mother, Marianne (Dalgliesh), a housewife, was also born on Jersey, and is of Irish, Scottish and English ancestry. Henry's father, Colin Richard Cavill, a stockbroker, is of English origin (born in Chester, England). Henry is the second youngest son, with four brothers. He was privately educated at St. Michael's Preparatory School in Saint Saviour, Jersey before attending Stowe School in Buckinghamshire, England.

His interest in acting started at an early age with school play renditions of Shakespeare's "A Midsummer Night's Dream", and Sonny LaTierri in "Grease". He also starred and directed Shakespeare's "Hamlet" in the BBC documentary "40 Minutes". It was at age 17 when Henry was discovered by casting directors at school who were looking for a young boy to play Albert Mondego in The Count of Monte Cristo (2002). He went on to star in Laguna (2001), appear in BBC's The Inspector Lynley Mysteries (2001), the television film Goodbye, Mr. Chips (2002), and the television series Midsomer Murders (1997).

When Henry was 20 years old, he gained starring roles in I Capture the Castle (2003), Hellraiser: Hellworld (2005), Red Riding Hood (2006) and Tristan & Isolde (2006). He also had a minor role in the fantasy-adventure epic Stardust (2007) alongside Sienna Miller and Ben Barnes. During 2007-2010, Henry had a leading role on the television series The Tudors (2007) as Charles Brandon, 1st Duke of Suffolk. The series was a success and was nominated for a Golden Globe Award in 2007 and won an Emmy Award in 2008. Entertainment Weekly named him "Most Dashing Duke".

He also starred in Blood Creek (2006) and Woody Allen's comedy film Whatever Works (2009). On January 30, 2011, it was announced that Henry Cavill had been cast as the next Superman in Man of Steel (2013), making him the first non-American actor to play Superman. The movie was directed by Zach Snyder, produced by Christopher Nolan, and scripted by David S. Goyer. On November 7, 2011, Henry starred in Tarsem Singh's fantasy-adventure epic Immortals (2011) alongside Mickey Rourke, Freida Pinto and Luke Evans. On September 7, 2012, Henry starred in the action-thriller Cold Light of Day (2003) alongside Bruce Willis and Sigourney Weaver.

On June 10, 2013, Man of Steel (2013) kicked off its world premiere in New York City followed by London, Bailiwick of Jersey, Sicily, Madrid, Shanghai, Sydney and Tokyo. The movie became the highest-grossing Superman film to date, and the second-highest-grossing reboot of all time behind The Amazing Spider-Man (2012). Glamour magazine ranked him the #1 "Sexiest Man". In August 2014, Henry became the Ambassador for Durrell Wildlife Park and created a website and social media called #CavillConservation to help raise funds and awareness for his love of animals and conservation. On November 3, 2014, it was announced that Cavill, his brother Charlie, and London-based producer Rex Glensy, have formed their own British production company, Promethean Productions.

On August 7, 2015, The Man from U.N.C.L.E. (2015) began its premiere tour with a people's premiere at the famous Somerset House in London, followed by its world premiere in New York City, then Toronto, and Rio de Janeiro. Cavill reprised his role as Superman in Batman v Superman: Dawn of Justice (2016) and Justice League (2017).
"
"""

text = nlp(pitt)
sentences = list(text.sents)

pattern1 = [
    {"TEXT": {"REGEX": r'married|marry|engagement|wife|husband'}},
    {'IS_ASCII': True, 'OP': '*'},
    {'ENT_TYPE': 'PERSON', 'OP': '+'},
    {'IS_PUNCT': True}
]
spouse_matcher1 = Matcher(nlp.vocab)
spouse_matcher1.add("spouse", None, pattern1)
syntactic_matches = []
for sentence in sentences:
    sentence_text = sentence.text.lower()
    t_nlp = nlp(sentence.text)
    if re.search(r'married|marry|engagement|wife|husband',sentence_text):
        print(sentence_text)
#         syntactic_matches.extend([ t_nlp[s:e].text for _,s,e in spouse_matcher1(t_nlp)])

        


his mother, marianne (dalgliesh), a housewife, was also born on jersey, and is of irish, scottish and english ancestry.


In [17]:
syntactic_result = []

for match in syntactic_matches:
    syntactic_result.extend(person_ent_extract(match))
syntactic_result

[]

In [None]:
t = "Dwayne has a daughter, Simone Garcia Johnson, born in 2001, with his ex-wife Dany Garcia, and daughters, Jasmine, born in 2015, and Tiana Gia, born in 2018, with his wife, singer and songwriter Lauren Hashian."
re.findall(r"(?:married|engagement|wife).*?([A-Z]\w+ *[A-Z]*\w* *[A-Z]*\w*)",t)

In [217]:
l = ['abc','abcdef','def','defdef','polopolo']

l = sorted(l, key = len)

print("sorted(l)",l)
export = []
for index,item in enumerate(l):
    print('\n',index,item,l[index+1:])
    judge = []
    for sub_item in l[index+1:]:
        if item not in sub_item:
            judge.append(True)
        else:
            judge.append(False)
    if all(judge):
        export.append(item)
        

print(export)            

sorted(l) ['abc', 'def', 'abcdef', 'defdef', 'polopolo']

 0 abc ['def', 'abcdef', 'defdef', 'polopolo']

 1 def ['abcdef', 'defdef', 'polopolo']

 2 abcdef ['defdef', 'polopolo']

 3 defdef ['polopolo']

 4 polopolo []
['abcdef', 'defdef', 'polopolo']


In [221]:
t = """ "Michael B. Jordan, the middle of three children, was born in Santa Ana, California and raised in Newark, New Jersey. He is the son of Donna (Davis), a high school counselor, and Michael A. Jordan. His middle name, Bakari, means "noble promise" in Swahili. (He is not related to, or named after, basketball legend Michael Jordan.)

Jordan has starred in three of the most critically acclaimed television dramas of the past decade. First, Jordan played the hard-shelled but softhearted Wallace in HBO's dramatic hit series The Wire (2002). He then went on to star as quarterback Vince Howard on Friday Night Lights (2006) (NBC), before playing a recovering alcoholic, Alex, on NBC's Parenthood (2010).

Jordan successfully took on his first major leading film role when he starred as Oscar Grant in Fruitvale Station (2013). The film is an account of Oscar's controversial slaying by police officers on a San Francisco train platform. The cast includes Octavia Spencer and Melonie Diaz, and was produced by Forest Whitaker (Significant Films). It premiered at the 2013 Sundance Film Festival where it received the Grand Jury Prize and Audience Award for U.S. Dramatic Film. It also screened at the 2013 Cannes Film Festival in the Un Certain Regard category. The has garnered many awards including Best First Feature at the 2014 Independent Spirit Awards, Outstanding Independent Motion Picture at the 2014 NAACP Image Awards and the 2014 Stanley Kramer Award from the Producer's Guild of America. The 2013 New York Film Critics Circle honored it with Best First Film and the picture was also chosen as one of the Top Ten Films at the 2013 National Board of Review Awards, where Jordan took home the award for Breakthrough Actor. Jordan also won the 2013 Gotham Award for Breakthrough Actor and was nominated for an Independent Spirit Award for Best Lead Actor.

In 2015, Jordan starred in Josh Trank's Fantastic Four (2015), playing the role of 'Johnny Storm' aka 'The Human Torch', opposite Miles Teller, Jamie Bell, and Kate Mara for 20th Century Fox. The film was released on August 7th 2015. Jordan previously starred in 20th Century Fox's box office hit Chronicle (2012) (which was also directed by Trank), a supernatural thriller that follows three Portland teens (MBJ, Dane Dehaan, and Alex Russell) as they develop incredible powers after exposure to a mysterious substance; That Awkward Moment (2015) opposite Zac Efron and Miles Teller for Focus Films; and the George Lucas produced film Red Tails (2012), the story of the first African American pilots to fly in a combat squadron during WWII aka The Tuskegee Airmen.

Jordan reunited with Ryan Coogler for Creed (2015), starring alongside Sylvester Stallone and Tessa Thompson. The film was released on Thanksgiving 2015 by MGM and Warner Brothers. A devoted fan of comic books growing up, Jordan starred as the villain, Eric Killmonger, in the 2018 box office smash Black Panther (2018). In 2018, he is also starring as Guy Montag in the HBO adaptation of Ray Bradbury's science fiction classic Fahrenheit 451 (2018).

He resides in Los Angeles, where he supports the charity Lupus LA."


"""

displacy.render(nlp(t[124:194]), style="ent")

In [222]:
t[124:194]

' the son of Donna (Davis), a high school counselor, and Michael A. Jor'

## Education

In [29]:
input = """
"Henry William Dalgliesh Cavill was born on the Bailiwick of Jersey, a British Crown dependency in the Channel Islands. His mother, Marianne (Dalgliesh), a housewife, was also born on Jersey, and is of Irish, Scottish and English ancestry. Henry's father, Colin Richard Cavill, a stockbroker, is of English origin (born in Chester, England). Henry is the second youngest son, with four brothers. He was privately educated at St. Michael's Preparatory School in Saint Saviour, Jersey before attending Stowe School in Buckinghamshire, England.

His interest in acting started at an early age with school play renditions of Shakespeare's "A Midsummer Night's Dream", and Sonny LaTierri in "Grease". He also starred and directed Shakespeare's "Hamlet" in the BBC documentary "40 Minutes". It was at age 17 when Henry was discovered by casting directors at school who were looking for a young boy to play Albert Mondego in The Count of Monte Cristo (2002). He went on to star in Laguna (2001), appear in BBC's The Inspector Lynley Mysteries (2001), the television film Goodbye, Mr. Chips (2002), and the television series Midsomer Murders (1997).

When Henry was 20 years old, he gained starring roles in I Capture the Castle (2003), Hellraiser: Hellworld (2005), Red Riding Hood (2006) and Tristan & Isolde (2006). He also had a minor role in the fantasy-adventure epic Stardust (2007) alongside Sienna Miller and Ben Barnes. During 2007-2010, Henry had a leading role on the television series The Tudors (2007) as Charles Brandon, 1st Duke of Suffolk. The series was a success and was nominated for a Golden Globe Award in 2007 and won an Emmy Award in 2008. Entertainment Weekly named him "Most Dashing Duke".

He also starred in Blood Creek (2006) and Woody Allen's comedy film Whatever Works (2009). On January 30, 2011, it was announced that Henry Cavill had been cast as the next Superman in Man of Steel (2013), making him the first non-American actor to play Superman. The movie was directed by Zach Snyder, produced by Christopher Nolan, and scripted by David S. Goyer. On November 7, 2011, Henry starred in Tarsem Singh's fantasy-adventure epic Immortals (2011) alongside Mickey Rourke, Freida Pinto and Luke Evans. On September 7, 2012, Henry starred in the action-thriller Cold Light of Day (2003) alongside Bruce Willis and Sigourney Weaver.

On June 10, 2013, Man of Steel (2013) kicked off its world premiere in New York City followed by London, Bailiwick of Jersey, Sicily, Madrid, Shanghai, Sydney and Tokyo. The movie became the highest-grossing Superman film to date, and the second-highest-grossing reboot of all time behind The Amazing Spider-Man (2012). Glamour magazine ranked him the #1 "Sexiest Man". In August 2014, Henry became the Ambassador for Durrell Wildlife Park and created a website and social media called #CavillConservation to help raise funds and awareness for his love of animals and conservation. On November 3, 2014, it was announced that Cavill, his brother Charlie, and London-based producer Rex Glensy, have formed their own British production company, Promethean Productions.

On August 7, 2015, The Man from U.N.C.L.E. (2015) began its premiere tour with a people's premiere at the famous Somerset House in London, followed by its world premiere in New York City, then Toronto, and Rio de Janeiro. Cavill reprised his role as Superman in Batman v Superman: Dawn of Justice (2016) and Justice League (2017).
"


"""

text = nlp(input)
sentences = list(text.sents)

pattern = [{'ENT_TYPE': 'ORG', 'OP': '+'}]
education_matcher = Matcher(nlp.vocab)
education_matcher.add("education",None,pattern)
syntactic_matches = []
for sentence in sentences:
    t_nlp = nlp(sentence.text)
    if re.search(r'university|college|school|academy|husband', sentence.text.lower()):
        syntactic_matches.append([t_nlp[s:e].text for _, s, e in education_matcher(t_nlp)])
# for org in syntactic_matches:
#     if org != []:
#         syntactic_matches.append(sorted(org,key=len)[-1])

In [35]:
for org in syntactic_matches:
    o = sorted(org, key=len)
    if o!=[]:
        print(o)

["'s", 'St.', 'Stowe', 'School', 'School', 'Michael', "Michael's", 'St. Michael', 'Preparatory', 'Stowe School', "St. Michael's", "'s Preparatory", 'Preparatory School', "Michael's Preparatory", "'s Preparatory School", "St. Michael's Preparatory", "Michael's Preparatory School", "St. Michael's Preparatory School"]
['of', 'The', 'Count', 'Monte', 'Cristo', 'Count of', 'of Monte', 'The Count', 'The Count of', 'Monte Cristo', 'Count of Monte', 'of Monte Cristo', 'The Count of Monte', 'Count of Monte Cristo', 'The Count of Monte Cristo']


## movie

In [49]:
from collections import namedtuple
Film = namedtuple('Film',['title','year'])


In [60]:
re_pattern = r"([A-Z][\w' .:-]{1,25})\((\d{4})\)"
film_list =[]
for match in re.finditer(re_pattern, input):
    film_list.append(
        Film(match.group(1),match.group(2))
    )
sorted(film_list,key =lambda x:x.year)

[Film(title='Midsomer Murders ', year='1997'),
 Film(title='Laguna ', year='2001'),
 Film(title='Lynley Mysteries ', year='2001'),
 Film(title='The Count of Monte Cristo ', year='2002'),
 Film(title='Mr. Chips ', year='2002'),
 Film(title='I Capture the Castle ', year='2003'),
 Film(title='Cold Light of Day ', year='2003'),
 Film(title='Hellraiser: Hellworld ', year='2005'),
 Film(title='Red Riding Hood ', year='2006'),
 Film(title='Isolde ', year='2006'),
 Film(title='Blood Creek ', year='2006'),
 Film(title='Stardust ', year='2007'),
 Film(title='The Tudors ', year='2007'),
 Film(title='Whatever Works ', year='2009'),
 Film(title='Immortals ', year='2011'),
 Film(title='The Amazing Spider-Man ', year='2012'),
 Film(title='Superman in Man of Steel ', year='2013'),
 Film(title='Man of Steel ', year='2013'),
 Film(title='The Man from U.N.C.L.E. ', year='2015'),
 Film(title='Superman: Dawn of Justice ', year='2016'),
 Film(title='Justice League ', year='2017')]

In [59]:
film_list[0].year

'2002'

## Debut
### lexical
https://www.imdb.com/name/nm0000553/bio?ref_=nm_ov_bio_sm

In [63]:
input ="""

Liam Neeson was born on June 7, 1952 in Ballymena, Northern Ireland, to Katherine (Brown), a cook, and Bernard Neeson, a school caretaker. He was raised in a Catholic household. During his early years, Liam worked as a forklift operator for Guinness, a truck driver, an assistant architect and an amateur boxer. He had originally sought a career as a teacher by attending St. Mary's Teaching College, Newcastle. However, in 1976, Neeson joined the Belfast Lyric Players' Theater and made his professional acting debut in the play "The Risen People". After two years, Neeson moved to Dublin's Abbey Theater where he performed the classics. It was here that he was spotted by director John Boorman and was cast in the film Excalibur (1981) as Sir Gawain, his first high-profile film role.

Through the 1980s Neeson appeared in a handful of films and British TV series - including The Bounty (1984), A Woman of Substance (1984), The Mission (1986), and Duet for One (1986) - but it was not until he moved to Hollywood to pursue larger roles that he began to get noticed. His turn as a mute homeless man in Suspect (1987) garnered good reviews, as did supporting roles in The Good Mother (1988) and High Spirits (1988) - though he also starred in the best-to-be-forgotten Satisfaction (1988), which also featured a then-unknown Julia Roberts - but leading man status eluded him until the cult favorite Darkman (1990), directed by Sam Raimi. From there, Neeson starred in Under Suspicion (1991) and Ethan Frome (1993), was hailed for his performance in Woody Allen's Husbands and Wives (1992), and ultimately was picked by Steven Spielberg to play Oskar Schindler in Schindler's List (1993). The starring role in the Oscar-winning Holocaust film brought Neeson Academy Award, BAFTA and Golden Globe nominations for Best Actor.

Also in 1993, he made his Broadway debut with a Tony-nominated performance in "Anna Christie", in which he co-starred with his future wife Natasha Richardson. The next year, the two also starred opposite Jodie Foster in the movie Nell (1994), and were married in July of that year. Leading roles as the 18th century Scottish Highlander Rob Roy (1995) and the Irish revolutionary leader Michael Collins (1996) followed, and soon Neeson was solidified as one of Hollywood's top leading men. He starred in the highly-anticipated Star Wars: Episode I - The Phantom Menace (1999) as Qui-Gon Jinn, received a Golden Globe nomination for Kinsey (2004), played the mysterious Ducard in Christopher Nolan's Batman Begins (2005), and provided the voice for Aslan in The Chronicles of Narnia: The Lion, the Witch and the Wardrobe (2005).

"""

In [69]:
re_pattern = r"debut.*?([\w '-]+\(\d{4}\))"
debut_match = []
for match in re.finditer(re_pattern,input):
    debut_match.append(match.group(1))
debut_match = debut_match[0] 
debut_match

' It was here that he was spotted by director John Boorman and was cast in the film Excalibur (1981)'

In [71]:
def starin_syntactic(input):
    """
    identical to starin_lexical
    """
    re_pattern = r"([A-Z][\w' .:-]{1,25})\((\d{4})\)"
    film_list =[]
    for match in re.finditer(re_pattern, input):
        film_list.append(
            Film(match.group(1),match.group(2))
        )
    film_list = sorted(film_list, key=lambda x: x.year)
    export = [film.title.strip() for film in film_list]

    return export

starin_syntactic(debut_match)

['Excalibur']

## export

In [72]:
export_file_path = "./Ziheng_Gong_hw02_cast.jl"
export_dict = {"a": "a","b":"b"}
with open(export_file_path,"w+") as f:
    f.write(str(export_dict))
    f.write('\n')