In [16]:
import pandas as pd
import numpy as np
import spacy
import random
import train
import re
from spacy.gold import GoldParse
from spacy.scorer import Scorer

# Cleaning Dataset

In [2]:
# Load the raw tweet export for the #jokermovie hashtag.
df = pd.read_csv(filepath_or_buffer='input/#jokermovie.csv')

In [3]:
# Keep only the tweet id and the tweet text.
dfText = df.loc[:, ['id', 'text']]

In [4]:
# Preview the first five rows.
dfText.head(n=5)

Unnamed: 0,id,text
0,1180633752885743616,People at the cinema thinking they got the #Jo...
1,1180633751744913408,I have a new favorite joker @jokermovie #Joker...
2,1180633723932483584,"⭐⭐⭐⭐⭐ Performance, what a movie #JokerMovie pi..."
3,1180633642198077441,#JokerMovie was absolutely amazing and ended w...
4,1180633577693880321,I just want to give both Arthur Fleck and Gary...


# Create Training Data
Entity categories to annotate: characters, talent, plot, soundtrack, tickets, hype, movie.

In [5]:
# Maps each entity label to the surface phrases that identify it in a tweet.
# The keys are also registered as NER labels when the model is created.
categories = {
    'CHARACTERS': [
        'character', 'characters', 'Arthur Fleck', 'Murray Franklin',
        'Sophie Dumond', 'Penny Fleck', 'mother', 'mum',
    ],
    'TALENT': [
        'actor', 'actress', 'actors', 'talent', 'director',
        'Joaquin Phoenix', '#JoaquinPhoenix', 'Robert De Niro',
        'Zazie Beetz', 'Frances Conroy',
        # The director's name is spelled "Phillips"; the single-l variant is
        # kept so tweets containing that misspelling are still picked up.
        'Todd Philips', 'Todd Phillips',
    ],
    'PLOT': ['plot', 'frame', 'scene', 'scenes'],
    'TICKETS': ['ticket', 'tickets', 'presale'],
    'SOUNDTRACK': ['music', 'song'],
    'SPECIAL EFFECTS': ['FX', 'visual effects', 'graphics', 'postproduction'],
    # NOTE(review): 'costum' looks like a typo or a deliberate stem - confirm
    # against how train.trainData matches phrases before changing it.
    'COSTUME DESIGN': ['costum', 'costumes', 'costume designer'],
    'HYPE': ['hype', 'hyyyype', 'amazing', 'awesome', 'desire', 'desires', 'premiere', 'Cant wait'],
    'BOX OFFICE': ['box office'],
    'REVIEWS': ['review', 'reviews'],
    'TRAILER': ['trailer', 'trailers'],
}

In [6]:
# Hand-annotated examples, one (text, annotations) pair per tweet.
# Each entity is (start_char, end_char, label); offsets index into the text
# and the end offset is exclusive.
TRAINING_DATA = [
    (
        "People at the cinema thinking they got the #JokerMovie  plot like: pic.twitter.com/FULbcu3SBj",
        dict(entities=[(56, 60, "PLOT")]),
    ),
    (
        "I just want to give both Arthur Fleck and Gary a hug. #JokerMovie #Joker #JokerFilm",
        dict(entities=[(25, 37, "CHARACTERS")]),
    ),
    (
        "These 3 actors BETTER get nominated or even win an Oscar. #AvengersEndgame #JokerMovie #ITChapterTwopic.twitter.com/Sd9Ele81wz",
        dict(entities=[(8, 14, "TALENT")]),
    ),
]

In [7]:
# Build the NER training examples from the tweets and the category phrases.
# NOTE(review): train.trainData is a project-local helper not shown here; the
# training cell shuffles its result in place and unpacks (text, annotation)
# pairs from it, so it is presumably a list of such pairs - confirm.
training = train.trainData(dfText, categories)

# Show the size and a short preview instead of dumping every example into the
# cell output.
print(len(training), "training examples")
print(training[:3])



# Creating new spaCy model

In [8]:

# Start from an empty English pipeline
nlp = spacy.blank('en')

# Build an entity recognizer and register it as a pipeline component
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

# Register every category name as a label the recognizer can predict
for category in categories:
    ner.add_label(category)


# Training spaCy model

In [13]:
# Training hyper-parameters, named so they are easy to find and tune.
N_ITERATIONS = 20
DROPOUT = 0.2

# Every pipe except the entity recognizer; these are switched off while training
# so that only 'ner' is updated. For a pipeline that contains nothing but 'ner'
# this list is empty and disabling is a no-op.
other_pipe = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

with nlp.disable_pipes(*other_pipe):
    # Start the training; keep the optimizer so it is passed explicitly to update()
    optimizer = nlp.begin_training()

    # The loop variable must not be called `int`: at notebook top level that
    # rebinds the builtin for every cell executed afterwards.
    for iteration in range(N_ITERATIONS):
        print("Starting iteration" + str(iteration))
        random.shuffle(training)
        losses = {}

        # One example per update step; losses accumulate per pipe name
        for text, annotation in training:
            nlp.update([text], [annotation], sgd=optimizer, drop=DROPOUT, losses=losses)
        print(losses)

# Separate handle on the trained pipeline, used by the save cell.
new_model = nlp

Starting iteration0
{'ner': 17.788594183135228}
Starting iteration1
{'ner': 17.10076282525097}
Starting iteration2
{'ner': 13.572334574823463}
Starting iteration3
{'ner': 29.151873480620992}
Starting iteration4
{'ner': 7.203159916214756}
Starting iteration5
{'ner': 20.617659169680806}
Starting iteration6
{'ner': 18.60281050071481}
Starting iteration7
{'ner': 26.15234339415644}
Starting iteration8
{'ner': 13.929688162815273}
Starting iteration9
{'ner': 14.060932937200453}
Starting iteration10
{'ner': 6.762181635813507}
Starting iteration11
{'ner': 2.120291148396014}
Starting iteration12
{'ner': 5.497592325612643}
Starting iteration13
{'ner': 6.023142083105399}
Starting iteration14
{'ner': 5.915825262149584}
Starting iteration15
{'ner': 0.4790009731723835}
Starting iteration16
{'ner': 8.356150320642307}
Starting iteration17
{'ner': 4.260966975686276}
Starting iteration18
{'ner': 5.877302073159544}
Starting iteration19
{'ner': 4.294871653119064}


In [34]:
# Persist the trained pipeline to the 'spacyModel' directory
new_model.to_disk(path='spacyModel')

In [None]:
#https://www.reddit.com/r/spacynlp/comments/518kbw/training_ner_model_from_scratch/

# To continue training an already-trained model (rather than starting over with begin_training), use:
#nlp.resume_training()

# Updating an existing model

In [None]:
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

# Phrase lists for the rule-based matcher, one list per entity label.
PLOT = ['plot', 'frame', 'scene', 'scenes']
TALENT = [
    'actor', 'actress', 'actors', 'talent', 'director',
    'Joaquin Phoenix', '#JoaquinPhoenix', 'Robert De Niro',
    'Zazie Beetz', 'Frances Conroy',
    # The director's name is spelled "Phillips"; the single-l variant is kept
    # so tweets containing that misspelling still match.
    'Todd Philips', 'Todd Phillips',
]

# NOTE: this rebinds `nlp` to a fresh English pipeline; the pipeline trained
# earlier in the notebook stays reachable through `new_model`.
nlp = English()

# A single matcher holds both phrase sets, each registered under its own key;
# the key comes back as the match id so hits can be told apart.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("PLOT", None, *list(nlp.pipe(PLOT)))
matcher.add("TALENT", None, *list(nlp.pipe(TALENT)))


def plot_component(doc):
    """Pipeline component that tags PLOT phrases as entities.

    The module-level ``matcher`` holds patterns for more than one key, so only
    hits whose match id is the "PLOT" key are labelled here. Entities already
    on the doc are kept unless they overlap one of the new PLOT spans.

    Args:
        doc: the spaCy Doc being processed.

    Returns:
        The same doc, with ``doc.ents`` updated.
    """
    plot_key = doc.vocab.strings["PLOT"]
    plot_spans = [
        Span(doc, start, end, label="PLOT")
        for match_id, start, end in matcher(doc)
        if match_id == plot_key
    ]

    # doc.ents rejects overlapping spans, so drop any existing entity that
    # shares a token with a new PLOT span before merging.
    taken = {i for span in plot_spans for i in range(span.start, span.end)}
    kept = [ent for ent in doc.ents if taken.isdisjoint(range(ent.start, ent.end))]

    doc.ents = sorted(kept + plot_spans, key=lambda span: span.start)
    return doc

def talent_component(doc):
    """Pipeline component that tags TALENT phrases as entities.

    The module-level ``matcher`` holds patterns for more than one key, so only
    hits whose match id is the "TALENT" key are labelled here. Entities set by
    earlier components are kept unless they overlap one of the new TALENT
    spans, in which case the TALENT span wins.

    Args:
        doc: the spaCy Doc being processed.

    Returns:
        The same doc, with ``doc.ents`` updated.
    """
    talent_key = doc.vocab.strings["TALENT"]
    talent_spans = [
        Span(doc, start, end, label="TALENT")
        for match_id, start, end in matcher(doc)
        if match_id == talent_key
    ]

    # doc.ents rejects overlapping spans, so drop any existing entity that
    # shares a token with a new TALENT span before merging.
    taken = {i for span in talent_spans for i in range(span.start, span.end)}
    kept = [ent for ent in doc.ents if taken.isdisjoint(range(ent.start, ent.end))]

    doc.ents = sorted(kept + talent_spans, key=lambda span: span.start)
    return doc


# Add both custom components to the pipeline (the function objects themselves;
# a bare `talent` name is not defined anywhere in this notebook)
nlp.add_pipe(plot_component)
nlp.add_pipe(talent_component)
print(nlp.pipe_names)

# Process a sample tweet and print the text and label of every entity found
doc = nlp("Bookings opened in Chennai for the movies in PVR cinemas\n#JokerMovie \n#WAR \n#SyeRaaOnOct2nd \nBook your tickets now @_PVRCinemas or @bookmyshow plot")
print([(ent.text, ent.label_) for ent in doc.ents])