In [1]:
import re

import pandas as pd
import neuralcoref
import spacy

import nltk
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
 

In [2]:
# Read the first 100 rows of the tab-separated movie dataset.
df = pd.read_csv('movies_genres.csv', sep='\t', nrows=100)

# Genre indicator columns, in the order they are unpacked below.
GENRE_COLUMNS = [
    'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror',
    'Lifestyle', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
    'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western',
]

# The idea is to build one model per genre, so every label column is kept
# in its own variable before being removed from the feature frame.
(
    label_action, label_adult, label_adventure, label_animation,
    label_biography, label_comedy, label_crime, label_documentary,
    label_drama, label_family, label_fantasy, label_gameshow,
    label_history, label_horror, label_lifestyle, label_music,
    label_musical, label_mystery, label_news, label_reality,
    label_romance, label_scifi, label_short, label_sport,
    label_talkshow, label_thriller, label_war, label_western,
) = (df[genre] for genre in GENRE_COLUMNS)

df = df.drop(columns=GENRE_COLUMNS)
df.head()

Unnamed: 0,title,plot
0,"""#7DaysLater"" (2013)",#7dayslater is an interactive comedy series f...
1,"""#BlackLove"" (2015) {Crash the Party (#1.9)}","With just one week left in the workshops, the..."
2,"""#BlackLove"" (2015) {Making Lemonade Out of Le...",All of the women start making strides towards...
3,"""#BlackLove"" (2015) {Miss Independent (#1.5)}",All five of these women are independent and s...
4,"""#BlackLove"" (2015) {Sealing the Deal (#1.10)}",Despite having gone through a life changing p...


In [None]:
# Load spaCy's small English pipeline; `nlp` is the object that
# neuralcoref.add_to_pipe and coref_resolution receive later on.
nlp = spacy.load('en_core_web_sm')

In [3]:
# coreference resolution

# experiment: a small sample text used to eyeball what neuralcoref produces
text = """

Walt Disney was born on December 5, 1901. Disney became one of the best-known motion picture producers in the world. He is particularly noted for being a film producer and a popular showman, as well as an innovator in animation and theme park design.

Disney is famous for his contributions in the field of entertainment during the 20th century. His first success was through the series, Oswald the Lucky Rabbit which was created by the Disney studio for Charles Mintz of Universal Studios. When Disney asked for a larger budget for his popular Oswald series, Mintz refused and Disney had to quit. Later, Disney and his brother Roy O. Disney started from scratch and co-founded Walt Disney Productions, now known as The Walt Disney Company. Today, this company has annual revenues of approximately U.S. $35 billion. This success is largely due to a number of the world's most famous fictional characters he and his staff created including Mickey Mouse, a character for which Disney himself was the original voice.

Disney won 26 Academy Awards out of 59 nominations, including a record four in one year, giving him more awards and nominations than any other individual. He is also the namesake for Disneyland and Walt Disney World Resort theme parks in the United States, as well as the international resorts in Japan, France, and China.

Disney died of lung cancer in Burbank, California, on December 15, 1966. The following year, construction began on Walt Disney World Resort in Florida. His brother Roy Disney inaugurated The Magic Kingdom on October 1, 1971.

"""

# Register neuralcoref only once: adding a component whose name is already in
# the pipeline raises, which would make this cell fail when it is re-run on
# the same kernel.
if not nlp.has_pipe('neuralcoref'):
    neuralcoref.add_to_pipe(nlp, greedyness=0.45, max_dist=40, max_dist_match=500)
doc = nlp(text)

# Show the detected mention clusters and the text with mentions replaced.
print(doc._.coref_clusters)
print(doc._.coref_resolved)

# pre-processing
def coref_resolution(dataframe, nlp):
    """Replace every plot in ``dataframe`` with its coreference-resolved text.

    The ``plot`` column of ``dataframe`` is overwritten in place; nothing is
    returned.

    dataframe: frame with a ``plot`` column holding the plot texts.
    nlp: callable pipeline whose result exposes ``._.coref_resolved``.

    Rows are processed by position, so the frame may carry any index (not
    only the default 0..n-1 one). Values that are not strings (e.g. missing
    plots) are left untouched instead of being passed to ``nlp``.
    """
    dataframe['plot'] = [
        nlp(plot)._.coref_resolved if isinstance(plot, str) else plot
        for plot in dataframe['plot']
    ]
    
# Resolve coreferences in every movie plot; this rewrites df['plot'] in place.
coref_resolution(df, nlp)

[

Walt Disney: [

Walt Disney, Disney, Disney, Disney, Disney, Disney, Disney, Disney, Disney himself, Disney, Disney], He: [He, his, His, his], Charles Mintz: [Charles Mintz, Mintz, his], The Walt Disney Company: [The Walt Disney Company, this company], his: [his, him, He]]


Walt Disney was born on December 5, 1901. 

Walt Disney became one of the best-known motion picture producers in the world. He is particularly noted for being a film producer and a popular showman, as well as an innovator in animation and theme park design.



Walt Disney is famous for He contributions in the field of entertainment during the 20th century. He first success was through the series, Oswald the Lucky Rabbit which was created by the 

Walt Disney studio for Charles Mintz of Universal Studios. When 

Walt Disney asked for a larger budget for He popular Oswald series, Charles Mintz refused and 

Walt Disney had to quit. Later, 

Walt Disney and Charles Mintz brother Roy O. Disney started from scratch a

In [None]:
# tokenization

def tokenize(text):
    """
    Split a text into a list of word tokens.

    Digits are deleted first (so the pieces on both sides of a digit are
    joined), then every non-word character acts as a separator. Empty
    tokens are never returned.

    text: the text to tokenize
    example: `token = tokenize(token_text)`
    """
    without_digits = re.sub(r'\d', '', text)
    # After this substitution the only separators left are plain spaces,
    # so a bare split() both collapses runs of them and drops empty pieces.
    spaced = re.sub(r'\W', ' ', without_digits)
    return spaced.split()

# token = tokenize(token_text)
# print(token)

In [4]:
# lemmatization
# NOTE(review): no lemmatization happens in this cell yet; it only previews
# the current state of the dataframe.
df.head()

Unnamed: 0,title,plot
0,"""#7DaysLater"" (2013)",#7dayslater is an interactive comedy series f...
1,"""#BlackLove"" (2015) {Crash the Party (#1.9)}","With just one week left in the workshops, the..."
2,"""#BlackLove"" (2015) {Making Lemonade Out of Le...",All of the women start making strides towards...
3,"""#BlackLove"" (2015) {Miss Independent (#1.5)}",All five of these women are independent and s...
4,"""#BlackLove"" (2015) {Sealing the Deal (#1.10)}",Despite having gone through a life changing p...


In [5]:
# pos tag
def features(sentence, index):
    """Build the feature dictionary describing one token of a sentence.

    sentence: list of words, [w1, w2, ...]
    index: position in ``sentence`` of the word to describe

    Returns a dict with the word itself, its position flags, casing flags,
    prefixes/suffixes of up to three characters, the neighbouring words
    ('' at the sentence edges) and a few shape flags.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    first_char = word[0]
    tail = word[1:]

    at_start = index == 0
    at_end = index == last_index

    return {
        'word': word,
        'is_first': at_start,
        'is_last': at_end,
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': '' if at_start else sentence[index - 1],
        'next_word': '' if at_end else sentence[index + 1],
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': tail.lower() != tail
    }

def untag(tagged_sentence):
    """Return only the words of a sentence given as (word, tag) pairs."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

# POS-tagged sentences from NLTK's treebank corpus.
tagged_sentences = nltk.corpus.treebank.tagged_sents()

# First 75% of the sentences are used for training, the remainder for testing.
cutoff = int(.75 * len(tagged_sentences))
training_sentences, test_sentences = tagged_sentences[:cutoff], tagged_sentences[cutoff:]

print(len(training_sentences))   # 2935
print(len(test_sentences))    # 979
 
def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into parallel feature/label lists.

    tagged_sentences: iterable of sentences, each a sequence of
        (word, tag) pairs.

    Returns ``(X, y)`` where ``X`` holds one ``features(...)`` dict per
    token and ``y`` holds the tag of the same token, in corpus order.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it for every token made the
        # work grow quadratically with sentence length.
        words = untag(tagged)
        for index, (_word, tag) in enumerate(tagged):
            X.append(features(words, index))
            y.append(tag)

    return X, y
 
X, y = transform_to_dataset(training_sentences)

# Only the first 10K samples are used for fitting: training on everything
# takes a fair bit, which hurts when the cell is run multiple times.
TRAIN_SAMPLE_LIMIT = 10000

clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    # A fixed random_state makes the tree's random tie-breaking between
    # equally good splits repeatable, so the reported accuracy is stable
    # across reruns.
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=0))
])

clf.fit(X[:TRAIN_SAMPLE_LIMIT], y[:TRAIN_SAMPLE_LIMIT])

print('Training completed')

# Evaluate on the held-out sentences.
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test))

2935
979
Training completed
Accuracy: 0.8949461674433553


In [6]:
# for NER: print the values of the 'plot' column
column = 'plot'
columnSeriesObj = df[column]
print(columnSeriesObj.values)

[" #7dayslater is an interactive comedy series featuring an ensemble cast of YouTube celebrities. Each week the audience writes the brief via social media for an all-new episode featuring a well-known guest-star. Seven days later that week's episode premieres on TV and across multiple platforms."
 ' With just one week left in the workshops, the women consider the idea of "The One." The ladies are stunned when Jahmil finally comes to a decision about Bentley and if Bentley\'s the one for her. Jack challenges Tennesha to express her feelings of love towards Errol, but can her put her out there and face possible rejection?'
 " All of the women start making strides towards finding  All of the women own version of a happy ending. Tennesha and Errol decide to become exclusive, but Laree just isn't ready to tell Karl Laree loves Karl, even though Karl has expressed that sentiment to Laree. Cynthia finds it hard to venture out on Cynthia own after Cynthia tumultuous separation, and Monet final

 " At the importing firm of Ventry and Bayle, John Ventry has apparently committed suicide. Mr. Bayle recalls in flashbacks how auditors found that John Ventry has been guilty of embezzling from the firm's bank accounts. But Mr. Bayle is about to learn that partnerships are not so easily dissolved."]


In [7]:
# ner
import numpy as np
from tqdm import tqdm, trange
import unicodedata
 
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, Dropout, Bidirectional
 
# Defining Constants
 
# Maximum sentence length in tokens: index sequences are padded or truncated
# to this length, and it is also used as the model's input length.
MAXLEN = 180
# Number of units of the LSTM layer
LSTM_N = 150
# Batch size passed to model.fit
BS=48

data = pd.read_csv("ner_train.csv", encoding="latin1")
test_data = pd.read_csv("ner_test.csv", encoding="latin1")

print("Number of uniques docs, sentences and words in Training set:\n",data.nunique())
print("\nNumber of uniques docs, sentences and words in Test set:\n",test_data.nunique())

# Creating a vocabulary from the words of both the training and the test set.
# pd.concat replaces Series.append, which newer pandas versions no longer provide.
all_words = pd.concat([data["Word"], test_data["Word"]])

# Each word is NFKD-normalised and reduced to ASCII bytes, so accented
# characters lose their accents (e.g. 'naïve café' becomes b'naive cafe').
# Normalising BEFORE de-duplicating keeps words that collapse to the same
# ASCII form from occupying several vocabulary slots, and sorting gives every
# word the same index on every run (plain set order is not stable).
words = sorted({
    unicodedata.normalize('NFKD', str(w)).encode('ascii', 'ignore')
    for w in all_words.values
})
# The padding token goes last, so its index is always n_words - 1.
words.append(b"ENDPAD")
n_words = len(words)
print("\nLength of vocabulary = ",n_words)

# Sorted for a run-independent tag order; key=str keeps the sort working
# even if the column holds a non-string value.
tags = sorted(set(data["tag"].values), key=str)
n_tags = len(tags)
print("\nnumber of tags = ",n_tags)

# Creating words to indices dictionary.
word2idx = {w: i for i, w in enumerate(words)}
# Creating tags to indices dictionary.
tag2idx = {t: i for i, t in enumerate(tags)}

def get_tagged_sentences(data):
    """Group the token rows of ``data`` into tagged sentences.

    Rows are grouped by the ``Sent_ID`` column. The result is a list with
    one entry per sentence, each entry being a list of ``(word, tag)``
    tuples built from the ``Word`` and ``tag`` columns.
    """
    def pair_words_with_tags(rows):
        word_list = rows["Word"].values.tolist()
        tag_list = rows["tag"].values.tolist()
        return list(zip(word_list, tag_list))

    per_sentence = data.groupby("Sent_ID").apply(pair_words_with_tags)
    return list(per_sentence)
 
def get_test_sentences(data):
    """Group the token rows of ``data`` into untagged sentences.

    Rows are grouped by the ``Sent_ID`` column. The result is a list with
    one entry per sentence, each entry being the list of that sentence's
    ``Word`` values.
    """
    def collect_words(rows):
        return rows["Word"].values.tolist()

    per_sentence = data.groupby("Sent_ID").apply(collect_words)
    return list(per_sentence)

# Getting the training sentences as a list of sentences, each one a list of
# (word, tag) tuples, and showing the first two as a sanity check.
sentences = get_tagged_sentences(data)
print("First 2 sentences in a word list format:\n",sentences[0:2])


Using TensorFlow backend.


Number of uniques docs, sentences and words in Training set:
 Sent_ID     7816
Word       10986
tag           25
dtype: int64

Number of uniques docs, sentences and words in Test set:
 Sent_ID    1953
Word       5786
tag          25
dtype: int64

Length of vocabulary =  12146

number of tags =  25
First 2 sentences in a word list format:
 [[('steve', 'B-Actor'), ('mcqueen', 'I-Actor'), ('provided', 'O'), ('a', 'O'), ('thrilling', 'B-Plot'), ('motorcycle', 'I-Plot'), ('chase', 'I-Plot'), ('in', 'I-Plot'), ('this', 'I-Plot'), ('greatest', 'B-Opinion'), ('of', 'I-Opinion'), ('all', 'I-Opinion'), ('ww', 'B-Plot'), ('2', 'I-Plot'), ('prison', 'I-Plot'), ('escape', 'I-Plot'), ('movies', 'I-Plot')], [('liza', 'B-Actor'), ('minnelli', 'I-Actor'), ('and', 'O'), ('joel', 'B-Actor'), ('gray', 'I-Actor'), ('won', 'B-Award'), ('oscars', 'I-Award'), ('for', 'O'), ('their', 'O'), ('roles', 'O'), ('in', 'O'), ('this', 'O'), ('1972', 'B-Year'), ('movie', 'O'), ('that', 'B-Plot'), ('follows', 'I-Plot'),

In [8]:
# Features: convert every word of the TRAINING sentences to its vocabulary index.
# Words are NFKD-normalised and reduced to ASCII bytes first
# (e.g. 'naïve café' to 'naive cafe') so they match the keys of word2idx.
X = [
    [
        word2idx[unicodedata.normalize('NFKD', str(word)).encode('ascii', 'ignore')]
        for word, _tag in sentence
    ]
    for sentence in sentences
]

# Bring every index sequence to exactly MAXLEN entries: longer sentences are
# truncated, shorter ones are padded at the end with n_words - 1 (a high
# index, the last slot of the vocabulary).
X = pad_sequences(maxlen=MAXLEN, sequences=X, padding="post", value=n_words - 1)

# Labels: convert every tag of the training sentences to its tag index,
# then pad to MAXLEN using the index of the "O" tag.
y = [[tag2idx[tag] for _word, tag in sentence] for sentence in sentences]
y = pad_sequences(maxlen=MAXLEN, sequences=y, padding="post", value=tag2idx["O"])

# One-hot encode the labels, as required by the categorical cross-entropy loss.
y = [to_categorical(tag_row, num_classes=n_tags) for tag_row in y]



# Input: one padded sentence, i.e. MAXLEN word indices.
# (Named input_layer so the builtin input() is no longer shadowed.)
input_layer = Input(shape=(MAXLEN,))

# Embedding layer; the embedding size is set to MAXLEN as well, so every
# word index becomes a MAXLEN-dimensional vector.
embedded = Embedding(input_dim=n_words, output_dim=MAXLEN, input_length=MAXLEN)(input_layer)

# Adding dropout layer
embedded = Dropout(0.2)(embedded)

# Bidirectional LSTM to learn from both forward as well as backward context
encoded = Bidirectional(LSTM(units=LSTM_N, return_sequences=True, recurrent_dropout=0.1))(embedded)

# TimeDistributed applies the same softmax Dense layer to each of the MAXLEN
# timesteps, giving one tag distribution per token.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(encoded)

# `model` now only ever refers to the final Model, not to intermediate tensors.
model = Model(input_layer, out)

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(X, np.array(y), batch_size=BS, epochs=1, validation_split=0.5, verbose=1)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 3908 samples, validate on 3908 samples
Epoch 1/1


In [13]:
# Sentences of the test set, grouped by sentence id.
test_sentences = get_test_sentences(test_data)

# Hand-written sentences used for a quick prediction check.
test_sentences1 = [['woody','lost','in','the','woods'], ['aliens','invading','earth']]
# test_sentences2 = [['i', 'need', 'that', 'movie', 'which', 'involves', 'aliens', 'invading', 'earth', 'in', 'a', 'particular', 'united', 'states', 'place', 'in', 'california'], ['what', 'soviet', 'science', 'fiction', 'classic', 'about', 'a', 'mysterious', 'planet', 'was', 'later', 'remade', 'by', 'steven', 'soderbergh', 'and', 'george', 'clooney']]
print("First 2 sentences in a word list format:\n",test_sentences1[0:2])

# Convert words to vocabulary indices, using the same ASCII normalisation as
# the vocabulary keys. Hand-written sentences may contain words that are not
# in the vocabulary; those are mapped to the padding index (n_words - 1)
# instead of raising a KeyError.
# NOTE(review): the vocabulary has no dedicated unknown-word token, so the
# padding index is the only available fallback — confirm this is acceptable.
X_test = [
    [
        word2idx.get(unicodedata.normalize('NFKD', str(w)).encode('ascii','ignore'), n_words - 1)
        for w in s
    ]
    for s in test_sentences1
]

# Pad at the end to MAXLEN entries with the padding index.
X_test = pad_sequences(maxlen=MAXLEN, sequences=X_test, padding="post", value=n_words - 1)

First 2 sentences in a word list format:
 [['woody', 'lost', 'in', 'the', 'woods'], ['aliens', 'invading', 'earth']]


In [14]:
# Run the trained model on the padded test sentences.
pred = model.predict(X_test)
print("Predicted Probabilities on Test Set:\n", pred.shape)

# For every token position keep the tag class with the highest probability.
pred_index = pred.argmax(axis=-1)
print("Predicted tag indices: \n", pred_index.shape)

# Flatten the word indices and the predicted tag indices into two parallel
# lists (padding positions included).
ids = X_test.flatten().tolist()
tagids = pred_index.flatten().tolist()

# Map word indices back to words (the vocabulary holds bytes, hence decode).
words_test = [words[word_id].decode('utf-8') for word_id in ids]
# Map predicted tag indices back to tag names.
tags_test = [tags[tag_id] for tag_id in tagids]

print("Length of words in Padded test set:", len(words_test))
print("Length of tags in Padded test set:", len(tags_test))
print("\nCheck few of words and predicted tags:\n", words_test[:10], tags_test[:10])


Predicted Probabilities on Test Set:
 (2, 180, 25)
Predicted tag indices: 
 (2, 180)
Length of words in Padded test set: 360
Length of tags in Padded test set: 360

Check few of words and predicted tags:
 ['woody', 'lost', 'in', 'the', 'woods', 'ENDPAD', 'ENDPAD', 'ENDPAD', 'ENDPAD', 'ENDPAD'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [11]:
# event extraction

In [12]:
# main program (learning)