In [1]:
import re

import pandas as pd
import neuralcoref
import spacy

import nltk
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
 

In [2]:
# baca file dataset
df = pd.read_csv('movies_genres.csv', sep='\t', nrows=100)

# aku kepikirannya bikin model buat setiap genre, jadi labelnya dipisah
label_action = df['Action']
label_adult = df['Adult']
label_adventure = df['Adventure']
label_animation = df['Animation']
label_biography = df['Biography']
label_comedy = df['Comedy']
label_crime = df['Crime']
label_documentary = df['Documentary']
label_drama = df['Drama']
label_family = df['Family']
label_fantasy = df['Fantasy']
label_gameshow = df['Game-Show']
label_history = df['History']
label_horror = df['Horror']
label_lifestyle = df['Lifestyle']
label_music = df['Music']
label_musical = df['Musical']
label_mystery = df['Mystery']
label_news = df['News']
label_reality = df['Reality-TV']
label_romance = df['Romance']
label_scifi = df['Sci-Fi']
label_short = df['Short']
label_sport = df['Sport']
label_talkshow = df['Talk-Show']
label_thriller = df['Thriller']
label_war = df['War']
label_western = df['Western']

df = df.drop(columns=['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Game-Show', 'History', 'Horror', 'Lifestyle', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western'])
df.head()

Unnamed: 0,title,plot
0,"""#7DaysLater"" (2013)",#7dayslater is an interactive comedy series f...
1,"""#BlackLove"" (2015) {Crash the Party (#1.9)}","With just one week left in the workshops, the..."
2,"""#BlackLove"" (2015) {Making Lemonade Out of Le...",All of the women start making strides towards...
3,"""#BlackLove"" (2015) {Miss Independent (#1.5)}",All five of these women are independent and s...
4,"""#BlackLove"" (2015) {Sealing the Deal (#1.10)}",Despite having gone through a life changing p...


In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
# coreference resolution

# experiment
text = """

Walt Disney was born on December 5, 1901. Disney became one of the best-known motion picture producers in the world. He is particularly noted for being a film producer and a popular showman, as well as an innovator in animation and theme park design.

Disney is famous for his contributions in the field of entertainment during the 20th century. His first success was through the series, Oswald the Lucky Rabbit which was created by the Disney studio for Charles Mintz of Universal Studios. When Disney asked for a larger budget for his popular Oswald series, Mintz refused and Disney had to quit. Later, Disney and his brother Roy O. Disney started from scratch and co-founded Walt Disney Productions, now known as The Walt Disney Company. Today, this company has annual revenues of approximately U.S. $35 billion. This success is largely due to a number of the world's most famous fictional characters he and his staff created including Mickey Mouse, a character for which Disney himself was the original voice.

Disney won 26 Academy Awards out of 59 nominations, including a record four in one year, giving him more awards and nominations than any other individual. He is also the namesake for Disneyland and Walt Disney World Resort theme parks in the United States, as well as the international resorts in Japan, France, and China.

Disney died of lung cancer in Burbank, California, on December 15, 1966. The following year, construction began on Walt Disney World Resort in Florida. His brother Roy Disney inaugurated The Magic Kingdom on October 1, 1971.

"""

neuralcoref.add_to_pipe(nlp, greedyness=0.45, max_dist=40, max_dist_match=500)
doc = nlp(text)

print(doc._.coref_clusters)
print(doc._.coref_resolved)

# pre-processing
def coref_resolution(dataframe, nlp):
    for i in range(0, len(dataframe)):
        doc = nlp(dataframe['plot'][i])
        dataframe.at[i, 'plot'] = doc._.coref_resolved
    
coref_resolution(df, nlp)

[

Walt Disney: [

Walt Disney, Disney, Disney, Disney, Disney, Disney, Disney, Disney, Disney himself, Disney, Disney], He: [He, his, His, his], Charles Mintz: [Charles Mintz, Mintz, his], The Walt Disney Company: [The Walt Disney Company, this company], his: [his, him, He]]


Walt Disney was born on December 5, 1901. 

Walt Disney became one of the best-known motion picture producers in the world. He is particularly noted for being a film producer and a popular showman, as well as an innovator in animation and theme park design.



Walt Disney is famous for He contributions in the field of entertainment during the 20th century. He first success was through the series, Oswald the Lucky Rabbit which was created by the 

Walt Disney studio for Charles Mintz of Universal Studios. When 

Walt Disney asked for a larger budget for He popular Oswald series, Charles Mintz refused and 

Walt Disney had to quit. Later, 

Walt Disney and Charles Mintz brother Roy O. Disney started from scratch a

In [5]:
# tokenization

def tokenize(text):
    """
    text: kalimat yang bakal ditokenize
    contoh: `token = tokenize(token_text)`
    """
    text = re.sub(r'\d', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    result = text.split(' ')
    return list(filter(lambda word: word != '', result))

# token = tokenize(token_text)
# print(token)

In [6]:
# lemmatization

def lemmatization(sentence):
    """
    sentence: kalimat yang akan di-lemma, belum 
    """
    sentence = nlp(sentence)
    result = []
    for word in sentence:
        if len(word.lemma_) != 1 and word.lemma_ != "-PRON-":
            result.append(word.lemma_)
    return result


lemma_list = []         # isinya list of lemma
lemma_list_lower = []   # isinya list of lemma (all lowercase)


for idx in range(len(df)):
    text = df[['plot']].iloc[idx]['plot']
    lemma = lemmatization(text)
    lemma_list.append(lemma)
    lemma_list_lower.append([word.lower() for word in lemma])

In [7]:
# pos tag
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'prefix-4': sentence[index][:4],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'suffix-4': sentence[index][-4:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'prev_word_2': '' if index < 1 else sentence[index - 2],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'next_word_2': '' if index >= len(sentence) - 2 else sentence[index + 2],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

tagged_sentences = nltk.corpus.treebank.tagged_sents()

# Split the dataset for training and testing
cutoff = int(.75 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]
 
print(len(training_sentences))   # 2935
print(len(test_sentences))    # 979
 
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
     
    return X, y
 
X, y = transform_to_dataset(training_sentences)

print(set(y))
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier())
])
 
clf.fit(X[:10000], y[:10000])

print('Training completed')
 
X_test, y_test = transform_to_dataset(test_sentences)
 
print("Accuracy:", clf.score(X_test, y_test))

2935
979
{':', '``', 'VBZ', 'PRP', 'NNPS', '$', 'POS', 'MD', 'FW', 'NN', 'DT', 'UH', 'CD', 'IN', 'TO', 'JJR', '#', 'VB', 'VBD', 'JJ', 'NNP', '-RRB-', 'VBG', 'PDT', 'PRP$', 'EX', 'RP', 'NNS', 'CC', '-LRB-', 'LS', 'SYM', 'WDT', '.', ',', 'WP$', "''", 'RBR', 'JJS', 'VBN', 'RB', 'WRB', 'RBS', '-NONE-', 'WP', 'VBP'}
Training completed
Accuracy: 0.8954282500401736


In [8]:
import csv
# print(lemma_list_lower)
sent_id = 10000

train_data = []
for sentence in lemma_list_lower:
    print(len(sentence))
    for word in sentence:
        temp_train_data = []
        temp_train_data.append(sent_id)
        temp_train_data.append(word)
        temp_train_data.append('Annonymous')
        train_data.append(temp_train_data)
    
    sent_id += 1
# print(train_data)
new_df=pd.DataFrame(train_data,columns=['Sent_ID','Word','tag'])
new_df.tail()

47
54
76
97
48
46
43
105
48
94
50
56
65
66
50
220
58
73
50
53
53
123
109
103
74
90
89
239
48
104
46
103
98
109
68
182
105
93
116
138
87
125
74
97
83
105
106
81
96
136
101
77
94
116
97
74
112
83
105
94
87
100
78
114
190
103
79
99
89
83
95
74
100
52
142
109
92
99
115
101
68
95
104
118
64
95
100
40
66
60
67
58
46
80
44
48
49
92
75
50


Unnamed: 0,Sent_ID,Word,tag
8847,10099,be,Annonymous
8848,10099,not,Annonymous
8849,10099,so,Annonymous
8850,10099,easily,Annonymous
8851,10099,dissolve,Annonymous


In [9]:
# ner
import numpy as np
from tqdm import tqdm, trange
import unicodedata
 
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense
from keras.layers import TimeDistributed, Dropout, Bidirectional
 
# Defining Constants
 
# Maximum length of text sentences
MAXLEN = 300
# Number of LSTM units
LSTM_N = 150
# batch size
BS=48

data = pd.read_csv("ner_train.csv", encoding="latin1")
data = data.append(new_df)

test_data = pd.read_csv("ner_test.csv", encoding="latin1")

print("Number of uniques docs, sentences and words in Training set:\n",data.nunique())
print("\nNumber of uniques docs, sentences and words in Test set:\n",test_data.nunique())
 
# Creating a vocabulary
words = list(set(data["Word"].append(test_data["Word"]).values))
# for item in all_vocab:
#     words.append(item)
# # words.append('marco')
words = list(set(words))
words.append("ENDPAD")
 
# Converting greek characters to ASCII characters eg. 'naïve café' to 'naive cafe'
words = [unicodedata.normalize('NFKD', str(w)).encode('ascii','ignore') for w in words]
n_words = len(words)
print("\nLength of vocabulary = ",n_words)
 
tags = list(set(data["tag"].values))
n_tags = len(tags)
print("\nnumber of tags = ",n_tags)
 
# Creating words to indices dictionary.
word2idx = {w: i for i, w in enumerate(words)}
# Creating tags to indices dictionary.
tag2idx = {t: i for i, t in enumerate(tags)}

def get_tagged_sentences(data):
    agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(), s["tag"].values.tolist())]
    grouped = data.groupby("Sent_ID").apply(agg_func)
    sentences = [s for s in grouped]
    return sentences
 
def get_test_sentences(data):
    agg_func = lambda s: [w for w in s["Word"].values.tolist()]
    grouped = data.groupby("Sent_ID").apply(agg_func)
    sentences = [s for s in grouped]
    return sentences

# Getting training sentences in a list
sentences = get_tagged_sentences(data)
print("First 2 sentences in a word list format:\n",sentences[0:2])

data.tail()

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Number of uniques docs, sentences and words in Training set:
 Sent_ID     7916
Word       11578
tag           26
dtype: int64

Number of uniques docs, sentences and words in Test set:
 Sent_ID    1953
Word       5786
tag          25
dtype: int64

Length of vocabulary =  12698

number of tags =  26
First 2 sentences in a word list format:
 [[('steve', 'B-Actor'), ('mcqueen', 'I-Actor'), ('provided', 'O'), ('a', 'O'), ('thrilling', 'B-Plot'), ('motorcycle', 'I-Plot'), ('chase', 'I-Plot'), ('in', 'I-Plot'), ('this', 'I-Plot'), ('greatest', 'B-Opinion'), ('of', 'I-Opinion'), ('all', 'I-Opinion'), ('ww', 'B-Plot'), ('2', 'I-Plot'), ('prison', 'I-Plot'), ('escape', 'I-Plot'), ('movies', 'I-Plot')], [('liza', 'B-Actor'), ('minnelli', 'I-Actor'), ('and', 'O'), ('joel', 'B-Actor'), ('gray', 'I-Actor'), ('won', 'B-Award'), ('oscars', 'I-Award'), ('for', 'O'), ('their', 'O'), ('roles', 'O'), ('in', 'O'), ('this', 'O'), ('1972', 'B-Year'), ('movie', 'O'), ('that', 'B-Plot'), ('follows', 'I-Plot'),

Unnamed: 0,Sent_ID,Word,tag
8847,10099,be,Annonymous
8848,10099,not,Annonymous
8849,10099,so,Annonymous
8850,10099,easily,Annonymous
8851,10099,dissolve,Annonymous


In [10]:
# Converting words to indices for test sentences (Features)
# Converting greek characters to ASCII characters in train set eg. 'naïve café' to 'naive cafe'
X = [[word2idx[unicodedata.normalize('NFKD', str(w[0])).
encode('ascii','ignore')] for w in s] for s in sentences]
 
# Converting words to indices for test sentences (Features)
# Converting greek characters to ASCII characters in test-set eg. 'naïve café' to 'naive cafe'
# X_test = [[word2idx[unicodedata.normalize('NFKD', str(w)).
# encode('ascii','ignore')] for w in s] for s in test_sentences]
 
'''
Padding train and test sentences to 180 words.
Sentences of length greater than 180 words are truncated.
Sentences of length less than 180 words are padded with a high value.
'''
X = pad_sequences(maxlen=MAXLEN, sequences=X, padding="post", value=n_words - 1)
# X_test = pad_sequences(maxlen=MAXLEN, sequences=X_test, padding="post", value=n_words - 1)
 
# Converting tags to indices for test sentences (labels)
y = [[tag2idx[w[1]] for w in s] for s in sentences]
# Padding tag labels to 180 words.
y = pad_sequences(maxlen=MAXLEN, sequences=y, padding="post", value=tag2idx["O"])
 
# Making labels in one hot encoded form for DL model
y = [to_categorical(i, num_classes=n_tags) for i in y]



# 180 dimensional word indices as input
input = Input(shape=(MAXLEN,))
 
# Embedding layer of same length output (180 dim embedding will be generated)
model = Embedding(input_dim=n_words, output_dim=MAXLEN, input_length=MAXLEN)(input)
 
# Adding dropout layer
model = Dropout(0.2)(model)
 
# Bidirectional LSTM to learn from both forward as well as backward context
model = Bidirectional(LSTM(units=LSTM_N, return_sequences=True, recurrent_dropout=0.1))(model)
 
# Adding a TimeDistributedDense, to applying a Dense layer on each 180 timesteps
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model) # softmax output layer
model = Model(input, out)
 
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(X, np.array(y), batch_size=BS, epochs=2, validation_split=0.05, verbose=1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 7520 samples, validate on 396 samples
Epoch 1/2
Epoch 2/2


In [11]:
# print(test_data)
# print()
test_sentences = get_test_sentences(test_data)
# print(type(test_sentences))
test_sentences1 = lemma_list_lower
# test_sentences2 = [['i', 'need', 'that', 'movie', 'which', 'involves', 'aliens', 'invading', 'earth', 'in', 'a', 'particular', 'united', 'states', 'place', 'in', 'california'], ['what', 'soviet', 'science', 'fiction', 'classic', 'about', 'a', 'mysterious', 'planet', 'was', 'later', 'remade', 'by', 'steven', 'soderbergh', 'and', 'george', 'clooney']]
# print(len(test_sentences1))
# print(len(test_sentences2))
print("First 2 sentences in a word list format:\n",test_sentences1[0:2])

X_test = [[word2idx[unicodedata.normalize('NFKD', str(w)).
encode('ascii','ignore')] for w in s] for s in test_sentences1]

X_test = pad_sequences(maxlen=MAXLEN, sequences=X_test, padding="post", value=n_words - 1)

First 2 sentences in a word list format:
 [['7dayslater', 'be', 'an', 'interactive', 'comedy', 'series', 'feature', 'an', 'ensemble', 'cast', 'of', 'youtube', 'celebrity', 'each', 'week', 'the', 'audience', 'write', 'the', 'brief', 'via', 'social', 'medium', 'for', 'an', 'all', 'new', 'episode', 'feature', 'well', 'know', 'guest', 'star', 'seven', 'day', 'later', 'that', 'week', "'s", 'episode', 'premiere', 'on', 'tv', 'and', 'across', 'multiple', 'platform'], ['with', 'just', 'one', 'week', 'leave', 'in', 'the', 'workshop', 'the', 'woman', 'consider', 'the', 'idea', 'of', 'the', 'one', 'the', 'lady', 'be', 'stun', 'when', 'jahmil', 'finally', 'come', 'to', 'decision', 'about', 'bentley', 'and', 'if', 'bentley', "'s", 'the', 'one', 'for', 'jack', 'challenge', 'tennesha', 'to', 'express', 'feeling', 'of', 'love', 'towards', 'errol', 'but', 'can', 'put', 'out', 'there', 'and', 'face', 'possible', 'rejection']]


In [12]:
# Predicting on trained model
pred = model.predict(X_test)
print("Predicted Probabilities on Test Set:\n",pred.shape)
# taking tag class with maximum probability
pred_index = np.argmax(pred, axis=-1)
print("Predicted tag indices: \n",pred_index.shape)

# Flatten both the features and predicted tags for submission
ids,tagids = X_test.flatten().tolist(), pred_index.flatten().tolist()

# converting each word indices back to words
words_test = [words[ind].decode('utf-8') for ind in ids]
# converting each predicted tag indices back to tags
tags_test = [tags[ind] for ind in tagids]
print("Length of words in Padded test set:",len(words_test))
print("Length of tags in Padded test set:",len(tags_test))
print("\nCheck few of words and predicted tags:\n",words_test[:10],tags_test[:10])


Predicted Probabilities on Test Set:
 (100, 300, 26)
Predicted tag indices: 
 (100, 300)
Length of words in Padded test set: 30000
Length of tags in Padded test set: 30000

Check few of words and predicted tags:
 ['7dayslater', 'be', 'an', 'interactive', 'comedy', 'series', 'feature', 'an', 'ensemble', 'cast'] ['O', 'O', 'O', 'O', 'B-Actor', 'O', 'O', 'O', 'O', 'O']


In [13]:
ner_plot = []
temp_ner_plot = []
for i in range(len(words_test)-1):
    if words_test[i] != 'ENDPAD':
        temp_ner_plot.append(tags_test[i])
        if words_test[i+1] == 'ENDPAD':
            ner_plot.append(temp_ner_plot)
    if words_test[i] == 'ENDPAD':
        if words_test[i+1] != 'ENDPAD':
            temp_ner_plot = []
print(ner_plot)

[['O', 'O', 'O', 'O', 'B-Actor', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'I-Plot', 'I-Plot', 'O', 'O', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot'], ['O', 'O', 'O', 'I-Plot', 'I-Plot', 'I-Plot', 'O', 'O', 'I-Plot', 'O', 'O', 'O', 'I-Plot', 'I-Plot', 'O', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-Plot', 'I-

In [14]:
# event extraction
import json

# semantic role labeling
df_sentence = []
df_word = []
df_verb = []
df_tag = []
df_pos_bef = []
df_pos_aft = []
df_pos_tag = []
df_first = []
df_last = []
df_is_verb = []
df_is_passive = []

with open('srl_dataset.txt') as json_file:
    data = json.load(json_file)

    for line in data :
        words = line['words']
        if len(line['verbs']) > 0 :
            verb = line['verbs'][0]

            data_train = []
            tokenize_word = []
            for i in range (len(verb['tags'])) :
                if i == 0 :
                    df_first.append(True)
                else :
                    df_first.append(False)
                    
                if i == len(verb['tags']) - 1 :
                    df_last.append(True)
                else :
                    df_last.append(False)

                df_word.append(words[i])
                df_verb.append(verb['verb'])
                df_tag.append(verb['tags'][i])
                data_train.append(features(words, i))

            pos_tag = clf.predict(data_train)
            df_pos_tag.extend(pos_tag)

            is_verb = False
            is_passive = False
            for i in range (len(pos_tag)) :
                if pos_tag[i].startswith('V') :
                    is_verb = True
                    if pos_tag[i].startswith('VBN') :
                        if i != len(pos_tag) - 1 :
                            if pos_tag[i + 1].startswith('VBD') :
                                is_passive = True
                    
                df_is_verb.append(is_verb)
                
                if i != 0 :
                    df_pos_bef.append(pos_tag[i - 1])
                else :
                    df_pos_bef.append('')

                if i != len(pos_tag) - 1 :
                    df_pos_aft.append(pos_tag[i + 1])
                else :
                    df_pos_aft.append('')
                    
            df_is_passive.extend([is_passive] * len(pos_tag))

    d = {
        'word' : df_word, 
#         'verb' : df_verb, 
        'pos' : df_pos_tag,
        'pos_bef' : df_pos_bef,
        'pos_aft' : df_pos_aft,
        'first' : df_first,
        'is_verb' : df_is_verb,
        'is_passive' : df_is_passive,
#         'last' : df_last,
        'tag' : df_tag,
    }

df_srl = pd.DataFrame(data=d)

print(df_srl)

df_srl.to_csv("result.csv", index=False)


         word  pos pos_bef pos_aft  first  is_verb  is_passive         tag
0         The   DT              NN   True    False       False      B-ARG1
1       birch   NN      DT      NN  False    False       False      I-ARG1
2       canoe   NN      NN     VBD  False    False       False      I-ARG1
3        slid  VBD      NN      IN  False     True       False         B-V
4          on   IN     VBD      DT  False     True       False  B-ARGM-LOC
...       ...  ...     ...     ...    ...      ...         ...         ...
6400     bell   IN      DT       ,  False    False       False      I-ARG1
6401        ,    ,      IN      NN  False    False       False           O
6402     come   NN       ,      RB  False    False       False           O
6403  quickly   RB      NN       .  False    False       False           O
6404        .    .      RB          False    False       False           O

[6405 rows x 8 columns]


In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import json

clf2 = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

X = df_srl.iloc[:,:-1]
y = df_srl['tag']

data_train = json.loads(X.to_json(orient='records'))
clf2.fit(data_train, y)


#     print(sentences)
def srl_predict(sentences) :
    df_word = []
    df_verb = []
    df_pos_bef = []
    df_pos_aft = []
    df_pos_tag = []
    df_first = []
    df_last = []
    df_is_verb = []
    df_is_passive = []

    for sentence in sentences :
        words = tokenize(sentence)
        data_train = []
        for i in range (len(words)) :
            if i == 0 :
                df_first.append(True)
            else :
                df_first.append(False)
                    
            if i == len(verb['tags']) - 1 :
                df_last.append(True)
            else :
                df_last.append(False)

            df_word.append(words[i])
            data_train.append(features(words, i))

        pos_tag = clf.predict(data_train)
        df_pos_tag.extend(pos_tag)

        is_verb = False
        is_passive = False
        for i in range (len(pos_tag)) :
            if pos_tag[i].startswith('V') :
                is_verb = True
                if pos_tag[i].startswith('VBN') :
                    if i != len(pos_tag) - 1 :
                        if pos_tag[i + 1].startswith('VBD') :
                            is_passive = True

            df_is_verb.append(is_verb)

            if i != 0 :
                df_pos_bef.append(pos_tag[i - 1])
            else :
                df_pos_bef.append('')

            if i != len(pos_tag) - 1 :
                df_pos_aft.append(pos_tag[i + 1])
            else :
                df_pos_aft.append('')

        df_is_passive.extend([is_passive] * len(pos_tag))

    d = {
        'word' : df_word, 
        'pos' : df_pos_tag,
        'pos_bef' : df_pos_bef,
        'pos_aft' : df_pos_aft,
        'first' : df_first,
        'is_verb' : df_is_verb,
        'is_passive' : df_is_passive,
#         'last' : df_last,
    }
    
#     print(df_word)

    df = pd.DataFrame(data=d)
    data_test = json.loads(df.to_json(orient='records'))
    
    return clf2.predict(data_test)


# example
tes = ['the fish']
print(srl_predict(tes))



['B-ARG0' 'O']


In [20]:
srl_plot = []
for p in df['plot']:
    srl_plot.append(srl_predict([p]))
print(srl_plot)

[array(['B-ARG1', 'B-V', 'B-ARG2', 'I-ARG1', 'I-ARG2', 'I-ARG1', 'I-ARG1',
       'I-ARG1', 'I-ARG1', 'I-ARG2', 'I-ARG1', 'I-ARG1', 'O', 'I-ARG1',
       'O', 'I-ARG2', 'I-ARG1', 'B-V', 'B-ARG2', 'I-ARG1', 'O', 'I-ARG2',
       'I-ARG2', 'B-ARG3', 'O', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1',
       'O', 'I-ARG2', 'O', 'O', 'I-ARG1', 'I-ARG2', 'O', 'I-ARG2',
       'I-ARG1', 'I-ARG2', 'I-ARG2', 'I-ARG1', 'I-ARG2', 'B-ARGM-TMP',
       'I-ARGM-LOC', 'I-ARG2', 'O', 'I-ARG1', 'O'], dtype=object), array(['B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'B-V', 'O', 'I-ARG2', 'B-V',
       'B-ARG2', 'I-ARG1', 'B-ARGM-DIR', 'I-ARG2', 'I-ARG1', 'I-ARG1',
       'I-ARGM-LOC', 'O', 'I-ARG2', 'O', 'B-V', 'O', 'O', 'O', 'I-ARG2',
       'O', 'B-ARG2', 'I-ARG2', 'I-ARG1', 'B-ARGM-TMP', 'I-ARGM-LOC',
       'I-ARGM-MNR', 'O', 'I-ARGM-LOC', 'I-ARG0', 'B-ARG1', 'I-ARG1',
       'I-ARG2', 'O', 'I-ARG2', 'O', 'O', 'B-ARGM-PRP', 'O', 'I-ARG2',
       'B-ARG1', 'I-ARG0', 'I-ARG2', 'I-ARG1', 'I-ARG2', 'I-ARG1', 'B-V',


In [21]:
#Feature selection
for i in range(len(df['plot'])):
    text = df.at[i,'plot']
    words = tokenize(text)
    for j in range(len(word)):
        if (j < len(ner_plot) and j < len(srl_plot)):
            if ner_plot[i][j] == 'O' and srl_plot[i][j] == 'O':
                words.remove(words[j])
    df.at[i,'plot'] = ' '.join(words)
    print(len(words))
    print(len(ner_plot[i]))
    print(len(srl_plot[i]))

48
47
48
58
54
59
79
76
79
107
97
107
48
48
49
56
46
56
47
43
47
111
105
111
52
48
52
99
94
100
55
50
55
56
56
56
66
65
68
73
66
73
52
50
53
221
220
223
62
58
65
85
73
85
53
50
53
60
53
63
60
53
63
139
123
139
117
109
117
112
103
112
71
74
77
97
90
98
100
89
100
243
239
245
59
48
60
105
104
106
53
46
54
103
103
103
102
98
105
114
109
115
69
68
71
187
182
191
105
105
107
90
93
91
121
116
121
142
138
143
88
87
90
130
125
132
73
74
76
102
97
102
82
83
84
106
105
109
107
106
109
82
81
86
97
96
99
136
136
137
106
101
108
79
77
82
97
94
102
123
116
123
99
97
99
75
74
77
112
112
115
81
83
87
111
105
111
97
94
98
84
87
87
102
100
102
86
78
87
118
114
118
195
190
198
115
103
115
81
79
84
101
99
101
94
89
94
85
83
86
101
95
101
81
74
81
106
100
108
53
52
53
151
142
151
113
109
114
101
92
101
103
99
103
124
115
124
108
101
108
70
68
73
101
95
101
111
104
114
122
118
123
67
64
67
99
95
100
109
100
109
43
40
43
70
66
71
69
60
69
70
67
72
64
58
64
43
46
47
86
80
86
45
44
46
52
48
53
49
49
49
94
92
9

In [22]:
# main program (learning)
data_df = pd.read_csv("movies_genres.csv", delimiter='\t')
data_df = data_df.drop(['Lifestyle'], axis=1)
genres = list(data_df.drop(['title', 'plot'], axis=1).columns.values)

 # split the data
data_x = data_df[['plot']].as_matrix()
data_y = data_df.drop(['title', 'plot'], axis=1).as_matrix()

x_train, x_test, y_train, y_test = train_test_split(data_x,data_y,test_size=0.33, random_state=42)

# transform matrix of plots into lists to pass to a TfidfVectorizer
train_x = [x[0].strip() for x in x_train.tolist()]
test_x = [x[0].strip() for x in x_test.tolist()]

  import sys
  


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

classifier = Pipeline([
                ('tfidf', TfidfVectorizer(max_df = 0.25, ngram_range=(1,2))),
                ('clf', OneVsRestClassifier(LinearSVC(C=1, class_weight='balanced'), n_jobs=1)),
            ])

classifier.fit(train_x, y_train)
predictions = classifier.predict(test_x)
print(classification_report(y_test, predictions, target_names=genres))



              precision    recall  f1-score   support

      Action       0.80      0.69      0.74      4118
       Adult       1.00      0.37      0.54        19
   Adventure       0.78      0.63      0.70      3372
   Animation       0.84      0.77      0.80      3713
   Biography       0.82      0.12      0.22       451
      Comedy       0.80      0.77      0.79     11159
       Crime       0.79      0.79      0.79      5050
 Documentary       0.72      0.70      0.71      3967
       Drama       0.82      0.86      0.84     15196
      Family       0.79      0.69      0.74      5139
     Fantasy       0.79      0.59      0.68      2334
   Game-Show       0.83      0.75      0.79       680
     History       0.71      0.44      0.54       865
      Horror       0.79      0.31      0.44       856
       Music       0.85      0.61      0.71       941
     Musical       0.93      0.27      0.42       204
     Mystery       0.71      0.70      0.70      3967
        News       0.86    

  'precision', 'predicted', average, warn_for)
