# 1. Knowledge Graphs - Data Preprocessing

## 1.1. Load Data

In [None]:
import pandas as pd

In [None]:
# Mount my personal Google Drive, where the dataset is stored, so that it can be
# read directly from Colab.
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
# Read the Kaggle movie-plots CSV from the mounted Drive and report its
# (rows, columns) shape.
dataset_path = '/content/gdrive/MyDrive/Shopify Application/wiki_movie_plots_deduped.csv'
movie_plots_data = pd.read_csv(dataset_path)
print(movie_plots_data.shape)

(34886, 8)


## 1.2. Select Subset

In [None]:
# Keep only the movies whose release year is 2005 or later.
released_2005_or_later = movie_plots_data['Release Year'] >= 2005
movie_plots_data_selection = movie_plots_data[released_2005_or_later]

In [None]:
# Work only with the plot text of the selected movies.
plots = movie_plots_data_selection['Plot']

In [None]:
# Preview the first few plots of the selection.
plots.head()

14591    Andy Stitzer is a 40-year-old virgin who lives...
14592    51 Birch Street is the first-person account of...
14593    Max is a lonely child in the suburbs of Austin...
14594    In 2011, a deadly pathogenic virus has killed ...
14595    In 1974, Ronald DeFeo Jr. murdered his family ...
Name: Plot, dtype: object

## 1.3. Split Movie Plots Into Phrases

In [None]:
# Only the first MAX_PLOTS plots are processed, and fragments of
# MIN_PHRASE_CHARS characters or fewer (after trimming) are discarded.
MAX_PLOTS = 1000
MIN_PHRASE_CHARS = 3

# Split every plot on '.' and collect the trimmed fragments as phrases.
# NOTE: splitting on '.' also cuts after abbreviations such as "Sr.", so some
# phrases are sentence fragments rather than full sentences.
phrases = []
for plot in plots.head(MAX_PLOTS):
    # Strip each fragment once; the length test also rejects empty fragments.
    trimmed_fragments = (fragment.strip() for fragment in plot.split('.'))
    phrases.extend(
        fragment for fragment in trimmed_fragments
        if len(fragment) > MIN_PHRASE_CHARS
    )

In [None]:
!pip install spacy
!pip install textacy


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textacy
  Downloading textacy-0.12.0-py3-none-any.whl (208 kB)
[K     |████████████████████████████████| 208 kB 7.4 MB/s 
Collecting cytoolz>=0.10.1
  Downloading cytoolz-0.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 52.8 MB/s 
Collecting jellyfish>=0.8.0
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[K     |████████████████████████████████| 132 kB 42.9 MB/s 
[?25hCollecting pyphen>=0.10.0
  Downloading pyphen-0.13.2-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 54.4 MB/s 
Building wheels for collected packages: jellyfish
  Building wheel for jellyfish (setup.py) ... [?25l[?25hdone
  Created wheel for jellyfish: filename=jellyfish-0.9.0-cp38-cp38-linux_x86_64.whl size=70

In [None]:
from textacy.extract import subject_verb_object_triples



In [None]:
import spacy

# Load spaCy's small English pipeline; `nlp` is what turns a phrase into a
# parsed document for the triple extractor.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

In [None]:
# Sanity check: run the extractor on a one-sentence example and print every
# subject-verb-object triple it yields.
doc = nlp('They are watching a movie')
for demo_triple in subject_verb_object_triples(doc):
    print(demo_triple)

SVOTriple(subject=[They], verb=[are, watching], object=[movie])


## 1.4. Extract SVO Triples

In [None]:
from tqdm import tqdm

In [None]:
# Extract the subject-verb-object triples of every phrase. `triples_raw` stays
# index-aligned with `phrases`: entry i is the (possibly empty) list of triples
# found in phrases[i].
triples_raw = []

# nlp.pipe processes the phrases in batches, which is faster than calling
# nlp(p) once per phrase, and it yields the parsed documents in input order.
# tqdm cannot infer the length of a generator, so the total is passed explicitly.
for phrase_doc in tqdm(nlp.pipe(phrases), total=len(phrases)):
    triples_raw.append(list(subject_verb_object_triples(phrase_doc)))

100%|██████████| 28127/28127 [04:55<00:00, 95.20it/s]


## 1.5. Lemmatize & Stem SVO Triples

In [None]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus.reader.wordnet import VERB

In [None]:
# Shared WordNet lemmatizer used when normalizing the extracted triples.
lemmatizer = WordNetLemmatizer()

In [None]:
# Accumulators for the normalized triples: one list for the lemmatized form and
# one for the lemmatized + stemmed form.
lemmatized_triples = []
lemmatized_stemmed_triples = []

In [None]:
# English Snowball stemmer, applied after lemmatization in lemmatize_stem.
stemmer = SnowballStemmer('english')

In [None]:
def lemmatize_stem(text, pos='n'):
    """Lemmatize a single word with WordNet, then stem the lemma.

    Args:
        text: The word to normalize.
        pos: WordNet part-of-speech tag passed to the lemmatizer. Defaults to
            'n' (noun), which is also the lemmatizer's own default, so existing
            one-argument calls behave as before. Pass VERB to lemmatize verbs.

    Returns:
        The Snowball stem of the lemmatized word.
    """
    # Reuse the shared module-level lemmatizer instead of building a new
    # WordNetLemmatizer on every call.
    return stemmer.stem(lemmatizer.lemmatize(text, pos))

In [None]:
# Counters for the detection-rate statistic: the number of phrases seen and the
# number of phrases in which at least one triple was found.
phrase_counter = 0
found_triples_counter = 0

In [None]:
import nltk
# Download the 'wordnet' and 'omw-1.4' NLTK data packages
# (presumably both are needed by the WordNet lemmatizer — confirm for the installed NLTK version).
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Set to True to print a per-phrase trace of the raw and normalized triples.
# The trace is several lines per phrase, so it is off by default.
VERBOSE = False

# (Re)initialise the accumulators in this cell so that re-running it does not
# append to, or count on top of, the results of a previous run.
lemmatized_triples = []
lemmatized_stemmed_triples = []
phrase_counter = 0
found_triples_counter = 0


def _lemmatize_tokens(tokens):
    """Lemmatize each token of a subject/object span and join them with spaces."""
    return ' '.join(lemmatizer.lemmatize(str(token).lower()) for token in tokens)


def _lemmatize_stem_tokens(tokens):
    """Lemmatize and stem each token of a subject/object span, joined with spaces."""
    return ' '.join(lemmatize_stem(str(token).lower()) for token in tokens)


for (phrase, phrase_triples) in zip(phrases, triples_raw):
    if VERBOSE:
        print('phrase triples:', phrase_triples)
    if len(phrase) > 0:
        phrase_counter += 1
    if phrase_triples:
        found_triples_counter += 1
        for triple in phrase_triples:
            if VERBOSE:
                print('raw triples:', triple)

            # A triple is (subject, verb, object), each a list of tokens.
            # Normalize token by token: calling str() on the whole list yields
            # text such as "[parents, Roscoe, Sr]", which the lemmatizer and
            # stemmer cannot match as a word.
            # lemmatize
            s = _lemmatize_tokens(triple[0])
            o = _lemmatize_tokens(triple[2])
            v = lemmatizer.lemmatize(str(triple[1][-1]).lower(), VERB)  # take last token in verb expression

            # lemmatize & stem
            lss = _lemmatize_stem_tokens(triple[0])
            lso = _lemmatize_stem_tokens(triple[2])
            # Stem the verb lemma computed above so that both outputs treat the
            # verb as a verb (e.g. "is" -> "be") instead of as a noun.
            lsv = stemmer.stem(v)

            if VERBOSE:
                print('lemmatized: s:', s, ', o:', o, ', v:', v)
                print('lemmatized&stemmed: s:', lss, ', o:', lso, ', v:', lsv)

            # Everything was lower-cased before normalization.
            lemmatized_triples.append([s, v, o])
            lemmatized_stemmed_triples.append([lss, lsv, lso])
    else:
        if VERBOSE:
            print('Could not detect triples in phrase:', phrase)
        # Keep an empty placeholder for phrases without any triple.
        lemmatized_triples.append([])
        lemmatized_stemmed_triples.append([])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
phrase triples: [SVOTriple(subject=[Roscoe], verb=[is, greeted], object=[parents, Roscoe, Sr])]
raw triples: SVOTriple(subject=[Roscoe], verb=[is, greeted], object=[parents, Roscoe, Sr])
lemmatized: s: [Roscoe] , o: [parents, Roscoe, Sr] , v: greet
lemmatized&stemmed: s: [roscoe] , o: [parents, roscoe, sr] , v: greet
phrase triples: []
Could not detect triples in phrase: (James Earl Jones) and Mama Jenkins (Margaret Avery); his brother, Otis (Michael Clarke Duncan), the town sheriff; Otis' wife, Ruthie (Liz Mikel) and their overgrown kids, Junior (Brandin Jenkins) and Callie (Krystal Marea Braud); and Roscoe's loud, rowdy sister, Betty (Mo'Nique)
phrase triples: [SVOTriple(subject=[he], verb=[is, escorting], object=[Lucinda])]
raw triples: SVOTriple(subject=[he], verb=[is, escorting], object=[Lucinda])
lemmatized: s: [he] , o: [Lucinda] , v: escort
lemmatized&stemmed: s: [he] , o: [lucinda] , v: escort
phrase triples: [SV

In [None]:
# Share of phrases in which at least one SVO triple was detected.
# Report 0.0 when no phrase was counted, to avoid a ZeroDivisionError.
triples_detection_rate = (found_triples_counter / phrase_counter) if phrase_counter else 0.0
print('Triples Detection Rate:', triples_detection_rate)