In [9]:
import re
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from os import environ
environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell'

%matplotlib inline

import findspark
findspark.init()

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

DATA_DIR = 'data/'

# Subtitle data

In [10]:
#Read the subtitle file and transform it into a DataFrame
df = sqlContext.read.format('com.databricks.spark.xml').options(rootTag='document',rowTag='s').option("valueTag", "content").load(DATA_DIR+'6653249.xml')
df.printSchema()

root
 |-- _emphasis: boolean (nullable = true)
 |-- _id: long (nullable = true)
 |-- time: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- _value: string (nullable = true)
 |    |    |-- content: string (nullable = true)
 |-- w: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _emphasis: boolean (nullable = true)
 |    |    |-- _id: double (nullable = true)
 |    |    |-- content: string (nullable = true)



In [11]:
#We select only time and w columns, keeping only the (word) value of w
data = df.select(col('time._value').alias('time'),explode('w.content').alias('word'))
data.show()

+--------------------+--------------+
|                time|          word|
+--------------------+--------------+
|[00:01:40,634, 00...|      Terminal|
|[00:01:40,634, 00...|             A|
|[00:01:40,634, 00...|            of|
|[00:01:40,634, 00...|        Boston|
|[00:01:40,634, 00...| international|
|[00:01:40,634, 00...|       airport|
|[00:01:40,634, 00...|             .|
|[00:01:43,670, 00...|Transportation|
|[00:01:43,670, 00...|       between|
|[00:01:43,670, 00...|     terminals|
|[00:01:43,670, 00...|           ...|
|[00:01:53,612, 00...|          Take|
|[00:01:53,612, 00...|          your|
|[00:01:53,612, 00...|        laptop|
|[00:01:53,612, 00...|           out|
|[00:01:53,612, 00...|            of|
|[00:01:53,612, 00...|          your|
|[00:01:53,612, 00...|           bag|
|[00:01:53,612, 00...|             .|
|[00:02:14,264, 00...|           The|
+--------------------+--------------+
only showing top 20 rows



In [12]:
#We keep the starting time of the subtitle only
clean_data = data.withColumn('startingTime',data['time'].getItem(0)).select(col('startingTime'), col('word'))
clean_data.show()

+------------+--------------+
|startingTime|          word|
+------------+--------------+
|00:01:40,634|      Terminal|
|00:01:40,634|             A|
|00:01:40,634|            of|
|00:01:40,634|        Boston|
|00:01:40,634| international|
|00:01:40,634|       airport|
|00:01:40,634|             .|
|00:01:43,670|Transportation|
|00:01:43,670|       between|
|00:01:43,670|     terminals|
|00:01:43,670|           ...|
|00:01:53,612|          Take|
|00:01:53,612|          your|
|00:01:53,612|        laptop|
|00:01:53,612|           out|
|00:01:53,612|            of|
|00:01:53,612|          your|
|00:01:53,612|           bag|
|00:01:53,612|             .|
|00:02:14,264|           The|
+------------+--------------+
only showing top 20 rows



In [14]:
#We filter out the stop words and punctuation
#We use the stop-words list from NLTK
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import string
import nltk
from nltk.corpus import stopwords 

nltk.download('stopwords')

stopWords = set(stopwords.words('english'))
alphabet = list(string.ascii_lowercase)

#Checks whether a word is a stop-word or a sequence of non-alphabetic characters and sets them to None
def isAWord(x):
    if(len(x)==0 or x.lower() in stopWords or not any(c.isalpha() for c in x)):
        return None
    else:
        return x
    
udf_is_a_word = udf(isAWord, StringType())

newData = clean_data.select(('startingTime'),udf_is_a_word('word').alias('word'))
cleanData = newData.na.drop(subset=["word"])
cleanData.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
+------------+--------------+
|startingTime|          word|
+------------+--------------+
|00:01:40,634|      Terminal|
|00:01:40,634|        Boston|
|00:01:40,634| international|
|00:01:40,634|       airport|
|00:01:43,670|Transportation|
|00:01:43,670|     terminals|
|00:01:53,612|          Take|
|00:01:53,612|        laptop|
|00:01:53,612|           bag|
|00:02:14,264|        Boston|
|00:02:14,264|        police|
|00:02:14,264|    department|
|00:02:14,264|      requests|
|00:02:14,264|        safety|
|00:02:14,264|      security|
|00:02:14,264|           per|
|00:02:14,264|           FAA|
|00:02:14,264|   regulations|
|00:02:41,656|           'll|
|00:02:41,656|          home|
+------------+--------------+
only showing top 20 rows



### Metadata

In [15]:
from pyspark.sql.types import DateType

In [16]:
metadata = sqlContext.read.format('com.databricks.spark.xml').options(rootTag='metadata',rowTag='subtitle').load(DATA_DIR+'6653249.xml')
metadata.show()

+------+---+----------+----------+------------+--------+
|blocks|cds|confidence|      date|    duration|language|
+------+---+----------+----------+------------+--------+
|   870|1/1|       1.0|2016-06-10|01:32:10,143| English|
+------+---+----------+----------+------------+--------+



We'll only use the date of the subtitles therefore we define a method to just get this one single value as a string, which we'll later process according to our needs

In [17]:
def getDate(filename):
    metadata = sqlContext.read.format('com.databricks.spark.xml').options(rootTag='metadata',rowTag='subtitle').load(DATA_DIR+filename)
    date = metadata.select('date').head()[0]
    return date

In [18]:
getDate('6653249.xml')

'2016-06-10'

# IMDB Data 

We'll be using four of the IMDB datasets: 
    1. basics - for various metadata about the movie/tv series
    2. episode - to match the subtitles tv series episodes with the overall series in the database
    3. principals - for the character name info
    4. ratings - for the ratings and views info
All of them are described here: https://www.imdb.com/interfaces/
All of the datasets fit on our local machines, however we'll need to use the cluster when joining this data with the subtitle files

In [19]:
#Loading datasets

basics = spark.read.csv(DATA_DIR+'title.basics.tsv',sep='\t',header=True)
episode = spark.read.csv(DATA_DIR+'title.episode.tsv',sep='\t',header=True)
principals = spark.read.csv(DATA_DIR+'title.principals.tsv',sep='\t',header=True)
ratings = spark.read.csv(DATA_DIR+'title.ratings.tsv',sep='\t',header=True)

We only need the following fields from every table:
    - basics:
        - all fields withouth isAdult, originalTitle
    - episode:
        - all fields
    - principals:
        - tconst, ordering, nconst, characters
    - ratings:
        - all fields

In [20]:
basics.select('titleType').distinct().show()

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
+------------+



We're not interested in the videoGame, tvShort or short types fo we get rid of them

In [41]:
#remove the originalTitle and isAdult columns
basics_clean = basics.drop('isAdult', 'originalTitle').filter(col('titleType')!= 'short').filter(col('titleType')!= 'videoGame').filter(col('titleType')!= 'tvShort')

#remove the rows where the category of a person is not 'actor' and where the character name is empty
#remove the category and job columns, since category is always 'actor' now
principals_clean = principals.drop('category','job').filter(col('category')!='actor').filter(col('characters') != '\\N')

#remove episodes with empty series or episode numbers
episode_clean = episode.filter(col('seasonNumber')!='\\N').filter(col('episodeNumber')!='\\N')
ratings_clean = ratings

In [42]:
#Note that we leave the endYear column because it is valid for the TV Series, e.x.
basics_clean.filter(col('primaryTitle')=='Twin Peaks').show()

+---------+---------+------------+---------+-------+--------------+--------------------+
|   tconst|titleType|primaryTitle|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+------------+---------+-------+--------------+--------------------+
|tt0098936| tvSeries|  Twin Peaks|     1990|   1991|            47| Crime,Drama,Mystery|
|tt0186641|    video|  Twin Peaks|     1993|     \N|            \N|               Adult|
|tt2395641|tvEpisode|  Twin Peaks|     2006|     \N|            \N|         Documentary|
|tt2650780|tvEpisode|  Twin Peaks|     2013|     \N|            44|Adventure,Reality-TV|
|tt3225942|tvEpisode|  Twin Peaks|     2013|     \N|            43|          Reality-TV|
|tt3421958|tvEpisode|  Twin Peaks|     2010|     \N|            \N|         Drama,Short|
|tt4093826| tvSeries|  Twin Peaks|     2017|   2017|            60| Crime,Drama,Fantasy|
+---------+---------+------------+---------+-------+--------------+--------------------+



In [43]:
basics_clean.show()

+---------+---------+--------------------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+---------+-------+--------------+--------------------+
|tt0000009|    movie|          Miss Jerry|     1894|     \N|            45|             Romance|
|tt0000147|    movie|The Corbett-Fitzs...|     1897|     \N|            20|Documentary,News,...|
|tt0000335|    movie|Soldiers of the C...|     1900|     \N|            \N|     Biography,Drama|
|tt0000502|    movie|            Bohemios|     1905|     \N|           100|                  \N|
|tt0000574|    movie|The Story of the ...|     1906|     \N|            70|Biography,Crime,D...|
|tt0000615|    movie|  Robbery Under Arms|     1907|     \N|            \N|               Drama|
|tt0000630|    movie|              Hamlet|     1908|     \N|            \N|               Drama|
|tt0000675|    movie|         

In [44]:
principals_clean.show()

+---------+--------+---------+--------------------+
|   tconst|ordering|   nconst|          characters|
+---------+--------+---------+--------------------+
|tt0000001|       1|nm1588970|         ["Herself"]|
|tt0000009|       1|nm0063086|["Miss Geraldine ...|
|tt0000009|       3|nm1309758|["Himself - the D...|
|tt0000012|       1|nm2880396|         ["Herself"]|
|tt0000012|       2|nm9735580|         ["Himself"]|
|tt0000012|       3|nm0525900|         ["Herself"]|
|tt0000012|       4|nm9735581|         ["Herself"]|
|tt0000012|       7|nm9735579|         ["Herself"]|
|tt0000012|       8|nm9653419|         ["Herself"]|
|tt0000013|       1|nm0525908|         ["Himself"]|
|tt0000013|       2|nm1715062|         ["Himself"]|
|tt0000016|       1|nm0525900|["Herself (on the...|
|tt0000016|       2|nm9735581|["Herself (on the...|
|tt0000017|       2|nm3692829|        ["The girl"]|
|tt0000024|       1|nm0256651|["Herself - Empre...|
|tt0000024|       2|nm0435118|["Himself - Emper...|
|tt0000028| 

In [45]:
episode_clean.show()

+---------+------------+------------+-------------+
|   tconst|parentTconst|seasonNumber|episodeNumber|
+---------+------------+------------+-------------+
|tt0041951|   tt0041038|           1|            9|
|tt0042816|   tt0989125|           1|           17|
|tt0043426|   tt0040051|           3|           42|
|tt0043631|   tt0989125|           2|           16|
|tt0043693|   tt0989125|           2|            8|
|tt0043710|   tt0989125|           3|            3|
|tt0044093|   tt0959862|           1|            6|
|tt0044901|   tt0989125|           3|           46|
|tt0045519|   tt0989125|           4|           11|
|tt0045960|   tt0044284|           2|            3|
|tt0046135|   tt0989125|           4|            5|
|tt0046855|   tt0046643|           1|            4|
|tt0046864|   tt0989125|           5|           20|
|tt0047810|   tt0914702|           3|           36|
|tt0047852|   tt0047745|           1|           15|
|tt0047858|   tt0046637|           2|            9|
|tt0047961| 

In [39]:
ratings_clean.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.8|    1440|
|tt0000002|          6.3|     172|
|tt0000003|          6.6|    1041|
|tt0000004|          6.4|     102|
|tt0000005|          6.2|    1735|
|tt0000006|          5.5|      91|
|tt0000007|          5.5|     579|
|tt0000008|          5.6|    1539|
|tt0000009|          5.6|      74|
|tt0000010|          6.9|    5127|
|tt0000011|          5.4|     214|
|tt0000012|          7.4|    8599|
|tt0000013|          5.7|    1318|
|tt0000014|          7.2|    3739|
|tt0000015|          6.2|     660|
|tt0000016|          5.9|     982|
|tt0000017|          4.8|     197|
|tt0000018|          5.5|     414|
|tt0000019|          6.6|      13|
|tt0000020|          5.1|     232|
+---------+-------------+--------+
only showing top 20 rows



# Names, cities, words and themes

We'll be using the Datamuse API (http://www.datamuse.com/api/) to get the word frequency in order to spot well know or unusual words as well as words related to a given topic (a theme in our case). We define two functions to ease this task.

In [94]:
import requests

#returns a frequency of a word per 1 000 000 words of English text
def frequency(word):
    call = 'https://api.datamuse.com/words?sp=%s&qe=sp&md=fr'%word
    response = requests.get(call)
    data = response.json()
    return data.pop()['tags'][2]

#returns 5 words related to the word
def topic(word):
    call = 'https://api.datamuse.com/words?topics=%s'%word
    response = requests.get(call)
    data = response.json()
    result = []
    for row in data[:5]:
        result.append(row['word'])
    return result

f:289.446847
['bloom', 'blossom', 'efflorescence', 'peak', 'flush']


In [95]:
print(frequency('word'))

f:289.446847


In [96]:
print(topic('flower'))

['bloom', 'blossom', 'efflorescence', 'peak', 'flush']


As for the cities, we'll use the geotext library that will help us filter out the cities from the subitle file

In [102]:
from geotext import GeoText

def isCity(word):
    places = GeoText(word)
    return len(places.cities)>0

In [103]:
isCity("London")

True

In [104]:
isCity("asdfghj")

False

And for parsing names of the characters properly we will use the nameparser

In [106]:
from nameparser import HumanName

def firstName(inputName):
    name = HumanName(inputName)
    return name['first']

In [108]:
print(firstName("Jules David Winnifield III"))

Jules
