In [1]:
# Extend the module search path with the sibling `../koalas/` directory
# (relative to the kernel's working directory) so the local `koalas`
# package can be imported by later cells.
import sys
sys.path.append('../koalas/')

In [2]:
from koalas import DataFrame

In [3]:
def download_dataset(name):
    """Download one file from https://datasets.imdbws.com/ into the working directory.

    The payload is streamed to ``<name>.part`` and renamed to ``name`` only after
    the transfer has finished, so an interrupted download never leaves a
    truncated archive at the final path.

    Args:
        name: File name on the IMDb dataset server (for example
            ``"title.basics.tsv.gz"``); it is also used as the local path.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
        urllib.error.URLError: Propagated from ``urlopen`` on network or HTTP errors.
    """
    import os
    import shutil
    from urllib import request

    url = f'https://datasets.imdbws.com/{name}'
    partial_path = f'{name}.part'
    try:
        with request.urlopen(request.Request(url)) as response:
            if response.status != 200:
                # Fail loudly instead of returning without having written a file.
                raise RuntimeError(f'Unexpected HTTP status {response.status} for {url}')
            with open(partial_path, "wb") as f:
                # Copy in chunks rather than buffering the whole archive in memory.
                shutil.copyfileobj(response, f)
        os.replace(partial_path, name)
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def get_dataset(name):
    """Load a gzip-compressed, tab-separated file into a ``DataFrame``.

    The first line supplies the field names; every following line becomes one
    row of string values split on tab characters.

    Lines are read through a text-mode stream, which breaks only on
    ``\\n``, ``\\r`` and ``\\r\\n``. ``str.splitlines()`` would additionally split
    on characters such as ``\\x0b``, ``\\x0c``, ``\\x85`` or ``\\u2028`` and could
    therefore cut a record in two when a field contains one of them.

    Args:
        name: Path of the ``.gz`` file to read (decoded as UTF-8).

    Returns:
        A ``DataFrame`` built with ``fields=<header cells>`` and ``rows=<data rows>``.

    Raises:
        ValueError: If the file contains no header line.
        OSError: If the file cannot be opened or is not valid gzip data.
    """
    import gzip

    def split_record(line):
        # Universal-newline mode normalises every line ending to '\n'.
        return line.rstrip('\n').split('\t')

    with gzip.open(name, 'rt', encoding='utf8') as f:
        header_line = next(f, None)
        if header_line is None:
            raise ValueError(f'{name} is empty: no header line found')
        headers = split_record(header_line)
        # Stream the remaining lines instead of materialising the whole text first.
        rows = [split_record(line) for line in f]
    return DataFrame(fields=headers, rows=rows)
    

In [4]:
# Load the title.basics table. No other cell fetches the archive, so download
# it on first use; this keeps "Restart Kernel -> Run All" working on a fresh
# checkout while reusing an already-downloaded copy on later runs.
DATASET_NAME = "title.basics.tsv.gz"
try:
    df = get_dataset(DATASET_NAME)
except FileNotFoundError:
    download_dataset(DATASET_NAME)
    df = get_dataset(DATASET_NAME)

In [5]:
# Display just the first row to check the parsed header and column layout
# without rendering the whole table.
df[:1]

tconst    titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres           
------    --------- ------------ ------------- ------- --------- ------- -------------- ------           
tt0000001 short     Carmencita   Carmencita    0       1894      \N      1              Documentary,Short

In [6]:
def titles_per_type(frame):
    """Build the per-`titleType` title count table, ordered by descending count."""
    return (
        frame
        # Only the grouping key and one column to count are needed.
        .select('titleType', 'primaryTitle')
        .group('titleType')
        # `len` of each group's `primaryTitle` values becomes the `count` field.
        .apply('count', len, 'primaryTitle')
        .select('titleType', 'count')
        # Sort by count, then flip the order so the largest group comes first.
        .sort('count')
        .reverse()
    )


titles_per_type(df)

titleType    count  
---------    -----  
tvEpisode    7810007
short        957032 
movie        660071 
video        281648 
tvSeries     250750 
tvMovie      143225 
tvMiniSeries 50827  
tvSpecial    43832  
videoGame    36221  
tvShort      10039  
tvPilot      1      

In [10]:
def _average(durations):
    """Arithmetic mean of a non-empty collection of numbers."""
    return sum(durations) / len(durations)


def average_duration_by_year(frame):
    """Build the table of average runtime (minutes) per start year, latest year first.

    NOTE(review): the recorded output of this cell still lists a `\\N` year,
    which `str.isnumeric` rejects -- confirm that `filter('ValidYear', True)`
    removes rows the way this pipeline expects.
    """
    return (
        frame
        .select('startYear', 'runtimeMinutes')
        .rename('startYear', 'Year')
        .rename('runtimeMinutes', 'Duration')
        # Flag, then keep, rows whose year and runtime are purely numeric strings.
        .apply('ValidYear', str.isnumeric, 'Year')
        .filter('ValidYear', True)
        .apply('ValidDuration', str.isnumeric, 'Duration')
        .filter('ValidDuration', True)
        # Convert the runtime text to int and carry it forward under the name `Duration`.
        .apply('DurationMinutes', int, 'Duration')
        .select('Year', 'DurationMinutes')
        .rename('DurationMinutes', 'Duration')
        .group('Year')
        .apply('Average Duration', _average, 'Duration')
        .select('Year', 'Average Duration')
        # `Year` holds strings here, so this ordering is presumably lexicographic -- TODO confirm.
        .sort('Year')
        .reverse()
    )


average_duration_by_year(df)

Year Average Duration  
---- ----------------  
\N   38.68760434376388 
2029 79.0              
2028 86.0              
2027 57.0              
2026 6116.0            
2025 88.08163265306122 
2024 59.8037518037518  
2023 45.771784097201866
2022 44.70599918463774 
2021 42.56906266456029 
2020 39.77988115302417 
2019 39.841724064474896
2018 37.73977156680259 
2017 38.12672645996905 
2016 37.5088336810165  
2015 38.16544405669112 
2014 37.73515325270188 
2013 37.852185264761076
2012 39.1653910606313  
2011 40.21396760249749 
2010 40.7069697578976  
2009 42.20539664007265 
2008 44.256646610747204
2007 45.558310925206655
2006 47.564421947030645
2005 48.97963304869605 
2004 51.12196693627233 
2003 50.41621309789561 
2002 49.0319323217403  
2001 49.52461012232593 
2000 50.82041251778094 
1999 49.38884808226615 
1998 49.16050831545064 
1997 49.06465351719323 
1996 50.48432082594961 
1995 50.41390971316819 
1994 51.48293311252185 
1993 51.059949170279566
1992 52.99737325978461 
1991 52.34685054