In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import sys as sys
import dask.dataframe as dd
import dask

#http://www.imdb.com/interfaces/

In [5]:
from datetime import datetime as dt

def index_array(length):
    result = [x for x in range(length)]   
    return result

def parse_date(date):
    if str(date) == '' or date == None:
        return None
    else:
        return dt.strptime(date,'%Y-%m-%d')    

def parse_int(i, defaultValue=None):    
   # print (defaultValue)
    if ( i == None or str(i) == '' or str(i) == 'NaN' or i == np.NaN) :
        return defaultValue   
    else:
        try:
            #print ('convert', int(i))
            return int(float(i))
        except:
            return i
    
def parse_float(f):
    if str(f) == '' or f == None:
        return None
    else:
        try:
            return float(f)
        except:
            return f
    
def parse_bool(boolean):
    if str(boolean) == '' or boolean == None:
        return None
    else:        
        return boolean =='True' 
    
def split(data, delimiter=',' ):
    #print ('data: ', data)    
    if(data == '' or data == None or str(data) == None):
        return np.array([None])
    else:
        #return np.array(str(data).lower().split(delimiter))    
        lst = np.array(str(data).split(delimiter))       
        return lst    
    
def lower(data):
    if(data == '' or data == None):
        return None
    else:
        return str(data).lower() 
    
def upper(data):
    if(data == '' or data == None):
        return None
    else:
        return str(data).upper()     

def replaceNaN(data):
    if(np.isnan(data)): 
        return None
    else:
        return data
    

# Python does not have switch statment, rather use dict approach
parser = {
        'int':parse_int,
        'date':parse_date,
        'bool':parse_bool
    }

In [6]:
LESSON_DATA_FOLDER = './data/'

fileColumnMapping = {
'title.basics.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),
                 'titleType':np.dtype('S'), 
                 'primaryTitle':np.dtype('S'),
                 'originalTitle':np.dtype('S'),
                 'isAdult':np.dtype('S'),
                 'startYear':np.dtype('S'),
                 'endYear':np.dtype('S'),             
                 'runtimeMinutes':np.dtype('S'),
                 'genres':np.dtype('S')
                    },      
      'filePath':LESSON_DATA_FOLDER + 'title.basics.tsv',
      'to_replace':{
          'titleType':{'\\N':None},
          'primaryTitle':{'\\N':None},
          'originalTitle':{'\\N':None},
          'startYear':{'\\N':None},
          'endYear':{'\\N':None},
          'runtimeMinutes':{'\\N':None}
      },
     'true_values':['1'],
     'false_values':['0'],     
     'usecols':['tconst','titleType','primaryTitle','originalTitle','isAdult',
                'startYear','endYear','runtimeMinutes','genres'],
     'converters' : {
                 'primaryTitle':[{'function':lower, 'args':None}],
                 'titleType':[{'function':lower, 'args':None}],
                 'originalTitle':[{'function':lower, 'args':None}],
                 'isAdult':[{'function':parse_bool, 'args':None}],
                 'startYear':[{'function':parse_int, 'args':None}] ,
                 'endYear':[{'function':parse_int, 'args':None}]  ,
                 'runtimeMinutes':[{'function':parse_int, 'args':None}],
                 'genres': [{'function':split, 'args':(',',)}]
                 
                }
    },
'title.crew.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),'directors':np.dtype('S') ,'writers':np.dtype('S')  },
      'split' : ['directors','writers'],
      'filePath':LESSON_DATA_FOLDER + 'title.crew.tsv',
      'to_replace':{
          'directors':{'\\N':None},
          'writers':{'\\N':None},          
      },
     'true_values':None,
     'false_values':None,     
     'usecols': None,
     'converters' : {                 
                 'writers':[{'function':split, 'args':(',',)}],
                 'directors': [{'function':split, 'args':(',',)}]
                }
    }, 
'title.episode.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),
                 'parentTconst':np.dtype('S'),
                 'seasonNumber':np.dtype('S'),  
                 'episodeNumber':np.dtype('S')  
                },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.episode.tsv',
      'to_replace':{
          'seasonNumber':{'\\N':None},
          'episodeNumber':{'\\N':None},                   
      },
      'true_values':None,
      'false_values':None,      
      'usecols': None,
      'converters' : {                 
                 'seasonNumber':[{'function':parse_int, 'args':None}],
                 'episodeNumber': [{'function':parse_int, 'args':None}]
                }
    },
'title.principals.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),
                 'ordering':np.dtype('S'),
                 'nconst':np.dtype('S'),
                 'category':np.dtype('S'),
                 'job':np.dtype('S'),                 
                 'characters':np.dtype('S'),                 
                },
      'split' : None,
      'filePath':LESSON_DATA_FOLDER + 'title.principals.tsv',
      'to_replace':{
          'job':{'\\N':None},
          'characters':{'\\N':None},                   
      },
      'true_values':None,
      'false_values':None,
      'converters' : {                 
                 'ordering':[{'function':parse_int, 'args':None}]             
                },
      'usecols': None      
    },    
'title.ratings.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),'averageRating':np.float64 ,'numVotes':np.int32  },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.ratings.tsv',
      'to_replace':None,
      'true_values':None,
      'false_values':None,      
      'usecols': None,
      'converters' : {                 
                 'numVotes':[{'function':parse_int, 'args':None}]               
                }    
    },  
'name.basics.tsv': {
      'index_col': None, 
      'dtype' : {'nconst':np.dtype('S'),
                 'primaryName':np.dtype('S') ,
                 'birthYear':np.dtype('S')  ,
                 'deathYear':np.dtype('S'),
                 'primaryProfession':np.dtype('S'),
                 'knownForTitles':np.dtype('S')
                },      
      'filePath':LESSON_DATA_FOLDER + 'name.basics.tsv',
      'to_replace':{
          'primaryProfession':{'\\N':None},
          'knownForTitles':{'\\N':None},    
          'birthYear':{'\\N':None},    
          'deathYear':{'\\N':None},       
      },
      'true_values':None,
      'false_values':None,
      'usecols': None,    
      'converters' : {                 
             'primaryName':[{'function':lower, 'args':None}],                 
             'birthYear':[{'function':parse_int, 'args':(0,)}],                 
             'deathYear':[{'function':parse_int, 'args':(0,)}],
             'primaryProfession' :[{'function':split, 'args':(',',)}],
             'knownForTitles':[{'function':split, 'args':(',',)}],          
            } 
    },    
}

In [7]:
import pandas as pd

def getMapping(file):
    return fileColumnMapping.get(file)

def callFunction(columnData, **funDict):
    
    converFuns = funDict[columnData.name] #[{'function':split, 'args':(',')}]
    
    if(converFuns != None):
        for funSpec in converFuns:
            params = funSpec['args']
            #print ('args: ', params, ' <> ', params == None)
            if(params == None):                
                columnData = columnData.apply(funSpec['function'] )
            else:            
                columnData = columnData.apply(funSpec['function'], args=params )
            
    return columnData

#@dask.delayed(pure=True)
def readFile(file, nrows=None ):   
    
    mapping = getMapping(file)
    dtype = mapping['dtype']
    usecols = list(dtype.keys())

    df = pd.read_table(mapping['filePath'], 
                       index_col=mapping['index_col'], 
                       dtype = dtype, 
                       #na_values = ['//N'],
                       true_values= mapping['true_values'],
                       false_values= mapping['false_values'],                       
                       usecols=usecols,
                       nrows =nrows 
                      )
    df.fillna(method='pad', inplace=True)
    if(mapping['to_replace']!= None):
        df.replace(to_replace=mapping['to_replace'],method='pad', inplace=True)
    
    converters = mapping['converters']
    if(converters!= None):
        cols = list(converters.keys())
        df[cols] = df[cols].apply(callFunction, **converters)
        
    
    return df


In [26]:
names = readFile('name.basics.tsv')#.compute()
#names.head()
#ddf = dd.from_pandas(names, npartitions=2)

In [25]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8445648 entries, 0 to 8445647
Data columns (total 6 columns):
nconst               object
primaryName          object
birthYear            float64
deathYear            int64
primaryProfession    object
knownForTitles       object
dtypes: float64(1), int64(1), object(4)
memory usage: 386.6+ MB


In [12]:
crew = readFile('title.crew.tsv')#.compute()
crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4828274 entries, 0 to 4828273
Data columns (total 3 columns):
tconst       object
directors    object
writers      object
dtypes: object(3)
memory usage: 110.5+ MB


In [19]:
def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [24]:
#title_writers = explode(crew, ['writers'], fill_value='')
#title_writers.drop('directors', axis=1, inplace=True)
#title_writers.info()
#title_writers.rename(columns={'writers': 'nconst'}, inplace=True)
title_writers.columns

Index(['tconst', 'nconst'], dtype='object')

In [23]:
#title_directors = explode(crew, ['directors'], fill_value='')
#title_directors.drop('writers', axis=1, inplace=True)
#title_directors.info()

#title_directors.rename(columns={'directors': 'nconst'}, inplace=True)
title_directors.columns

Index(['tconst', 'nconst'], dtype='object')

In [28]:
#title_directors = title_directors.merge(names, on=['nconst'])
#title_directors.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3425684 entries, 0 to 3425683
Data columns (total 7 columns):
tconst               object
nconst               object
primaryName          object
birthYear            float64
deathYear            int64
primaryProfession    object
knownForTitles       object
dtypes: float64(1), int64(1), object(5)
memory usage: 209.1+ MB


In [29]:
title_writers = title_writers.merge(names, on=['nconst'])
title_writers.head()

Unnamed: 0,tconst,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0000009,nm0085156,alexander black,1859.0,1940,"[director, writer, cinematographer]",[tt0000009]
1,tt0000036,nm0410331,washington irving,1783.0,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"
2,tt0000076,nm0410331,washington irving,1783.0,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"
3,tt0000108,nm0410331,washington irving,1783.0,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"
4,tt0000109,nm0410331,washington irving,1783.0,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"


In [145]:
#title_writers.replace(to_replace={'knownForTitles':{np.NaN:None}},inplace=True)
#title_grouped_directors = 
def f(group):
    #print (group.count())
    
    return pd.DataFrame(group.to_dict('dict'))
    #return data.set_index('tconst')#.to_json(orient='records')
    #return data.set_index('tconst').to_dict('records')

title_grouped_directors = \
title_directors.tail(50).groupby('tconst') \
['nconst','primaryName','birthYear', 'deathYear','knownForTitles'] \
.apply(f)
#.apply(lambda data:data.to_json(orient='records'))
#.apply(lambda x:x.set_index('nconst').to_dict('dict')['primaryName'])
#title_grouped_directors.index = ['/'.join(str(i)) for i in title_grouped_directors.index]
title_grouped_directors = title_grouped_directors.reset_index()


In [146]:
title_grouped_directors
#pd.DataFrame(title_grouped_directors).to_json(orient='records')

Unnamed: 0,tconst,level_1,birthYear,deathYear,knownForTitles,nconst,primaryName
0,tt8043272,3425634,,0,[None],nm9648242,katrina mercer
1,tt8043344,3425635,,0,[tt7290998],nm7951335,ilari suhonen
2,tt8043354,3425636,,0,[None],nm9648302,sergi rodón
3,tt8043492,3425637,,0,"[tt0892769, tt0351283, tt0298148, tt0181689]",nm1185804,gaku nakatani
4,tt8043554,3425638,,0,[tt1310630],nm3279624,ane skak
5,tt8043572,3425639,,0,[None],nm9648488,luis rodriguez
6,tt8043724,3425640,,0,"[tt5914902, tt3680440]",nm6536252,jacinta owens
7,tt8043758,3425641,,0,[None],nm9648522,beppe tenti
8,tt8043758,3425642,,0,[None],nm9648521,filippo tenti
9,tt8043810,3425643,,0,"[tt4865754, tt6657132, tt2912216, tt3595744]",nm7074017,ashley davidson


In [32]:
ratings = readFile('title.ratings.tsv')
ratings.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1350
1,tt0000002,6.5,157
2,tt0000003,6.6,934
3,tt0000004,6.4,93
4,tt0000005,6.2,1622


In [37]:
titles = readFile('title.basics.tsv')
titles.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,carmencita,carmencita,False,1894.0,,1.0,"[Documentary, Short]"
1,tt0000002,short,le clown et ses chiens,le clown et ses chiens,False,1892.0,,5.0,"[Animation, Short]"
2,tt0000003,short,pauvre pierrot,pauvre pierrot,False,1892.0,,4.0,"[Animation, Comedy, Romance]"
3,tt0000004,short,un bon bock,un bon bock,False,1892.0,,,"[Animation, Short]"
4,tt0000005,short,blacksmith scene,blacksmith scene,False,1893.0,,1.0,[Short]


In [38]:
titles = titles.merge(ratings, on=['tconst'])
titles.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,carmencita,carmencita,False,1894.0,,1.0,"[Documentary, Short]",5.8,1350
1,tt0000002,short,le clown et ses chiens,le clown et ses chiens,False,1892.0,,5.0,"[Animation, Short]",6.5,157
2,tt0000003,short,pauvre pierrot,pauvre pierrot,False,1892.0,,4.0,"[Animation, Comedy, Romance]",6.6,934
3,tt0000004,short,un bon bock,un bon bock,False,1892.0,,,"[Animation, Short]",6.4,93
4,tt0000005,short,blacksmith scene,blacksmith scene,False,1893.0,,1.0,[Short],6.2,1622


In [49]:
pd.merge(titles, title_grouped_directors , on=['tconst']) 

ValueError: can not merge DataFrame with instance of type <class 'pandas.core.groupby.DataFrameGroupBy'>

In [36]:
#names.info()
#del names
'''Important '''
#names.loc[['nm0000001'],['primaryName','birthYear','deathYear','primaryProfession']].to_dict('index')


del titles

In [25]:
s1 = pd.Series({'aa':0, 'ba':1,'ca':2})
s2 = pd.Series({'ab':3, 'bb':4,'cb':5})


df = pd.DataFrame (data={
    'a':[1,2],
    'x':[s1,s2],
    'y':[['aa','ba','ca'],['ab','bb','cb']]
})
df.iloc[0:].to_json(orient='records')


'[{"a":1,"x":[0,1,2],"y":["aa","ba","ca"]},{"a":2,"x":[3,4,5],"y":["ab","bb","cb"]}]'

In [21]:
directors_info = pd.DataFrame()


In [70]:
df = pd.DataFrame([1,3.09])
df[0] = df[0].apply(np.int)
df

Unnamed: 0,0
0,1
1,3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5509644 entries, 0 to 5509643
Data columns (total 2 columns):
tconst       object
directors    object
dtypes: object(2)
memory usage: 84.1+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7792376 entries, 0 to 7792375
Data columns (total 2 columns):
tconst     object
writers    object
dtypes: object(2)
memory usage: 118.9+ MB
