In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import sys as sys

#http://www.imdb.com/interfaces/

In [67]:
from datetime import datetime as dt

def index_array(length):
    """Return the integers ``0 .. length - 1`` as a list."""
    return list(range(length))

def parse_date(date):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``.

    Parameters
    ----------
    date : str or None
        Text to parse. ``None``, an empty string and NaN are treated as
        missing.

    Returns
    -------
    datetime or None
        The parsed value, or ``None`` for a missing input.

    Raises
    ------
    ValueError
        If the text does not match ``%Y-%m-%d``.
    """
    # `date != date` is true only for NaN, which would otherwise reach
    # strptime and fail with a TypeError.
    if date is None or str(date) == '' or date != date:
        return None
    return dt.strptime(date, '%Y-%m-%d')

def parse_int(i):
    """Best-effort conversion of a raw cell value to ``int``.

    Parameters
    ----------
    i : object
        Raw value (typically a string read from a TSV file).

    Returns
    -------
    int, None, or the input unchanged
        ``None`` for missing values (``None``, ``''``, NaN or the text
        ``'NaN'``/``'nan'``); the integer when ``int(i)`` succeeds; otherwise
        the original value, so unparseable cells are kept rather than lost.
    """
    # str(float('nan')) is 'nan', so the comparison has to be case-insensitive
    # to catch real NaN values as well as the literal text 'NaN'.
    if i is None or str(i) == '' or str(i).lower() == 'nan':
        return None
    try:
        return int(i)
    except (TypeError, ValueError, OverflowError):
        # Deliberate best effort: hand the value back untouched.
        return i
    
def parse_float(f):
    """Best-effort conversion of a raw cell value to ``float``.

    Parameters
    ----------
    f : object
        Raw value (typically a string read from a TSV file).

    Returns
    -------
    float, None, or the input unchanged
        ``None`` for ``None`` or an empty string; the float when ``float(f)``
        succeeds; otherwise the original value.
    """
    if f is None or str(f) == '':
        return None
    try:
        return float(f)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; KeyboardInterrupt and
        # SystemExit now propagate instead of being caught by a bare except.
        return f
    
def parse_bool(boolean):
    """Interpret a raw cell value as a boolean flag.

    Parameters
    ----------
    boolean : object
        Raw value: a real bool, or text such as ``'True'``, ``'1'``, ``'0'``.

    Returns
    -------
    bool or None
        ``None`` for ``None`` or an empty string; ``True`` when the value's
        text form is ``'true'`` (any case) or ``'1'``; ``False`` otherwise.
    """
    if boolean is None or str(boolean) == '':
        return None
    # Compare on the text form so that the bool True, the string 'True' and
    # the '1' flag are all recognised; the previous `boolean == 'True'`
    # returned False for an actual True and for '1'.
    return str(boolean).strip().lower() in ('true', '1')
    
def split(data, delimiter=','):
    """Split a delimited cell value into a NumPy array of tokens.

    Parameters
    ----------
    data : object
        Raw value; anything that is not missing is converted with ``str()``
        before splitting.
    delimiter : str, default ','
        Separator passed to ``str.split``.

    Returns
    -------
    numpy.ndarray
        Array of string tokens, or ``np.array([None])`` for a missing value
        (``None``, an empty string or a float NaN).
    """
    is_empty_text = isinstance(data, str) and data == ''
    # A float NaN would otherwise be stringified into the bogus token 'nan'.
    is_nan = isinstance(data, float) and data != data
    if data is None or is_empty_text or is_nan:
        return np.array([None])
    return np.array(str(data).split(delimiter))
       
def lower(data):
    """Return ``str(data)`` in lower case, or None for ``''`` / ``None``."""
    is_blank = (data == '' or data == None)
    return None if is_blank else str(data).lower()
    
def upper(data):
    """Return ``str(data)`` in upper case, or None for ``''`` / ``None``."""
    if data == '' or data == None:
        return None
    text = str(data)
    return text.upper()

def replaceNaN(data):
    """Map NaN to ``None`` and return every other value unchanged.

    Parameters
    ----------
    data : object
        Scalar value to inspect.

    Returns
    -------
    object or None
        ``None`` when ``data`` is NaN, otherwise ``data`` itself.
    """
    try:
        missing = np.isnan(data)
    except TypeError:
        # np.isnan rejects non-numeric input (strings, None, ...); such a
        # value cannot be NaN, so pass it through instead of crashing.
        return data
    return None if missing else data
    

# Python does not have a switch statement, so dispatch through a dict instead:
# each key is a type name and each value is the function that parses a raw
# value of that type.
# NOTE(review): parse_float is defined above but not registered here, and
# `parser` is not referenced in the visible part of the notebook - confirm
# whether a 'float' entry is wanted.
parser = {
        'int':parse_int,
        'date':parse_date,
        'bool':parse_bool
    }

In [68]:
# Folder that holds the IMDb TSV dumps; every 'filePath' below is built from it.
LESSON_DATA_FOLDER = './data/'

# Per-file read configuration consumed by readFile(). For each file name:
#   index_col, true_values, false_values - passed straight to pd.read_table
#   dtype        - passed to pd.read_table; its keys are also the columns read
#   filePath     - location of the file
#   to_replace   - nested {column: {old: new}} dict given to DataFrame.replace;
#                  here it turns the '\N' placeholder into None
#   converters   - {column: [functions]} applied in order by callFunction
# The 'usecols' and 'split' entries are not read by readFile().
# NOTE(review): np.dtype('S') is NumPy's byte-string dtype; presumably it is
# used to keep these columns as text - confirm how read_table treats it.
fileColumnMapping = {
'title.basics.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),
                 'titleType':np.dtype('S'), 
                 'primaryTitle':np.dtype('S'),
                 'originalTitle':np.dtype('S'),
                 'isAdult':np.dtype('S'),
                 'startYear':np.dtype('S'),
                 'endYear':np.dtype('S'),             
                 'runtimeMinutes':np.dtype('S'),
                 'genres':np.dtype('S')
                    },      
      'filePath':LESSON_DATA_FOLDER + 'title.basics.tsv',
      # 'isAdult' and 'genres' have no '\N' replacement entry.
      'to_replace':{
          'titleType':{'\\N':None},
          'primaryTitle':{'\\N':None},
          'originalTitle':{'\\N':None},
          'startYear':{'\\N':None},
          'endYear':{'\\N':None},
          'runtimeMinutes':{'\\N':None}
      },
     'true_values':['1'],
     'false_values':['0'],     
     'usecols':['tconst','titleType','primaryTitle','originalTitle','isAdult','startYear','endYear','runtimeMinutes','genres'],
     'converters' : {
                 'primaryTitle':[lower],
                 'titleType':[lower],
                 'originalTitle':[lower],
                 'isAdult':[parse_bool],
                 'startYear':[parse_int] ,
                 'endYear':[parse_int]  ,
                 'runtimeMinutes':[parse_int],
                 'genres': [split]
                }
    },
'title.crew.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),'directors':np.dtype('S') ,'writers':np.dtype('S')  },
      'split' : ['directors','writers'],
      'filePath':LESSON_DATA_FOLDER + 'title.crew.tsv',
      'to_replace':{
          'directors':{'\\N':None},
          'writers':{'\\N':None},          
      },
     'true_values':None,
     'false_values':None,     
     'usecols': None,
     # Both id lists are split on ',' into arrays.
     'converters' : {                 
                 'writers':[split],
                 'directors': [split]
                }
    }, 
'title.episode.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),
                 'parentTconst':np.dtype('S'),
                 'seasonNumber':np.dtype('S'),  
                 'episodeNumber':np.dtype('S')  
                },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.episode.tsv',
      'to_replace':{
          'seasonNumber':{'\\N':None},
          'episodeNumber':{'\\N':None},                   
      },
      'true_values':None,
      'false_values':None,      
      'usecols': None,
      'converters' : {                 
                 'seasonNumber':[parse_int],
                 'episodeNumber': [parse_int]
                }
    },
'title.principals.tsv': {
      'index_col': None, 
      'dtype' : {'tconst':np.dtype('S'),
                 'ordering':np.dtype('S'),
                 'nconst':np.dtype('S'),
                 'category':np.dtype('S'),
                 'job':np.dtype('S'),                 
                 'characters':np.dtype('S'),                 
                },
      'split' : None,
      'filePath':LESSON_DATA_FOLDER + 'title.principals.tsv',
      'to_replace':{
          'job':{'\\N':None},
          'characters':{'\\N':None},                   
      },
      'true_values':None,
      'false_values':None,
      'converters' : {                 
                 'ordering':[parse_int]                 
                },
      'usecols': None      
    },    
'title.ratings.tsv': {
      'index_col': None, 
      # The only file with numeric dtypes declared up front; numVotes is
      # declared np.int32 here and also has a parse_int converter below.
      'dtype' : {'tconst':np.dtype('S'),'averageRating':np.float64 ,'numVotes':np.int32  },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.ratings.tsv',
      'to_replace':None,
      'true_values':None,
      'false_values':None,      
      'usecols': None,
      'converters' : {                 
                 'numVotes':[parse_int]                 
                }    
    },  
'name.basics.tsv': {
      # The first column read becomes the index (nconst is the first dtype key).
      'index_col': 0, 
      'dtype' : {'nconst':np.dtype('S'),
                 'primaryName':np.dtype('S') ,
                 'birthYear':np.dtype('S')  ,
                 'deathYear':np.dtype('S'),
                 'primaryProfession':np.dtype('S'),
                 'knownForTitles':np.dtype('S')
                },      
      'filePath':LESSON_DATA_FOLDER + 'name.basics.tsv',
      'to_replace':{
          'primaryProfession':{'\\N':None},
          'knownForTitles':{'\\N':None},    
          'birthYear':{'\\N':None},    
          'deathYear':{'\\N':None},       
      },
      'true_values':None,
      'false_values':None,
      'usecols': None,    
      'converters' : {                 
             'primaryName':[lower],                 
             'birthYear':[parse_int],                 
             # deathYear conversion is disabled, so that column gets no converter.
             #'deathYear':[parse_int],
             'primaryProfession' :[split],
             'knownForTitles':[split],          
            } 
    },    
}

In [69]:
import pandas as pd

def getMapping(file):
    """Look up the read configuration for *file* in ``fileColumnMapping``.

    Returns the configuration dict, or ``None`` when *file* has no entry.
    """
    mapping = fileColumnMapping.get(file)
    return mapping

def callFunction(columnData, **funDict):
    """Run a column through the converter functions registered for its name.

    Parameters
    ----------
    columnData : pandas.Series
        Column to convert; ``columnData.name`` selects the converters.
    **funDict
        Mapping of column name to ``None`` (leave the column untouched), a
        list of functions applied element-wise in order, or a single function.

    Returns
    -------
    pandas.Series
        The converted column.

    Raises
    ------
    KeyError
        If ``funDict`` has no entry for ``columnData.name``.
    """
    converters = funDict[columnData.name]
    if converters is None:
        return columnData
    if callable(converters):
        # Allow a bare function in place of a one-element list.
        converters = [converters]
    for convert in converters:
        columnData = columnData.apply(convert)
    return columnData

def readFile(file, nrows=None):
    """Read one IMDb TSV file using its entry in ``fileColumnMapping``.

    Parameters
    ----------
    file : str
        Key into ``fileColumnMapping`` (for example ``'title.crew.tsv'``).
    nrows : int, optional
        Maximum number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after forward-filling, placeholder replacement and
        per-column converters.

    Raises
    ------
    KeyError
        If ``file`` has no mapping entry.
    """
    mapping = getMapping(file)
    if mapping is None:
        # Fail with a clear message rather than "'NoneType' is not subscriptable".
        raise KeyError('No column mapping defined for file: {!r}'.format(file))

    dtype = mapping['dtype']
    df = pd.read_table(
        mapping['filePath'],
        index_col=mapping['index_col'],
        dtype=dtype,
        true_values=mapping['true_values'],
        false_values=mapping['false_values'],
        # Only the columns that have a declared dtype are read.
        usecols=list(dtype.keys()),
        nrows=nrows,
    )

    # Same forward fill as fillna(method='pad'), without the deprecated keyword.
    # NOTE(review): this copies the previous row's value into every missing
    # cell - confirm that is intended for these columns.
    df = df.ffill()

    to_replace = mapping['to_replace']
    if to_replace is not None:
        # `method` has no effect with a dict `to_replace` and is deprecated.
        df = df.replace(to_replace=to_replace)

    converters = mapping['converters']
    if converters is not None:
        cols = list(converters.keys())
        df[cols] = df[cols].apply(callFunction, **converters)

    return df


In [23]:
# Load the people table; its mapping entry uses index_col 0, so rows are
# looked up by nconst (denormalize_crew relies on this via names.loc).
names = readFile('name.basics.tsv')
names.head()

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,fred astaire,1899.0,1987.0,"[soundtrack, actor, miscellaneous]","[tt0043044, tt0050419, tt0053137, tt0072308]"
nm0000002,lauren bacall,1924.0,2014.0,"[actress, soundtrack]","[tt0117057, tt0040506, tt0038355, tt0037382]"
nm0000003,brigitte bardot,1934.0,,"[actress, soundtrack, producer]","[tt0057345, tt0059956, tt0063715, tt0049189]"
nm0000004,john belushi,1949.0,1982.0,"[actor, writer, soundtrack]","[tt0072562, tt0080455, tt0078723, tt0077975]"
nm0000005,ingmar bergman,1918.0,2007.0,"[writer, director, actor]","[tt0060827, tt0050986, tt0083922, tt0050976]"


In [24]:
# Load the title -> crew table; the 'directors' and 'writers' cells are
# converted into arrays of ids by the `split` converter.
crew = readFile('title.crew.tsv')
crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,[nm0005690],[None]
1,tt0000002,[nm0721526],[None]
2,tt0000003,[nm0721526],[None]
3,tt0000004,[nm0721526],[None]
4,tt0000005,[nm0005690],[None]


In [37]:
def denormalize_crew(df, column, names_df=None):
    """Expand an id-list column into one row per (title, person).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'tconst'`` column and an id-list column named *column*.
    column : str
        Name of the id-list column (for example ``'directors'``).
    names_df : pandas.DataFrame, optional
        Person details indexed by person id. Defaults to the notebook-level
        ``names`` frame, so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``'tconst'`` and *column* first, followed by the columns of
        ``names_df``. A title whose id list contains a missing value
        contributes a single row with *column* set to ``None``.

    Raises
    ------
    KeyError
        If an id is not present in the index of ``names_df``.
    """
    if names_df is None:
        names_df = names

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for tconst, cell in zip(df['tconst'], df[column]):
        # Normalise arrays, lists and bare scalars (including None) to a list.
        person_ids = np.atleast_1d(np.asarray(cell, dtype=object)).tolist()

        if any(pd.isna(person_id) for person_id in person_ids):
            records.append({'tconst': tconst, column: None})
            continue

        for person_id in person_ids:
            record = {'tconst': tconst, column: person_id}
            record.update(names_df.loc[[person_id]].to_dict('index')[person_id])
            # The title and person ids win over same-named columns in names_df.
            record['tconst'] = tconst
            record[column] = person_id
            records.append(record)

    if not records:
        return pd.DataFrame(columns=['tconst', column])
    return pd.DataFrame(records)

In [80]:
# Try the expansion on the first 10 titles only, then dump the result as one
# JSON record per (title, director) row.
title_directors = denormalize_crew(crew.head(10), 'directors')
print(title_directors.to_json(orient='records'))

[{"tconst":"tt0000001","directors":"nm0005690","birthYear":1860.0,"deathYear":"1935","knownForTitles":["tt0241763","tt0361921","tt1428455","tt0229235"],"primaryName":"william k.l. dickson","primaryProfession":["cinematographer","director","producer"]},{"tconst":"tt0000002","directors":"nm0721526","birthYear":1844.0,"deathYear":"1918","knownForTitles":["tt0000004","tt0000015","tt0000003","tt0000002"],"primaryName":"\u00e9mile reynaud","primaryProfession":["director"]},{"tconst":"tt0000003","directors":"nm0721526","birthYear":1844.0,"deathYear":"1918","knownForTitles":["tt0000004","tt0000015","tt0000003","tt0000002"],"primaryName":"\u00e9mile reynaud","primaryProfession":["director"]},{"tconst":"tt0000004","directors":"nm0721526","birthYear":1844.0,"deathYear":"1918","knownForTitles":["tt0000004","tt0000015","tt0000003","tt0000002"],"primaryName":"\u00e9mile reynaud","primaryProfession":["director"]},{"tconst":"tt0000005","directors":"nm0005690","birthYear":1860.0,"deathYear":"1935","kno

In [81]:
# Same expansion for writers, again limited to the first 10 titles.
title_writers = denormalize_crew(crew.head(10), 'writers')
title_writers

Unnamed: 0,tconst,writers,birthYear,deathYear,knownForTitles,primaryName,primaryProfession
0,tt0000001,,,,,,
1,tt0000002,,,,,,
2,tt0000003,,,,,,
3,tt0000004,,,,,,
4,tt0000005,,,,,,
5,tt0000006,,,,,,
6,tt0000007,,,,,,
7,tt0000008,,,,,,
8,tt0000009,nm0085156,1859.0,1940.0,[tt0000009],alexander black,"[director, writer, cinematographer]"
9,tt0000010,,,,,,


In [84]:
# Turn NaN placeholders in 'knownForTitles' into None. np.nan is used because
# the np.NaN alias was removed in NumPy 2.0, and the name is rebound rather
# than mutated in place.
title_writers = title_writers.replace(to_replace={'knownForTitles': {np.nan: None}})
# Group the director rows by title so each title's rows can be fetched by id.
directors_grouped = title_directors.groupby('tconst')

In [89]:
# Show all director rows of a single title as JSON records.
directors_grouped.get_group('tt0000007').to_json(orient='records')

'[{"tconst":"tt0000007","directors":"nm0005690","birthYear":1860.0,"deathYear":"1935","knownForTitles":["tt0241763","tt0361921","tt1428455","tt0229235"],"primaryName":"william k.l. dickson","primaryProfession":["cinematographer","director","producer"]},{"tconst":"tt0000007","directors":"nm0374658","birthYear":null,"deathYear":"1910","knownForTitles":["tt0154152","tt0229235","tt0219560","tt0361921"],"primaryName":"william heise","primaryProfession":["cinematographer","director","producer"]}]'

In [70]:
# Load the titles table (text columns lower-cased, genres split into arrays
# by the converters in its mapping entry).
titles = readFile('title.basics.tsv')
titles.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,carmencita,carmencita,False,1894.0,,1.0,"[Documentary, Short]"
1,tt0000002,short,le clown et ses chiens,le clown et ses chiens,False,1892.0,,5.0,"[Animation, Short]"
2,tt0000003,short,pauvre pierrot,pauvre pierrot,False,1892.0,,4.0,"[Animation, Comedy, Romance]"
3,tt0000004,short,un bon bock,un bon bock,False,1892.0,,,"[Animation, Short]"
4,tt0000005,short,blacksmith scene,blacksmith scene,False,1893.0,,1.0,[Short]


In [71]:
# Load the ratings table (tconst, averageRating, numVotes).
ratings = readFile('title.ratings.tsv')
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1350
1,tt0000002,6.5,157
2,tt0000003,6.6,934
3,tt0000004,6.4,93
4,tt0000005,6.2,1622


In [128]:
# Attach to each of the first 10 titles the frame holding its director rows.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and re-copied the frame on every iteration.
merged_rows = []

for index, title in titles.head(10).iterrows():
    tconst = title['tconst']

    merged_row = title.to_dict()
    # get_group raises KeyError for a title with no rows in title_directors.
    merged_row['directors'] = directors_grouped.get_group(tconst)
    merged_rows.append(merged_row)

titles_merged_directors = pd.DataFrame(merged_rows)

# Inspect the nested director records of the row labelled 6.
titles_merged_directors.loc[[6],['directors']].to_json(orient='records')

'[{"directors":[{"tconst":"tt0000007","directors":"nm0005690","birthYear":1860.0,"deathYear":"1935","knownForTitles":["tt0241763","tt0361921","tt1428455","tt0229235"],"primaryName":"william k.l. dickson","primaryProfession":["cinematographer","director","producer"]},{"tconst":"tt0000007","directors":"nm0374658","birthYear":null,"deathYear":"1910","knownForTitles":["tt0154152","tt0229235","tt0219560","tt0361921"],"primaryName":"william heise","primaryProfession":["cinematographer","director","producer"]}]}]'

In [8]:
# Join titles with their crew id lists on the shared 'tconst' key.
# NOTE(review): the saved output below also shows averageRating/numVotes, which
# this call cannot produce from the `titles` loaded above - the output looks
# stale; re-run to confirm.
titles.merge(crew, on=['tconst'])

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,tt0000001,short,carmencita,carmencita,False,1894,,1.0,"[documentary, short]",5.8,1350,[nm0005690],
1,tt0000002,short,le clown et ses chiens,le clown et ses chiens,False,1892,,5.0,"[animation, short]",6.5,157,[nm0721526],
2,tt0000003,short,pauvre pierrot,pauvre pierrot,False,1892,,4.0,"[animation, comedy, romance]",6.6,934,[nm0721526],
3,tt0000004,short,un bon bock,un bon bock,False,1892,,,"[animation, short]",6.4,93,[nm0721526],
4,tt0000005,short,blacksmith scene,blacksmith scene,False,1893,,1.0,[short],6.2,1622,[nm0005690],
5,tt0000006,short,chinese opium den,chinese opium den,False,1894,,1.0,[short],5.7,80,[nm0005690],
6,tt0000007,short,corbett and courtney before the kinetograph,corbett and courtney before the kinetograph,False,1894,,1.0,"[short, sport]",5.5,544,"[nm0005690, nm0374658]",
7,tt0000008,short,edison kinetoscopic record of a sneeze,edison kinetoscopic record of a sneeze,False,1894,,1.0,"[documentary, short]",5.6,1441,[nm0005690],
8,tt0000009,movie,miss jerry,miss jerry,False,1894,,45.0,[romance],5.4,62,[nm0085156],[nm0085156]
9,tt0000010,short,employees leaving the lumière factory,la sortie de l'usine lumière à lyon,False,1895,,1.0,"[documentary, short]",6.9,4854,[nm0525910],


In [34]:
# Scratch cell: everything is commented out except a bare string literal, so
# running it does not change any notebook state.
# NOTE(review): the saved output below presumably comes from an earlier run of
# the commented-out names.loc lookup - confirm before relying on it.
#names.info()
#del names
'''Important '''
#names.loc[['nm0000001'],['primaryName','birthYear','deathYear','primaryProfession']].to_dict('index')




{'nm0000001': {'birthYear': 1899.0,
  'deathYear': '1987',
  'primaryName': 'fred astaire',
  'primaryProfession': array(['soundtrack', 'actor', 'miscellaneous'], dtype='<U13')}}

In [25]:
# Toy example: how Series and list cells are serialised by to_json(orient='records').
s1 = pd.Series([0, 1, 2], index=['aa', 'ba', 'ca'])
s2 = pd.Series([3, 4, 5], index=['ab', 'bb', 'cb'])

nested_columns = {
    'a': [1, 2],
    'x': [s1, s2],
    'y': [['aa', 'ba', 'ca'], ['ab', 'bb', 'cb']],
}
df = pd.DataFrame(data=nested_columns)
df.iloc[0:].to_json(orient='records')


'[{"a":1,"x":[0,1,2],"y":["aa","ba","ca"]},{"a":2,"x":[3,4,5],"y":["ab","bb","cb"]}]'

In [21]:
# Empty frame; nothing in the visible part of the notebook populates it.
directors_info = pd.DataFrame()
