In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import sys as sys

#http://www.imdb.com/interfaces/

In [None]:
from datetime import datetime as dt


def parse_date(date):
    if date == '' or date == None:
        return None
    else:
        return dt.strptime(date,'%Y-%m-%d')
    

def parse_int(i):
    if i == '' or i == None:
        return None
    else:
        return int(i)
    
def parse_float(f):
    if f == '' or f == None:
        return None
    else:
        return float(f)
    
def parse_bool(boolean, dic):
    if boolean == '' or boolean == None:
        return None
    else:
        return boolean =='True' 

# Python does not have switch statment, rather use dict approach
parser = {
        'int':parse_int,
        'date':parse_date,
        'bool':parse_bool
    }

In [2]:
LESSON_DATA_FOLDER = './data/'

fileColumnMapping = {
'title.basics.tsv': {
      'index_col': 0, 
      'dtype' : {'tconst':np.dtype('S'),
                 'titleType':np.dtype('S'), 
                 'primaryTitle':np.dtype('S'),
                 'originalTitle':np.dtype('S'),
                 'isAdult':np.dtype('?'),
                 'startYear':np.dtype('S'),
                 'endYear':np.dtype('S'),             
                 'runtimeMinutes':np.dtype('S'),
                 'genres':np.dtype('S')
                    },
      'split' : ['genres'],
      'filePath':LESSON_DATA_FOLDER + 'title.basics.tsv',
      'to_replace':{
          'titleType':{'\\N':None},
          'primaryTitle':{'\\N':None},
          'originalTitle':{'\\N':None},
          'startYear':{'\\N':None},
          'endYear':{'\\N':None},
          'runtimeMinutes':{'\\N':None}
      },
     'true_values':None,#[1],
     'false_values':None,#[0],
     'converters': None,
     'usecols':['tconst','titleType','primaryTitle','originalTitle','isAdult','startYear','endYear','runtimeMinutes','genres']
    },
'title.crew.tsv': {
      'index_col': 0, 
      'dtype' : {'tconst':np.dtype('S'),'directors':np.dtype('S') ,'writers':np.dtype('S')  },
      'split' : ['directors','writers'],
      'filePath':LESSON_DATA_FOLDER + 'title.crew.tsv',
      'to_replace':{
          'directors':{'\\N':None},
          'writers':{'\\N':None},          
      },
     'true_values':None,
     'false_values':None,
     'converters': None,
     'usecols': None
    }, 
'title.episode.tsv': {
      'index_col': 0, 
      'dtype' : {'tconst':np.dtype('S'),
                 'parentTconst':np.dtype('S'),
                 'seasonNumber':np.int32,  
                 'episodeNumber':np.int32  
                },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.episode.tsv',
      'to_replace':None,
      'true_values':None,
      'false_values':None,
      'converters':None,
      'usecols': None      
    },
'title.principals.tsv': {
      'index_col': 0, 
      'dtype' : {'tconst':np.dtype('S'),
                 'principalCast':np.dtype('S'),                 
                },
      'split' : ['principalCast'],
      'filePath':LESSON_DATA_FOLDER + 'title.principals.tsv',
      'to_replace':None,
      'true_values':None,
      'false_values':None,
      'converters':None,
      'usecols': None      
    },    
'title.ratings.tsv': {
      'index_col': 0, 
      'dtype' : {'tconst':np.dtype('S'),'averageRating':np.float64 ,'numVotes':np.int32  },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.ratings.tsv',
      'to_replace':None,
      'true_values':None,
      'false_values':None,
      'converters':None,
      'usecols': None      
    },  
'name.basics.tsv': {
      'index_col': 0, 
      'dtype' : {'nconst':np.dtype('S'),
                 'primaryName':np.dtype('S') ,
                 'birthYear':np.dtype('S')  ,
                 'deathYear':np.dtype('S'),
                 'primaryProfession':np.dtype('S'),
                 'knownForTitles':np.dtype('S')
                },
      'split' :['primaryProfession','knownForTitles'],
      'filePath':LESSON_DATA_FOLDER + 'name.basics.tsv',
      'to_replace':{
          'primaryProfession':{'\\N':None},
          'knownForTitles':{'\\N':None},     
          'deathYear':{'\\N':None},       
      },
      'true_values':None,
      'false_values':None,
      'converters':None,
      'usecols': None      
    },    
}

In [3]:
import pandas as pd
def getMapping(file):
    return fileColumnMapping.get(file)

def readFile(file, nrows=None ):   
    
    def split(columnData):
        if(columnData == '' or columnData == None):
            return None
        else:
            return str(columnData).split(',')    
            
    
    mapping = getMapping(file)
    dtype = mapping['dtype']
    usecols = list(dtype.keys())

    df = pd.read_table(mapping['filePath'], 
                       index_col=mapping['index_col'], 
                       dtype = dtype, 
                       na_values = ['//N'],
                       true_values= mapping['true_values'],
                       false_values= mapping['false_values'],                       
                       usecols=usecols,
                       nrows =nrows 
                      )
    
    if(mapping['to_replace']!= None):
        df.replace(to_replace=mapping['to_replace'],method='pad', inplace=True)
    
    if(mapping['split']!= None):
        df[mapping['split']] = df[mapping['split']].applymap(split)
        
    return df


In [4]:
ratings = readFile('name.basics.tsv', 1000)
ratings.head()

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,Fred Astaire,1899,1987.0,"[soundtrack, actor, miscellaneous]","[tt0050419, tt0072308, tt0043044, tt0053137]"
nm0000002,Lauren Bacall,1924,2014.0,"[actress, soundtrack]","[tt0117057, tt0037382, tt0040506, tt0038355]"
nm0000003,Brigitte Bardot,1934,,"[actress, soundtrack, producer]","[tt0049189, tt0063715, tt0059956, tt0057345]"
nm0000004,John Belushi,1949,1982.0,"[actor, writer, soundtrack]","[tt0072562, tt0080455, tt0078723, tt0077975]"
nm0000005,Ingmar Bergman,1918,2007.0,"[writer, director, actor]","[tt0050986, tt0050976, tt0083922, tt0060827]"


In [28]:
ratings.describe()

Unnamed: 0,averageRating,numVotes
count,807548.0,807548.0
mean,6.934575,991.4049
std,1.390154,15311.09
min,1.0,5.0
25%,6.2,9.0
50%,7.1,20.0
75%,7.9,78.0
max,10.0,1916854.0


In [25]:
crew = readFile('title.crew.tsv', 10000)
crew.head(50)


Unnamed: 0_level_0,directors,writers
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,[nm0005690],
tt0000002,[nm0721526],
tt0000003,[nm0721526],
tt0000004,[nm0721526],
tt0000005,[nm0005690],
tt0000006,[nm0005690],
tt0000007,"[nm0005690, nm0374658]",
tt0000008,[nm0005690],
tt0000009,[nm0085156],[nm0085156]
tt0000010,[nm0525910],


In [26]:

basics = readFile('title.basics.tsv', 10)
basics.head()


Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0000001,short,Carmencita,Carmencita,False,1894,,1.0,"[Documentary, Short]"
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,False,1892,,5.0,"[Animation, Short]"
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,False,1892,,4.0,"[Animation, Comedy, Romance]"
tt0000004,short,Un bon bock,Un bon bock,False,1892,,,"[Animation, Short]"
tt0000005,short,Blacksmith Scene,Blacksmith Scene,False,1893,,1.0,[Short]


In [10]:
data = {'tconst':np.dtype('S'),
                 'titleType':np.dtype('S'), 
                 'primaryTitle':np.dtype('S'),
                 'originalTitle':np.dtype('S'),
                 'isAdult ':np.dtype('?'),
                 'startYear':np.dtype('S'),
                 'endYear':np.dtype('S'),             
                 'runtimeMinutes':np.dtype('S'),
                 'genres':np.dtype('S')
                    }

In [16]:
arr = pd.Series(data)
#arr.keys().values
list(data.keys())

['tconst',
 'titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult ',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0000001,short,Carmencita,Carmencita,0,1894,,1,"[Documentary, Short]"
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"[Animation, Short]"
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"[Animation, Comedy, Romance]"
tt0000004,short,Un bon bock,Un bon bock,0,1892,,\N,"[Animation, Short]"
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,[Short]
