In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import sys as sys
import datetime as dt
import time

import ipyparallel

import getpass
import os

# SECURITY: the SSH password used to be hard-coded in this cell. Anything that
# was committed or shared with it must be treated as leaked and rotated.
# The server and password now come from the environment, with an interactive
# prompt as the fallback, so no secret is stored in the notebook.
_ssh_server = os.environ.get('IPP_SSH_SERVER', 'paperspace@172.83.14.24')
_ssh_password = os.environ.get('IPP_SSH_PASSWORD') or getpass.getpass(
    'SSH password for %s: ' % _ssh_server
)

# Connect to the ipyparallel controller through the SSH tunnel.
cluster = ipyparallel.Client(
    profile='parallel-profile',
    sshserver=_ssh_server,
    password=_ssh_password,
)
print('profile:', cluster.profile)
print("IDs:", cluster.ids) # Print the engine ids that answered
#print(cluster)


#http://www.imdb.com/interfaces/

profile: parallel-profile
IDs: [0, 1, 2, 3]


In [11]:
# View over every engine of the connected cluster (slice of the Client).
dview = cluster[:]
# Run the imports inside sync_imports so they are performed on the engines as
# well (the cell output reports "importing ... on engine(s)" for each module).
# The modules are imported under their full names here, not the pd/np aliases.
with dview.sync_imports():
    import pandas
    import numpy
    import datetime
    import time

importing pandas on engine(s)
importing numpy on engine(s)
importing datetime on engine(s)
importing time on engine(s)


In [6]:
from datetime import datetime as dt

def index_array(length):
    """Return the list [0, 1, ..., length - 1]."""
    return list(range(length))

def parse_date(date):
    """Parse a 'YYYY-MM-DD' string into a datetime.

    Returns None when `date` is None or its string form is empty; any other
    value is handed to strptime with the '%Y-%m-%d' format.
    """
    is_missing = date is None or str(date) == ''
    return None if is_missing else dt.strptime(date, '%Y-%m-%d')

def parse_int(i, defaultValue=None):
    """Convert `i` to an int, going through float so '3.0' style text works.

    Parameters
    ----------
    i : value to convert (string, number, None, NaN, ...).
    defaultValue : returned when `i` is missing, i.e. None, an empty string
        or NaN (in any spelling of its string form).

    Returns
    -------
    int for convertible input, `defaultValue` for missing input, and the
    original value unchanged when it cannot be converted (best effort, e.g.
    a placeholder string that was not replaced beforehand).
    """
    if i is None:
        return defaultValue

    text = str(i)
    # NaN never compares equal to itself, and str(float('nan')) is 'nan', so
    # the missing-value check has to be done on the lower-cased text.
    if text == '' or text.lower() == 'nan':
        return defaultValue

    try:
        return int(float(i))
    except (TypeError, ValueError, OverflowError):
        # Not numeric (or infinite): hand the value back untouched.
        return i
    
def parse_float(f):
    """Convert `f` to a float.

    Returns None when `f` is None or its string form is empty, the float
    value when the conversion succeeds, and `f` itself unchanged when it is
    not convertible (best effort).
    """
    if f is None or str(f) == '':
        return None
    try:
        return float(f)
    except (TypeError, ValueError):
        # Only conversion failures are tolerated; KeyboardInterrupt and other
        # unrelated exceptions must still propagate.
        return f
    
def parse_bool(boolean):
    """Interpret a flag value as a bool.

    Returns None when `boolean` is None or its string form is empty.
    Otherwise returns True when the (stripped, case-insensitive) string form
    is 'true' or '1', and False for everything else.

    The previous version only accepted the exact text 'True', so 0/1 flags
    read as strings (and real bool values) always came out as False.
    """
    if boolean is None or str(boolean) == '':
        return None
    return str(boolean).strip().lower() in ('true', '1')
    
def split(data, delimiter=','):
    """Split the string form of `data` on `delimiter` into a numpy array.

    An empty string or None yields a one-element array holding None, so the
    result always has at least one entry.
    """
    if data is None or data == '':
        return np.array([None])
    pieces = str(data).split(delimiter)
    return np.array(pieces)
    
def lower(data):
    """Return the lower-cased string form of `data`, or None for ''/None."""
    if data is None or data == '':
        return None
    return str(data).lower()
    
def upper(data):
    """Return the upper-cased string form of `data`, or None for ''/None."""
    if data is None or data == '':
        return None
    return str(data).upper()

def replaceNaN(data):
    """Return None when `data` is NaN, otherwise return `data` unchanged.

    np.isnan raises TypeError for non-numeric input such as strings or None;
    those values are by definition not NaN, so they are passed through
    instead of crashing the caller.
    """
    try:
        is_nan = bool(np.isnan(data))
    except TypeError:
        is_nan = False
    return None if is_nan else data
    

# Python does not have a switch statement, so use a dict as a dispatch table:
# it maps a type label to the parser function defined above.
# NOTE(review): nothing in the visible code looks anything up in `parser`,
# and parse_float is not registered - confirm whether this table is still used.
parser = {
        'int':parse_int,
        'date':parse_date,
        'bool':parse_bool
    }

In [7]:
# Folder holding the TSV dumps; every 'filePath' below is built from it.
LESSON_DATA_FOLDER = './data/'

# Per-file loading configuration, keyed by TSV file name and looked up through
# getMapping(). readFile() consumes these keys of each entry:
#   'filePath'      - path handed to pd.read_table
#   'index_col'     - forwarded to pd.read_table
#   'dtype'         - forwarded to pd.read_table; its keys are also used as usecols
#   'true_values' / 'false_values' - forwarded to pd.read_table
#   'to_replace'    - nested {column: {old: new}} dict handed to DataFrame.replace
#                     (here: the backslash-N placeholder -> None), or None to skip
#   'converters'    - {column: [{'function': f, 'args': tuple-or-None}, ...]},
#                     applied in list order by callFunction()
# NOTE(review): the 'usecols' and 'split' keys are stored here but nothing in
# the visible code reads them (readFile derives usecols from the 'dtype' keys);
# 'split' is also missing from some entries - confirm whether they are needed.
fileColumnMapping = {
'title.basics.tsv': {
      'index_col': None,
      'dtype' : {'tconst':np.dtype('S'),
                 'titleType':np.dtype('S'),
                 'primaryTitle':np.dtype('S'),
                 'originalTitle':np.dtype('S'),
                 'isAdult':np.dtype('S'),
                 'startYear':np.dtype('S'),
                 'endYear':np.dtype('S'),
                 'runtimeMinutes':np.dtype('S'),
                 'genres':np.dtype('S')
                    },
      'filePath':LESSON_DATA_FOLDER + 'title.basics.tsv',
      'to_replace':{
          'titleType':{'\\N':None},
          'primaryTitle':{'\\N':None},
          'originalTitle':{'\\N':None},
          'startYear':{'\\N':None},
          'endYear':{'\\N':None},
          'runtimeMinutes':{'\\N':None}
      },
     # isAdult is flagged with '1' / '0' in this file.
     'true_values':['1'],
     'false_values':['0'],
     'usecols':['tconst','titleType','primaryTitle','originalTitle','isAdult',
                'startYear','endYear','runtimeMinutes','genres'],
     'converters' : {
                 'primaryTitle':[{'function':lower, 'args':None}],
                 'titleType':[{'function':lower, 'args':None}],
                 'originalTitle':[{'function':lower, 'args':None}],
                 'isAdult':[{'function':parse_bool, 'args':None}],
                 'startYear':[{'function':parse_int, 'args':None}] ,
                 'endYear':[{'function':parse_int, 'args':None}]  ,
                 'runtimeMinutes':[{'function':parse_int, 'args':None}],
                 # genres is a comma-separated list in one cell
                 'genres': [{'function':split, 'args':(',',)}]

                }
    },
'title.crew.tsv': {
      'index_col': None,
      'dtype' : {'tconst':np.dtype('S'),'directors':np.dtype('S') ,'writers':np.dtype('S')  },
      'split' : ['directors','writers'],
      'filePath':LESSON_DATA_FOLDER + 'title.crew.tsv',
      'to_replace':{
          'directors':{'\\N':None},
          'writers':{'\\N':None},
      },
     'true_values':None,
     'false_values':None,
     'usecols': None,
     'converters' : {
                 # both columns hold comma-separated id lists
                 'writers':[{'function':split, 'args':(',',)}],
                 'directors': [{'function':split, 'args':(',',)}]
                }
    },
'title.episode.tsv': {
      'index_col': None,
      'dtype' : {'tconst':np.dtype('S'),
                 'parentTconst':np.dtype('S'),
                 'seasonNumber':np.dtype('S'),
                 'episodeNumber':np.dtype('S')
                },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.episode.tsv',
      'to_replace':{
          'seasonNumber':{'\\N':None},
          'episodeNumber':{'\\N':None},
      },
      'true_values':None,
      'false_values':None,
      'usecols': None,
      'converters' : {
                 'seasonNumber':[{'function':parse_int, 'args':None}],
                 'episodeNumber': [{'function':parse_int, 'args':None}]
                }
    },
'title.principals.tsv': {
      'index_col': None,
      'dtype' : {'tconst':np.dtype('S'),
                 'ordering':np.dtype('S'),
                 'nconst':np.dtype('S'),
                 'category':np.dtype('S'),
                 'job':np.dtype('S'),
                 'characters':np.dtype('S'),
                },
      'split' : None,
      'filePath':LESSON_DATA_FOLDER + 'title.principals.tsv',
      'to_replace':{
          'job':{'\\N':None},
          'characters':{'\\N':None},
      },
      'true_values':None,
      'false_values':None,
      'converters' : {
                 'ordering':[{'function':parse_int, 'args':None}]
                },
      'usecols': None
    },
'title.ratings.tsv': {
      'index_col': None,
      # The only entry that asks read_table for numeric dtypes directly;
      # numVotes is additionally run through parse_int below.
      'dtype' : {'tconst':np.dtype('S'),'averageRating':np.float64 ,'numVotes':np.int32  },
      'split' :None,
      'filePath':LESSON_DATA_FOLDER + 'title.ratings.tsv',
      'to_replace':None,
      'true_values':None,
      'false_values':None,
      'usecols': None,
      'converters' : {
                 'numVotes':[{'function':parse_int, 'args':None}]
                }
    },
'name.basics.tsv': {
      'index_col': None,
      'dtype' : {'nconst':np.dtype('S'),
                 'primaryName':np.dtype('S') ,
                 'birthYear':np.dtype('S')  ,
                 'deathYear':np.dtype('S'),
                 'primaryProfession':np.dtype('S'),
                 'knownForTitles':np.dtype('S')
                },
      'filePath':LESSON_DATA_FOLDER + 'name.basics.tsv',
      'to_replace':{
          'primaryProfession':{'\\N':None},
          'knownForTitles':{'\\N':None},
          'birthYear':{'\\N':None},
          'deathYear':{'\\N':None},
      },
      'true_values':None,
      'false_values':None,
      'usecols': None,
      'converters' : {
             'primaryName':[{'function':lower, 'args':None}],
             # (0,) is passed to parse_int as defaultValue, so missing years become 0
             'birthYear':[{'function':parse_int, 'args':(0,)}],
             'deathYear':[{'function':parse_int, 'args':(0,)}],
             'primaryProfession' :[{'function':split, 'args':(',',)}],
             'knownForTitles':[{'function':split, 'args':(',',)}],
            }
    },
}

In [9]:
import pandas as pd

def getMapping(file):
    """Return the settings dict stored in fileColumnMapping for `file`.

    Gives None when no entry exists for that file name.
    """
    if file in fileColumnMapping:
        return fileColumnMapping[file]
    return None

def callFunction(columnData, **funDict):
    """Run the configured converter chain over one column.

    Parameters
    ----------
    columnData : pandas Series; its `name` selects the chain in `funDict`.
    **funDict : column name -> list of {'function': callable, 'args': tuple
        or None} specs (or None for "no conversion").

    Returns
    -------
    The Series after each spec has been applied element-wise, in list order.
    """
    steps = funDict[columnData.name]
    if steps is None:
        return columnData

    converted = columnData
    for step in steps:
        extra_args = step['args']
        # Only forward `args` when the spec actually carries extra arguments.
        apply_kwargs = {} if extra_args is None else {'args': extra_args}
        converted = converted.apply(step['function'], **apply_kwargs)
    return converted

def readFile(file, nrows=None ):   
    """Load one configured TSV file into a DataFrame and apply its cleanup rules.

    Steps, in order: look up the settings for `file` via getMapping, read the
    file with pd.read_table, scatter the frame to the engines of the global
    `cluster` and concatenate the pieces pulled back from them, forward-fill
    missing values, apply the optional 'to_replace' mapping, then run the
    per-column converter chains through callFunction.

    Parameters
    ----------
    file : key of fileColumnMapping (e.g. 'name.basics.tsv').
    nrows : forwarded to pd.read_table as `nrows`.

    Returns
    -------
    The cleaned pandas DataFrame.

    Side effects: prints a start and an end timestamp, and leaves a variable
    named "df" on the engines.

    NOTE(review): an unknown `file` makes getMapping return None, so the
    mapping['dtype'] lookup fails with a TypeError - confirm whether a clearer
    error is wanted.
    NOTE(review): the frame is read in full locally, scattered, and gathered
    straight back; no engine-side work is visible between the two calls, so
    this round trip looks like pure transfer overhead - confirm intent.
    """
    print ('Start: ' + time.strftime("%Y-%m-%d %H:%M"))
    dview = cluster[:]
    mapping = getMapping(file)
    dtype = mapping['dtype']
    # Columns to read are derived from the dtype keys; mapping['usecols'] is not consulted.
    usecols = list(dtype.keys())
    
    dview.scatter(
        "df", 
        pd.read_table(mapping['filePath'], 
                       index_col=mapping['index_col'], 
                       dtype = dtype, 
                       #na_values = ['//N'],
                       true_values= mapping['true_values'],
                       false_values= mapping['false_values'],                       
                       usecols=usecols,
                       nrows =nrows 
                      )
    )
    # Pull the scattered pieces back from the engines and stitch them together.
    df = pd.concat([i for i in dview["df"]])
    
    # NOTE(review): method='pad' copies the previous row's value into a missing
    # cell, which mixes data between unrelated rows - confirm this is intended.
    df.fillna(method='pad', inplace=True)
    if(mapping['to_replace']!= None):
        # NOTE(review): how `method` interacts with a nested to_replace dict
        # presumably depends on the pandas version - verify on upgrade.
        df.replace(to_replace=mapping['to_replace'],method='pad', inplace=True)
    
    converters = mapping['converters']
    if(converters!= None):
        cols = list(converters.keys())
        # DataFrame.apply hands each configured column to callFunction, which
        # picks that column's converter chain out of the keyword arguments.
        df[cols] = df[cols].apply(callFunction, **converters)
        
    print ('End: ' + time.strftime("%Y-%m-%d %H:%M"))
    return df

In [10]:
# Load the people table with its configured converters; readFile prints
# start/end timestamps (about two minutes in the recorded run).
names = readFile('name.basics.tsv')
names.head()
#ddf = dd.from_pandas(names, npartitions=2)

Start: 2018-03-01 16:39
End: 2018-03-01 16:41


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,fred astaire,1899,1987,"[soundtrack, actor, miscellaneous]","[tt0043044, tt0050419, tt0053137, tt0072308]"
1,nm0000002,lauren bacall,1924,2014,"[actress, soundtrack]","[tt0117057, tt0040506, tt0038355, tt0037382]"
2,nm0000003,brigitte bardot,1934,0,"[actress, soundtrack, producer]","[tt0057345, tt0059956, tt0063715, tt0049189]"
3,nm0000004,john belushi,1949,1982,"[actor, writer, soundtrack]","[tt0072562, tt0080455, tt0078723, tt0077975]"
4,nm0000005,ingmar bergman,1918,2007,"[writer, director, actor]","[tt0060827, tt0050986, tt0083922, tt0050976]"


In [11]:
# Column dtypes and memory footprint of the loaded people table.
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8445648 entries, 0 to 8445647
Data columns (total 6 columns):
nconst               object
primaryName          object
birthYear            int64
deathYear            int64
primaryProfession    object
knownForTitles       object
dtypes: int64(2), object(4)
memory usage: 386.6+ MB


In [12]:
# Load the title -> directors/writers table; both list columns are split
# into arrays by the configured converters.
crew = readFile('title.crew.tsv')#.compute()
crew.info()

Start: 2018-03-01 16:42
End: 2018-03-01 16:43
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4828274 entries, 0 to 4828273
Data columns (total 3 columns):
tconst       object
directors    object
writers      object
dtypes: object(3)
memory usage: 110.5+ MB


In [13]:
def explode(df, lst_cols, fill_value=''):
    """Turn list-valued cells into one row per list element.

    Parameters
    ----------
    df : pandas DataFrame.
    lst_cols : column name, or list of column names, whose cells hold
        sequences. When several columns are given they are expected to hold
        sequences of the same length in each row; the repeat counts are taken
        from the first one.
    fill_value : value written into missing cells of rows whose sequence is
        empty. Those rows are kept once and appended after the expanded rows.

    Returns
    -------
    A new DataFrame with the same column order as `df`. The other columns are
    repeated once per element. When empty sequences are present the retained
    rows keep their original index label; otherwise the index is a fresh
    RangeIndex.

    Raises
    ------
    ValueError if `lst_cols` is empty or None.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    if not lst_cols:
        raise ValueError('explode() needs at least one list column in lst_cols')
    if len(df) == 0:
        # Nothing to expand; np.concatenate would fail on zero arrays.
        return df.copy()

    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # number of elements in each cell of the first list column
    lens = df[lst_cols[0]].str.len()

    expanded = pd.DataFrame(
        {col: np.repeat(df[col].values, lens) for col in idx_cols}
    ).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})

    if not (lens > 0).all():
        # Rows with an empty list vanish during the repeat; add them back once
        # and fill the list columns they are missing.
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        expanded = pd.concat([expanded, df.loc[lens == 0, idx_cols]]).fillna(fill_value)

    return expanded.loc[:, df.columns]

In [14]:
# One row per (title, writer): expand the writers lists, discard the
# directors column and expose the person id under the shared name 'nconst'.
title_writers = (
    explode(crew, ['writers'], fill_value='')
    .drop('directors', axis=1)
    .rename(columns={'writers': 'nconst'})
)
title_writers.columns

Index(['tconst', 'nconst'], dtype='object')

In [15]:
# One row per (title, director): expand the directors lists, discard the
# writers column and expose the person id under the shared name 'nconst'.
title_directors = (
    explode(crew, ['directors'], fill_value='')
    .drop('writers', axis=1)
    .rename(columns={'directors': 'nconst'})
)
title_directors.columns

Index(['tconst', 'nconst'], dtype='object')

In [16]:
# Attach the person details to every (title, writer) row via the shared nconst key.
# NOTE(review): this rebinds title_writers to its own merge result, so running
# the cell a second time merges the people columns in again.
title_writers = title_writers.merge(names, on=['nconst'])
title_writers.head()

Unnamed: 0,tconst,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0000009,nm0085156,alexander black,1859,1940,"[director, writer, cinematographer]",[tt0000009]
1,tt0000036,nm0410331,washington irving,1783,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"
2,tt0000076,nm0410331,washington irving,1783,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"
3,tt0000108,nm0410331,washington irving,1783,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"
4,tt0000109,nm0410331,washington irving,1783,1859,[writer],"[tt0162661, tt0214874, tt0041094, tt0051850]"


In [17]:
# Attach the person details to every (title, director) row via the shared nconst key.
# NOTE(review): this rebinds title_directors to its own merge result, so running
# the cell a second time merges the people columns in again.
title_directors = title_directors.merge(names, on=['nconst'])
title_directors.head()

Unnamed: 0,tconst,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0000001,nm0005690,william k.l. dickson,1860,1935,"[cinematographer, director, producer]","[tt0241763, tt0361921, tt1428455, tt0229235]"
1,tt0000005,nm0005690,william k.l. dickson,1860,1935,"[cinematographer, director, producer]","[tt0241763, tt0361921, tt1428455, tt0229235]"
2,tt0000006,nm0005690,william k.l. dickson,1860,1935,"[cinematographer, director, producer]","[tt0241763, tt0361921, tt1428455, tt0229235]"
3,tt0000007,nm0005690,william k.l. dickson,1860,1935,"[cinematographer, director, producer]","[tt0241763, tt0361921, tt1428455, tt0229235]"
4,tt0000008,nm0005690,william k.l. dickson,1860,1935,"[cinematographer, director, producer]","[tt0241763, tt0361921, tt1428455, tt0229235]"


In [72]:
#title_writers.replace(to_replace={'knownForTitles':{np.NaN:None}},inplace=True)
#title_grouped_directors = 
# Group only the first 10000 director rows by title; this is a sample, not the full table.
title_grouped_directors = title_directors.head(10000).groupby('tconst')#.apply(f)
title_grouped_directors

<pandas.core.groupby.DataFrameGroupBy object at 0x7f6171835940>

In [73]:
    #@dview.parallel(block=True)
    def f(group):
        """Collapse one title's director rows into a JSON string.

        The group is re-indexed by 'tconst', its remaining columns are
        serialised with to_json(orient='records'), and that single string is
        returned in a 'directors' column repeated for every index entry of
        the group.
        """
        indexed = group.set_index('tconst')
        payload = indexed.to_json(orient='records')
        return pandas.DataFrame(
            data={'directors': payload},
            index=indexed.index.values,
        )



#title_grouped_directors# = 
#title_grouped_directors.reset_index().drop(['level_1'], axis= 1)
#pd.DataFrame(title_grouped_directors).to_json(orient='records')
#title_grouped_directors.reset_index(level='tconst')

In [74]:
# Time the per-title JSON aggregation (timestamps printed before and after).
print (time.strftime("%Y-%m-%d %H:%M:%S"))
dview = cluster[:]
# NOTE(review): title_grouped_directors.apply(f) is evaluated in this process
# before scatter is called, so the engines only receive the finished result;
# the gather on the next line just reassembles it - confirm intent.
dview.scatter("scview", title_grouped_directors.apply(f))
#dview['scview']
# NOTE(review): the name is rebound from the GroupBy object to a DataFrame,
# so this cell cannot be re-run without re-running the groupby cell first.
title_grouped_directors = pd.concat([i for i in dview["scview"]])
print (time.strftime("%Y-%m-%d %H:%M:%S"))
#title_grouped_directors = title_grouped_directors.reset_index().drop(['level_1'], axis= 1)


2018-03-01 17:04:05:03s
2018-03-01 17:04:16:03s


In [63]:
#title_grouped_directors = title_grouped_directors.reset_index().drop(['level_1'], axis= 1)
#title_grouped_directors
#title_directors.info() # 3425684 
#del names
#del crew
# Display the aggregated frame.
# NOTE(review): the 'directors' column is blank in the saved output - check
# whether that is a rendering artefact or the JSON strings were lost.
title_grouped_directors

Unnamed: 0_level_0,Unnamed: 1_level_0,directors
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,tt0000001,
tt0000002,tt0000002,
tt0000003,tt0000003,
tt0000004,tt0000004,
tt0000005,tt0000005,
tt0000006,tt0000006,
tt0000007,tt0000007,
tt0000007,tt0000007,
tt0000008,tt0000008,
tt0000009,tt0000009,


In [32]:
# Load the per-title rating table (averageRating, numVotes).
ratings = readFile('title.ratings.tsv')
ratings.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.8,1350
1,tt0000002,6.5,157
2,tt0000003,6.6,934
3,tt0000004,6.4,93
4,tt0000005,6.2,1622


In [30]:
# Load the core title table with its configured converters.
titles = readFile('title.basics.tsv')
titles.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,carmencita,carmencita,False,1894.0,,1.0,"[Documentary, Short]"
1,tt0000002,short,le clown et ses chiens,le clown et ses chiens,False,1892.0,,5.0,"[Animation, Short]"
2,tt0000003,short,pauvre pierrot,pauvre pierrot,False,1892.0,,4.0,"[Animation, Comedy, Romance]"
3,tt0000004,short,un bon bock,un bon bock,False,1892.0,,,"[Animation, Short]"
4,tt0000005,short,blacksmith scene,blacksmith scene,False,1893.0,,1.0,[Short]


In [38]:
# Add the rating columns to each title; merge() is called without `how`, so
# titles with no rating row are dropped.
# NOTE(review): rebinding `titles` makes this cell non-idempotent - a second
# run merges the rating columns in again.
titles = titles.merge(ratings, on=['tconst'])
titles.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,carmencita,carmencita,False,1894.0,,1.0,"[Documentary, Short]",5.8,1350
1,tt0000002,short,le clown et ses chiens,le clown et ses chiens,False,1892.0,,5.0,"[Animation, Short]",6.5,157
2,tt0000003,short,pauvre pierrot,pauvre pierrot,False,1892.0,,4.0,"[Animation, Comedy, Romance]",6.6,934
3,tt0000004,short,un bon bock,un bon bock,False,1892.0,,,"[Animation, Short]",6.4,93
4,tt0000005,short,blacksmith scene,blacksmith scene,False,1893.0,,1.0,[Short],6.2,1622


In [36]:
# Join the titles with the per-title director data on tconst (result is only displayed).
# NOTE(review): the saved output shows person columns and a level_1 column
# rather than a 'directors' column, so it appears to come from an earlier
# state of title_grouped_directors - re-run from a fresh kernel to confirm.
pd.merge(titles, title_grouped_directors , on=['tconst']) 

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,level_1,birthYear,deathYear,knownForTitles,nconst,primaryName
0,tt8043272,short,the netball diaries,the netball diaries,False,2007.0,,,"[Comedy, Short]",3425634,0,0,[None],nm9648242,katrina mercer
1,tt8043344,tvepisode,san francisco ja muir woods,san francisco ja muir woods,False,2018.0,,60.0,[Documentary],3425635,0,0,[tt7290998],nm7951335,ilari suhonen
2,tt8043354,movie,el nuevo orden,el nuevo orden,False,2018.0,,45.0,[Horror],3425636,0,0,[None],nm9648302,sergi rodón
3,tt8043492,tvseries,piano no mori,piano no mori,False,2018.0,,,"[Adventure, Animation, Drama]",3425637,0,0,"[tt0892769, tt0351283, tt0298148, tt0181689]",nm1185804,gaku nakatani
4,tt8043554,short,et liv,et liv,False,2018.0,,,"[Drama, Short]",3425638,0,0,[tt1310630],nm3279624,ane skak
5,tt8043572,movie,flakka 666,flakka 666,False,2018.0,,71.0,"[Horror, Thriller]",3425639,0,0,[None],nm9648488,luis rodriguez
6,tt8043724,short,my favourite paul,my favourite paul,False,2018.0,,,"[Drama, Short]",3425640,0,0,"[tt5914902, tt3680440]",nm6536252,jacinta owens
7,tt8043758,tvseries,overland,overland,False,1995.0,,,"[Adventure, Documentary]",3425641,0,0,[None],nm9648522,beppe tenti
8,tt8043758,tvseries,overland,overland,False,1995.0,,,"[Adventure, Documentary]",3425642,0,0,[None],nm9648521,filippo tenti
9,tt8043810,short,adobe,adobe,False,2018.0,,7.0,"[Action, Drama, Sci-Fi]",3425643,0,0,"[tt4865754, tt6657132, tt2912216, tt3595744]",nm7074017,ashley davidson


In [36]:
#names.info()
#del names
'''Important '''
#names.loc[['nm0000001'],['primaryName','birthYear','deathYear','primaryProfession']].to_dict('index')


# Drop the reference to the titles frame.
# NOTE(review): any cell that uses `titles` after this one fails until the
# load cell is re-run.
del titles

In [50]:
# Scratch example: expand a comma-separated column into one row per item.
s1 = pd.Series({'aa': 0, 'ba': 1, 'ca': 2})
s2 = pd.Series({'ab': 3, 'bb': 4, 'cb': 5})

df = pd.DataFrame(data={
    'a': [1, 2],
    'x': [s1, s2],
    'y': [['aa', 'ba', 'ca'], ['ab', 'bb', 'cb']],
    'z': ['a,b,c', 'd,e,f']
})

# str.split(expand=True) builds the wide frame in one vectorised call. The old
# row-wise apply(pd.Series, 1) was slow and passed a stray positional argument
# into a deprecated apply() parameter.
split_data = df['z'].str.split(',', expand=True).stack()
# Drop the inner (position) level so the pieces line up with df's row index.
split_data.index = split_data.index.droplevel(-1)
split_data
# The join repeats each original row once per piece; the pieces land in column 0.
df = df.join(pd.DataFrame(split_data))
df = df.reset_index(drop=True)
df

Unnamed: 0,a,x,y,z,0
0,1,aa 0 ba 1 ca 2 dtype: int64,"[aa, ba, ca]","a,b,c",a
1,1,aa 0 ba 1 ca 2 dtype: int64,"[aa, ba, ca]","a,b,c",b
2,1,aa 0 ba 1 ca 2 dtype: int64,"[aa, ba, ca]","a,b,c",c
3,2,ab 3 bb 4 cb 5 dtype: int64,"[ab, bb, cb]","d,e,f",d
4,2,ab 3 bb 4 cb 5 dtype: int64,"[ab, bb, cb]","d,e,f",e
5,2,ab 3 bb 4 cb 5 dtype: int64,"[ab, bb, cb]","d,e,f",f


In [78]:
# Scratch check: a single JSON string given as a scalar column value together with an explicit one-label index.
pd.DataFrame({'directors': '[{"tconst":"tt8046488","nconst":"nm9649918","primaryName":"ricky capo","birthYear":0,"deathYear":0,"knownForTitles":[null]}]'}, index=['tt8046488'])


Unnamed: 0,directors
tt8046488,"[{""tconst"":""tt8046488"",""nconst"":""nm9649918"",""p..."


In [70]:
# Scratch check: truncate floats to ints element-wise.
df = pd.DataFrame([1, 3.09])
# np.int was only an alias of the builtin int and was removed in NumPy 1.24.
df[0] = df[0].apply(int)
df

Unnamed: 0,0
0,1
1,3


In [2]:
# NOTE(review): no visible cell creates a 'Ticket' column on df, and the
# output saved under this cell describes the crew tables instead - this looks
# like a leftover from another notebook; confirm and remove.
ticket_series = df['Ticket'].str.split(' ').apply(pd.Series, 1).stack()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5509644 entries, 0 to 5509643
Data columns (total 2 columns):
tconst       object
directors    object
dtypes: object(2)
memory usage: 84.1+ MB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7792376 entries, 0 to 7792375
Data columns (total 2 columns):
tconst     object
writers    object
dtypes: object(2)
memory usage: 118.9+ MB
