In [31]:
import pandas as pd
import numpy as np
import glob,os

# Funcion para reducir el uso de memoria en columnas numericas

In [2]:
# We're going to be calculating memory usage a lot,
# so we'll create a function to save us some time!

def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)


# Postulantes Educacion

In [46]:
fiuba_1_postulantes_educacion = pd.read_csv('Data/Postulantes-Educacion/fiuba_1_postulantes_educacion.csv')

In [47]:
fiuba_1_postulantes_educacion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 298231 entries, 0 to 298230
Data columns (total 3 columns):
idpostulante    298231 non-null object
nombre          298231 non-null object
estado          298231 non-null object
dtypes: object(3)
memory usage: 6.8+ MB


In [48]:
for dtype in ['int','object']:
    selected_dtype = fiuba_1_postulantes_educacion.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for int columns: 0.00 MB
Average memory usage for object columns: 14.32 MB


In [49]:
fiuba_1_postulantes_educacion_obj = fiuba_1_postulantes_educacion.select_dtypes(include=['object']).copy()
fiuba_1_postulantes_educacion_obj.describe()

Unnamed: 0,idpostulante,nombre,estado
count,298231,298231,298231
unique,188752,7,3
top,YlMLGD,Secundario,Graduado
freq,9,110256,194474


In [50]:
estado = fiuba_1_postulantes_educacion_obj.estado
print(estado.head())

estado_cat = estado.astype('category')
print(estado_cat.head())

0    En Curso
1    En Curso
2    En Curso
3    En Curso
4    En Curso
Name: estado, dtype: object
0    En Curso
1    En Curso
2    En Curso
3    En Curso
4    En Curso
Name: estado, dtype: category
Categories (3, object): [Abandonado, En Curso, Graduado]


In [51]:
estado_cat.head().cat.codes

0    1
1    1
2    1
3    1
4    1
dtype: int8

In [52]:
print(mem_usage(estado))
print(mem_usage(estado_cat))

18.54 MB
0.28 MB


In [53]:
converted_obj = pd.DataFrame()

for col in fiuba_1_postulantes_educacion_obj.columns:
    num_unique_values = len(fiuba_1_postulantes_educacion_obj[col].unique())
    num_total_values = len(fiuba_1_postulantes_educacion_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_1_postulantes_educacion_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_1_postulantes_educacion_obj[col]

In [54]:
print(mem_usage(fiuba_1_postulantes_educacion_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_1_postulantes_educacion_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

58.14 MB
18.71 MB


Unnamed: 0,before,after
object,3.0,1
category,,1
category,,1


In [55]:
dtypes = converted_obj.dtypes

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

# rather than print all 161 items, we'll
# sample 10 key/value pairs from the dict
# and print it nicely using prettyprint

preview = first2pairs = {key:value for key,value in list(column_types.items())[:10]}
import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{'estado': 'category', 'idpostulante': 'object', 'nombre': 'category'}


In [56]:
dtypes.values

array([dtype('O'),
       CategoricalDtype(categories=['Doctorado', 'Master', 'Otro', 'Posgrado', 'Secundario',
                  'Terciario/Técnico', 'Universitario'],
                 ordered=False),
       CategoricalDtype(categories=['Abandonado', 'En Curso', 'Graduado'], ordered=False)],
      dtype=object)

In [70]:
converted_obj.to_csv('Data/Postulantes-Educacion/fiuba_1_postulantes_educacion-reduced.csv')

In [71]:
fiuba_1_postulantes_educacionDesde15Abril = pd.read_csv('Data/Postulantes-Educacion/fiuba_1_postulantes_educacionDesde15Abril.csv')

In [72]:
fiuba_1_postulantes_educacionDesde15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397509 entries, 0 to 397508
Data columns (total 3 columns):
idpostulante    397509 non-null object
nombre          397509 non-null object
estado          397509 non-null object
dtypes: object(3)
memory usage: 9.1+ MB


In [73]:
fiuba_1_postulantes_educacionDesde15Abril_obj = fiuba_1_postulantes_educacionDesde15Abril.select_dtypes(include=['object']).copy()
fiuba_1_postulantes_educacionDesde15Abril_obj.describe()

Unnamed: 0,idpostulante,nombre,estado
count,397509,397509,397509
unique,253688,7,3
top,YlMLGD,Universitario,Graduado
freq,9,145069,261312


In [74]:
converted_obj = pd.DataFrame()

for col in fiuba_1_postulantes_educacionDesde15Abril_obj.columns:
    num_unique_values = len(fiuba_1_postulantes_educacionDesde15Abril_obj[col].unique())
    num_total_values = len(fiuba_1_postulantes_educacionDesde15Abril_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_1_postulantes_educacionDesde15Abril_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_1_postulantes_educacionDesde15Abril_obj[col]

In [75]:
print(mem_usage(fiuba_1_postulantes_educacionDesde15Abril_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_1_postulantes_educacionDesde15Abril_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

77.58 MB
24.93 MB


Unnamed: 0,before,after
object,3.0,1
category,,1
category,,1


In [76]:
converted_obj.to_csv('Data/Postulantes-Educacion/fiuba_1_postulantes_educacionDesde15Abril-reduced.csv')

In [77]:
fiuba_1_postulantes_educacionHasta15Abril = pd.read_csv('Data/Postulantes-Educacion/fiuba_1_postulantes_educacionHasta15Abril.csv')

In [78]:
fiuba_1_postulantes_educacionHasta15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407453 entries, 0 to 407452
Data columns (total 3 columns):
idpostulante    407453 non-null object
nombre          407453 non-null object
estado          407453 non-null object
dtypes: object(3)
memory usage: 9.3+ MB


In [79]:
fiuba_1_postulantes_educacionHasta15Abril_obj = fiuba_1_postulantes_educacionHasta15Abril.select_dtypes(include=['object']).copy()
fiuba_1_postulantes_educacionHasta15Abril_obj.describe()

Unnamed: 0,idpostulante,nombre,estado
count,407453,407453,407453
unique,263788,7,3
top,YlMLGD,Universitario,Graduado
freq,9,155255,266008


In [80]:
converted_obj = pd.DataFrame()

for col in fiuba_1_postulantes_educacionHasta15Abril_obj.columns:
    num_unique_values = len(fiuba_1_postulantes_educacionHasta15Abril_obj[col].unique())
    num_total_values = len(fiuba_1_postulantes_educacionHasta15Abril_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_1_postulantes_educacionHasta15Abril_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_1_postulantes_educacionHasta15Abril_obj[col]

In [81]:
print(mem_usage(fiuba_1_postulantes_educacionHasta15Abril_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_1_postulantes_educacionHasta15Abril_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

79.56 MB
25.55 MB


Unnamed: 0,before,after
object,3.0,1
category,,1
category,,1


In [82]:
converted_obj.to_csv('Data/Postulantes-Educacion/fiuba_1_postulantes_educacionHasta15Abril-reduced.csv')

# Verifico que no haya IDs nulos ya que al hacer drop_duplicates dos NaN \ los toma como iguales

In [33]:
fiuba_1_postulantes_educacion['idpostulante'].isnull().any()

False

In [34]:
fiuba_1_postulantes_educacionDesde15Abril['idpostulante'].isnull().any()

False

In [35]:
fiuba_1_postulantes_educacionHasta15Abril['idpostulante'].isnull().any()

False

In [38]:
def nombre_to_value(x):
    if(x == 'Otro'):
        return 0
    if(x == 'Secundario'):
        return 1
    if(x == 'Terciario/Técnico'):
        return 2
    if(x == 'Universitario'):
        return 3
    if(x == 'Posgrado'):
        return 4
    if(x == 'Master'):
        return 5
    if(x == 'Doctorado'):
        return 6

# Merge de CSVs de postulantes educacion

In [87]:
#Para idpostulantes repetidos con distintos estudios alcanzados dejamos el maximo
path =r'/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Educacion'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    print (file_)
    df = pd.read_csv(file_,low_memory=False)
    list_.append(df)
frame = pd.concat(list_)
frame['nombre_numerico'] = frame['nombre'].apply(nombre_to_value)
frame = frame.sort_values(['idpostulante', 'nombre_numerico'],ascending=False)
frame = frame.drop_duplicates(subset='idpostulante', keep='first')
frame.to_csv('/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Educacion/postulantes_educacion-merge.csv',sep=",",index=False)

/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Educacion/fiuba_1_postulantes_educacionDesde15Abril-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Educacion/fiuba_1_postulantes_educacion-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Educacion/fiuba_1_postulantes_educacionHasta15Abril-reduced.csv


# Postulantes genero y edad

In [57]:
fiuba_2_postulantes_genero_y_edad = pd.read_csv('Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edad.csv')

In [58]:
fiuba_2_postulantes_genero_y_edad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200888 entries, 0 to 200887
Data columns (total 3 columns):
idpostulante       200888 non-null object
fechanacimiento    196138 non-null object
sexo               200888 non-null object
dtypes: object(3)
memory usage: 4.6+ MB


In [59]:
fiuba_2_postulantes_genero_y_edad_obj = fiuba_2_postulantes_genero_y_edad.select_dtypes(include=['object']).copy()
fiuba_2_postulantes_genero_y_edad_obj.describe()

Unnamed: 0,idpostulante,fechanacimiento,sexo
count,200888,196138,200888
unique,200888,15027,3
top,lDbJJx2,1994-05-24,FEM
freq,1,57,101981


In [60]:
converted_obj = pd.DataFrame()

for col in fiuba_2_postulantes_genero_y_edad_obj.columns:
    num_unique_values = len(fiuba_2_postulantes_genero_y_edad_obj[col].unique())
    num_total_values = len(fiuba_2_postulantes_genero_y_edad_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_2_postulantes_genero_y_edad_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_2_postulantes_genero_y_edad_obj[col]

In [61]:
print(mem_usage(fiuba_2_postulantes_genero_y_edad_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_2_postulantes_genero_y_edad_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

36.52 MB
14.39 MB


Unnamed: 0,before,after
object,3.0,1
category,,1
category,,1


In [63]:
dtypes = converted_obj.drop('fechanacimiento',axis=1).dtypes

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

# rather than print all 161 items, we'll
# sample 10 key/value pairs from the dict
# and print it nicely using prettyprint

preview = first2pairs = {key:value for key,value in list(column_types.items())[:10]}
import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{'idpostulante': 'object', 'sexo': 'category'}


In [64]:
dtypes.values

array([dtype('O'),
       CategoricalDtype(categories=['FEM', 'MASC', 'NO_DECLARA'], ordered=False)],
      dtype=object)

In [95]:
converted_obj.to_csv('Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edad-reduced.csv')

In [97]:
fiuba_2_postulantes_genero_y_edadDesde15Abril = pd.read_csv('Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edadDesde15Abril.csv')

In [98]:
fiuba_2_postulantes_genero_y_edadDesde15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281385 entries, 0 to 281384
Data columns (total 3 columns):
idpostulante       281385 non-null object
fechanacimiento    268482 non-null object
sexo               281385 non-null object
dtypes: object(3)
memory usage: 6.4+ MB


In [99]:
fiuba_2_postulantes_genero_y_edadDesde15Abril_obj = fiuba_2_postulantes_genero_y_edadDesde15Abril.select_dtypes(include=['object']).copy()
fiuba_2_postulantes_genero_y_edadDesde15Abril_obj.describe()

Unnamed: 0,idpostulante,fechanacimiento,sexo
count,281385,268482,281385
unique,281385,16331,4
top,eQBKd9,1992-01-31,FEM
freq,1,64,140041


In [100]:
converted_obj = pd.DataFrame()

for col in fiuba_2_postulantes_genero_y_edadDesde15Abril_obj.columns:
    num_unique_values = len(fiuba_2_postulantes_genero_y_edadDesde15Abril_obj[col].unique())
    num_total_values = len(fiuba_2_postulantes_genero_y_edadDesde15Abril_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_2_postulantes_genero_y_edadDesde15Abril_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_2_postulantes_genero_y_edadDesde15Abril_obj[col]

In [101]:
print(mem_usage(fiuba_2_postulantes_genero_y_edadDesde15Abril_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_2_postulantes_genero_y_edadDesde15Abril_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

50.97 MB
19.59 MB


Unnamed: 0,before,after
object,3.0,1
category,,1
category,,1


In [102]:
converted_obj.to_csv('Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edadDesde15Abril-reduced.csv')

In [103]:
fiuba_2_postulantes_genero_y_edadHasta15Abril = pd.read_csv('Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edadHasta15Abril.csv')

In [104]:
fiuba_2_postulantes_genero_y_edadHasta15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297747 entries, 0 to 297746
Data columns (total 3 columns):
idpostulante       297747 non-null object
fechanacimiento    281127 non-null object
sexo               297747 non-null object
dtypes: object(3)
memory usage: 6.8+ MB


In [105]:
fiuba_2_postulantes_genero_y_edadHasta15Abril_obj = fiuba_2_postulantes_genero_y_edadHasta15Abril.select_dtypes(include=['object']).copy()
fiuba_2_postulantes_genero_y_edadHasta15Abril_obj.describe()

Unnamed: 0,idpostulante,fechanacimiento,sexo
count,297747,281127,297747
unique,297747,16616,4
top,vVwbPGe,1990-10-18,FEM
freq,1,65,149457


In [106]:
converted_obj = pd.DataFrame()

for col in fiuba_2_postulantes_genero_y_edad_obj.columns:
    num_unique_values = len(fiuba_2_postulantes_genero_y_edadHasta15Abril_obj[col].unique())
    num_total_values = len(fiuba_2_postulantes_genero_y_edadHasta15Abril_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_2_postulantes_genero_y_edadHasta15Abril_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_2_postulantes_genero_y_edadHasta15Abril_obj[col]

In [107]:
print(mem_usage(fiuba_2_postulantes_genero_y_edadHasta15Abril_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_2_postulantes_genero_y_edad_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

53.85 MB
20.64 MB


Unnamed: 0,before,after
object,3.0,1
category,,1
category,,1


In [108]:
converted_obj.to_csv('Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edadHasta15Abril-reduced.csv')

# Verifico que no haya IDs nulos ya que al hacer drop_duplicates dos NaN \ los toma como iguales

In [109]:
fiuba_2_postulantes_genero_y_edad['idpostulante'].isnull().any()

False

In [110]:
fiuba_2_postulantes_genero_y_edadDesde15Abril['idpostulante'].isnull().any()

False

In [111]:
fiuba_2_postulantes_genero_y_edadHasta15Abril['idpostulante'].isnull().any()

False

# Merge de CSVs de postulantes genero y edad

In [112]:
path =r'/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Genero-Edad'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    print (file_)
    df = pd.read_csv(file_,low_memory=False)
    list_.append(df)
frame = pd.concat(list_)
frame.drop_duplicates(["idpostulante"],inplace=True)
frame.to_csv('/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Genero-Edad/postulantes_genero_y_edad-merge.csv',sep=",",index=False)

/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edadHasta15Abril-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edadDesde15Abril-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulantes-Genero-Edad/fiuba_2_postulantes_genero_y_edad-reduced.csv


# Vistas

In [128]:
fiuba_3_vistas = pd.read_csv('Data/Vistas/fiuba_3_vistas.csv')

In [129]:
fiuba_3_vistas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 961897 entries, 0 to 961896
Data columns (total 3 columns):
idAviso         961897 non-null int64
timestamp       961897 non-null object
idpostulante    961897 non-null object
dtypes: int64(1), object(2)
memory usage: 22.0+ MB


In [130]:
fiuba_3_vistas_int = fiuba_3_vistas.select_dtypes(include=['int'])
converted_int = fiuba_3_vistas_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_3_vistas_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_3_vistas_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

7.34 MB
3.67 MB


Unnamed: 0,before,after
uint32,,1.0
int64,1.0,


In [131]:
optimized_fiuba_3_vistas = fiuba_3_vistas.copy()

optimized_fiuba_3_vistas[converted_int.columns] = converted_int

print(mem_usage(fiuba_3_vistas))
print(mem_usage(optimized_fiuba_3_vistas))

143.84 MB
140.17 MB


In [132]:
timestamp = optimized_fiuba_3_vistas.timestamp
print(mem_usage(timestamp))
timestamp.head()

77.97 MB


0    2018-02-23T13:38:13.187-0500
1    2018-02-23T13:38:14.296-0500
2    2018-02-23T13:38:14.329-0500
3    2018-02-23T13:38:17.921-0500
4    2018-02-23T13:38:18.973-0500
Name: timestamp, dtype: object

In [133]:
optimized_fiuba_3_vistas['timestamp'] = pd.to_datetime(timestamp,format='%Y-%m-%dT%H:%M:%S.%f')


print(mem_usage(optimized_fiuba_3_vistas))
optimized_fiuba_3_vistas.timestamp.head()

69.53 MB


0   2018-02-23 18:38:13.187
1   2018-02-23 18:38:14.296
2   2018-02-23 18:38:14.329
3   2018-02-23 18:38:17.921
4   2018-02-23 18:38:18.973
Name: timestamp, dtype: datetime64[ns]

In [134]:
print(mem_usage(fiuba_3_vistas))
print(mem_usage(optimized_fiuba_3_vistas))

143.84 MB
69.53 MB


In [137]:
dtypes = optimized_fiuba_3_vistas.drop('timestamp',axis=1).dtypes

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

# rather than print all 161 items, we'll
# sample 10 key/value pairs from the dict
# and print it nicely using prettyprint

preview = first2pairs = {key:value for key,value in list(column_types.items())[:10]}
import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{'idAviso': 'uint32', 'idpostulante': 'object'}


In [31]:
optimized_fiuba_3_vistas.to_csv('Data/Vistas/fiuba_3_vistas-reduced.csv')

In [18]:
fiuba_3_vistasDesde15Abril = pd.read_csv('Data/Vistas/fiuba_3_vistasDesde15Abril.csv')

In [4]:
fiuba_3_vistasDesde15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11648230 entries, 0 to 11648229
Data columns (total 3 columns):
idAviso         int64
timestamp       object
idpostulante    object
dtypes: int64(1), object(2)
memory usage: 266.6+ MB


In [19]:
fiuba_3_vistasDesde15Abril_int = fiuba_3_vistasDesde15Abril.select_dtypes(include=['int'])
converted_int = fiuba_3_vistasDesde15Abril_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_3_vistasDesde15Abril_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_3_vistasDesde15Abril_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

88.87 MB
44.43 MB


Unnamed: 0,before,after
uint32,,1.0
int64,1.0,


In [20]:
optimized_fiuba_3_vistasDesde15Abril = fiuba_3_vistasDesde15Abril.copy()

optimized_fiuba_3_vistasDesde15Abril[converted_int.columns] = converted_int

print(mem_usage(fiuba_3_vistasDesde15Abril))
print(mem_usage(optimized_fiuba_3_vistasDesde15Abril))

1735.56 MB
1691.13 MB


In [21]:
timestamp = optimized_fiuba_3_vistasDesde15Abril.timestamp
print(mem_usage(timestamp))
timestamp.head()

944.23 MB


0    2018-04-30T19:55:37.990-0400
1    2018-04-30T19:55:38.291-0400
2    2018-04-30T19:55:37.686-0400
3    2018-04-30T19:55:40.199-0400
4    2018-04-30T19:55:40.507-0400
Name: timestamp, dtype: object

In [22]:
optimized_fiuba_3_vistasDesde15Abril['timestamp'] = pd.to_datetime(timestamp,format='%Y-%m-%dT%H:%M:%S.%f')

print(mem_usage(optimized_fiuba_3_vistasDesde15Abril))
optimized_fiuba_3_vistasDesde15Abril.timestamp.head()

835.76 MB


0   2018-04-30 23:55:37.990
1   2018-04-30 23:55:38.291
2   2018-04-30 23:55:37.686
3   2018-04-30 23:55:40.199
4   2018-04-30 23:55:40.507
Name: timestamp, dtype: datetime64[ns]

In [23]:
print(mem_usage(fiuba_3_vistasDesde15Abril))
print(mem_usage(optimized_fiuba_3_vistasDesde15Abril))

1735.56 MB
835.76 MB


In [24]:
optimized_fiuba_3_vistasDesde15Abril.to_csv('Data/Vistas/fiuba_3_vistasDesde15Abril-reduced.csv')

In [3]:
fiuba_3_vistasHasta15Abril = pd.read_csv('Data/Vistas/fiuba_3_vistasHasta15Abril.csv')

In [4]:
fiuba_3_vistasHasta15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5758686 entries, 0 to 5758685
Data columns (total 3 columns):
idAviso         int64
timestamp       object
idpostulante    object
dtypes: int64(1), object(2)
memory usage: 131.8+ MB


In [5]:
fiuba_3_vistasHasta15Abril_int = fiuba_3_vistasHasta15Abril.select_dtypes(include=['int'])
converted_int = fiuba_3_vistasHasta15Abril_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_3_vistasHasta15Abril_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_3_vistasHasta15Abril_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

43.94 MB
21.97 MB


Unnamed: 0,before,after
uint32,,1.0
int64,1.0,


In [6]:
optimized_fiuba_3_vistasHasta15Abril = fiuba_3_vistasHasta15Abril.copy()

optimized_fiuba_3_vistasHasta15Abril[converted_int.columns] = converted_int

print(mem_usage(fiuba_3_vistasHasta15Abril))
print(mem_usage(optimized_fiuba_3_vistasHasta15Abril))

861.11 MB
839.14 MB


In [7]:
timestamp = optimized_fiuba_3_vistasHasta15Abril.timestamp
print(mem_usage(timestamp))
timestamp.head()

466.81 MB


0    2018-04-06T01:43:47.445-0400
1    2018-04-06T01:43:48.330-0400
2    2018-04-06T01:43:50.042-0400
3    2018-04-06T01:43:51.171-0400
4    2018-04-06T01:43:46.371-0400
Name: timestamp, dtype: object

In [8]:
optimized_fiuba_3_vistasHasta15Abril['timestamp'] = pd.to_datetime(timestamp,format='%Y-%m-%dT%H:%M:%S.%f')

print(mem_usage(optimized_fiuba_3_vistasHasta15Abril))
optimized_fiuba_3_vistasHasta15Abril.timestamp.head()

416.27 MB


0   2018-04-06 05:43:47.445
1   2018-04-06 05:43:48.330
2   2018-04-06 05:43:50.042
3   2018-04-06 05:43:51.171
4   2018-04-06 05:43:46.371
Name: timestamp, dtype: datetime64[ns]

In [9]:
print(mem_usage(fiuba_3_vistasHasta15Abril))
print(mem_usage(optimized_fiuba_3_vistasHasta15Abril))

861.11 MB
416.27 MB


In [10]:
optimized_fiuba_3_vistasHasta15Abril.to_csv('Data/Vistas/fiuba_3_vistasHasta15Abril-reduced.csv')

# Verifico que no haya IDs nulos

In [31]:
fiuba_3_vistas['idpostulante'].isnull().any()

False

In [30]:
fiuba_3_vistasDesde15Abril['idpostulante'].isnull().any()

False

In [33]:
fiuba_3_vistasHasta15Abril['idpostulante'].isnull().any()

# Merge de CSVs de vistas

In [3]:
path =r'/home/lucio/Documentos/Datos/NaventDatosTP/Data/Vistas'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    print (file_)
    df = pd.read_csv(file_,low_memory=False)
    list_.append(df)
frame = pd.concat(list_)
#frame.drop_duplicates(["idpostulante"],inplace=True) PUEDE REPETIRSE EL IDPOSTULANTE
frame.drop_duplicates(["idpostulante","idAviso"],inplace=True) #Elimino vistas repetidas
#Como el dataset a predecir no contiene el dato de horario de postulacion o vista decido
#que un postulante puede ver o no ver un aviso... sin darle importancia al horario
frame.to_csv('/home/lucio/Documentos/Datos/NaventDatosTP/Data/Vistas/Vistas-merge.csv',sep=",",index=False)

/home/lucio/Documentos/Datos/NaventDatosTP/Data/Vistas/fiuba_3_vistas-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Vistas/fiuba_3_vistasHasta15Abril-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Vistas/fiuba_3_vistasDesde15Abril-reduced.csv


# Postulaciones

In [4]:
fiuba_4_postulaciones = pd.read_csv('Data/Postulaciones/fiuba_4_postulaciones.csv')

In [5]:
fiuba_4_postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3401623 entries, 0 to 3401622
Data columns (total 3 columns):
idaviso             int64
idpostulante        object
fechapostulacion    object
dtypes: int64(1), object(2)
memory usage: 77.9+ MB


In [6]:
fiuba_4_postulaciones_int = fiuba_4_postulaciones.select_dtypes(include=['int'])
converted_int = fiuba_4_postulaciones_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_4_postulaciones_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_4_postulaciones_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

25.95 MB
12.98 MB


Unnamed: 0,before,after
uint32,,1.0
int64,1.0,


In [7]:
optimized_fiuba_4_postulaciones = fiuba_4_postulaciones.copy()

optimized_fiuba_4_postulaciones[converted_int.columns] = converted_int

print(mem_usage(fiuba_4_postulaciones))
print(mem_usage(optimized_fiuba_4_postulaciones))

479.50 MB
466.53 MB


In [8]:
fechapostulacion = optimized_fiuba_4_postulaciones.fechapostulacion
print(mem_usage(fechapostulacion))
fechapostulacion.head()

246.55 MB


0    2018-01-15 16:22:34
1    2018-02-06 09:04:50
2    2018-02-22 09:04:47
3    2018-02-22 09:04:59
4    2018-01-25 18:55:03
Name: fechapostulacion, dtype: object

In [9]:
optimized_fiuba_4_postulaciones['fechapostulacion'] = pd.to_datetime(fechapostulacion,format='%Y-%m-%d %H:%M:%S')

print(mem_usage(optimized_fiuba_4_postulaciones))
optimized_fiuba_4_postulaciones.fechapostulacion.head()

245.93 MB


0   2018-01-15 16:22:34
1   2018-02-06 09:04:50
2   2018-02-22 09:04:47
3   2018-02-22 09:04:59
4   2018-01-25 18:55:03
Name: fechapostulacion, dtype: datetime64[ns]

In [10]:
print(mem_usage(fiuba_4_postulaciones))
print(mem_usage(optimized_fiuba_4_postulaciones))

479.50 MB
245.93 MB


In [11]:
dtypes = optimized_fiuba_4_postulaciones.drop('fechapostulacion',axis=1).dtypes

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

# rather than print all 161 items, we'll
# sample 10 key/value pairs from the dict
# and print it nicely using prettyprint

preview = first2pairs = {key:value for key,value in list(column_types.items())[:10]}
import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{'idaviso': 'uint32', 'idpostulante': 'object'}


In [21]:
optimized_fiuba_4_postulaciones.to_csv('Data/Postulaciones/fiuba_4_postulaciones-reduced.csv')

In [36]:
fiuba_4_postulacionesHasta15Abril = pd.read_csv('Data/Postulaciones/fiuba_4_postulacionesHasta15Abril.csv')

In [37]:
fiuba_4_postulacionesHasta15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4909641 entries, 0 to 4909640
Data columns (total 3 columns):
idaviso             int64
idpostulante        object
fechapostulacion    object
dtypes: int64(1), object(2)
memory usage: 112.4+ MB


In [38]:
fiuba_4_postulacionesHasta15Abril_int = fiuba_4_postulacionesHasta15Abril.select_dtypes(include=['int'])
converted_int = fiuba_4_postulacionesHasta15Abril_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_4_postulacionesHasta15Abril_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_4_postulacionesHasta15Abril_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

37.46 MB
18.73 MB


Unnamed: 0,before,after
uint32,,1.0
int64,1.0,


In [39]:
optimized_fiuba_4_postulacionesHasta15Abril = fiuba_4_postulacionesHasta15Abril.copy()

optimized_fiuba_4_postulacionesHasta15Abril[converted_int.columns] = converted_int

print(mem_usage(fiuba_4_postulacionesHasta15Abril))
print(mem_usage(optimized_fiuba_4_postulacionesHasta15Abril))

691.98 MB
673.25 MB


In [40]:
fechapostulacion = optimized_fiuba_4_postulacionesHasta15Abril.fechapostulacion
print(mem_usage(fechapostulacion))
fechapostulacion.head()

355.85 MB


0    2018-01-19 07:39:16
1    2018-01-24 15:07:39
2    2018-01-24 15:20:10
3    2018-01-26 08:37:04
4    2018-01-30 13:35:48
Name: fechapostulacion, dtype: object

In [41]:
optimized_fiuba_4_postulacionesHasta15Abril['fechapostulacion'] = pd.to_datetime(fechapostulacion,format='%Y-%m-%d %H:%M:%S')

print(mem_usage(optimized_fiuba_4_postulacionesHasta15Abril))
optimized_fiuba_4_postulacionesHasta15Abril.fechapostulacion.head()

354.86 MB


0   2018-01-19 07:39:16
1   2018-01-24 15:07:39
2   2018-01-24 15:20:10
3   2018-01-26 08:37:04
4   2018-01-30 13:35:48
Name: fechapostulacion, dtype: datetime64[ns]

In [42]:
print(mem_usage(fiuba_4_postulacionesHasta15Abril))
print(mem_usage(optimized_fiuba_4_postulacionesHasta15Abril))

691.98 MB
354.86 MB


In [44]:
dtypes = optimized_fiuba_4_postulacionesHasta15Abril.drop('fechapostulacion',axis=1).dtypes

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

# rather than print all 161 items, we'll
# sample 10 key/value pairs from the dict
# and print it nicely using prettyprint

preview = first2pairs = {key:value for key,value in list(column_types.items())[:10]}
import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{'idaviso': 'uint32', 'idpostulante': 'object'}


In [10]:
optimized_fiuba_4_postulacionesHasta15Abril.to_csv('Data/Postulaciones/fiuba_4_postulacionesHasta15Abril-reduced.csv')

# Verifico que no haya IDs nulos

In [8]:
fiuba_4_postulaciones['idpostulante'].isnull().any()

False

In [10]:
fiuba_4_postulacionesHasta15Abril['idpostulante'].isnull().any()

False

# Merge de CSVs de postulaciones

In [19]:
path =r'/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulaciones'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    print (file_)
    df = pd.read_csv(file_,low_memory=False)
    list_.append(df)
frame = pd.concat(list_)
#frame.drop_duplicates(["idpostulante"],inplace=True) UN POSTULANTE PUEDE APARECER VARIAS VECES
frame.drop_duplicates(["idpostulante","idaviso"],inplace=True) #Elimino postulaciones repetidas
frame.to_csv('/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulaciones/Postulaciones-merge.csv',sep=",",index=False)

/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulaciones/fiuba_4_postulaciones-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Postulaciones/fiuba_4_postulacionesHasta15Abril-reduced.csv


# Avisos online

In [12]:
fiuba_5_avisos_online = pd.read_csv('Data/fiuba_5_avisos_online.csv')

In [13]:
fiuba_5_avisos_onlineHasta15Abril = pd.read_csv('Data/fiuba_5_avisos_onlineHasta15Abril.csv')

# Verifico que no haya IDs nulos

In [16]:
fiuba_5_avisos_online['idaviso'].isnull().any()

False

In [17]:
fiuba_5_avisos_onlineHasta15Abril['idaviso'].isnull().any()

False

# Merge de CSVs de avisos online

In [5]:
path =r'/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Online'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    print (file_)
    df = pd.read_csv(file_,low_memory=False)
    list_.append(df)
frame = pd.concat(list_)
#frame.drop_duplicates(["idaviso"],inplace=True) PUEDE HABER AVISOS REPETIDOS
frame.to_csv('/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Online/Avisos-Online-merge.csv',sep=",",index=False)

/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Online/fiuba_5_avisos_online.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Online/fiuba_5_avisos_onlineHasta15Abril.csv


# Avisos detalle

In [80]:
fiuba_6_avisos_detalle = pd.read_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalle.csv')

In [81]:
fiuba_6_avisos_detalle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13534 entries, 0 to 13533
Data columns (total 11 columns):
idaviso                 13534 non-null int64
idpais                  13534 non-null int64
titulo                  13534 non-null object
descripcion             13534 non-null object
nombre_zona             13534 non-null object
ciudad                  47 non-null object
mapacalle               872 non-null object
tipo_de_trabajo         13534 non-null object
nivel_laboral           13534 non-null object
nombre_area             13534 non-null object
denominacion_empresa    13529 non-null object
dtypes: int64(2), object(9)
memory usage: 1.1+ MB


In [82]:
for dtype in ['int','object']:
    selected_dtype = fiuba_6_avisos_detalle.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for int columns: 0.07 MB
Average memory usage for object columns: 2.53 MB


In [83]:
fiuba_6_avisos_detalle_obj = fiuba_6_avisos_detalle.select_dtypes(include=['object']).copy()
fiuba_6_avisos_detalle_obj.describe()

Unnamed: 0,titulo,descripcion,nombre_zona,ciudad,mapacalle,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa
count,13534,13534,13534,47,872,13534,13534,13534,13529
unique,11650,12674,4,18,487,9,5,173,2592
top,Analistas de Testing Ssr/Sr con Automatización...,<p>Nos encontramos en la búsqueda de un ANALIS...,Gran Buenos Aires,Buenos Aires,SARMIENTO 1937,Full-time,Senior / Semi-Senior,Ventas,RANDSTAD
freq,22,19,12654,14,43,12339,9407,1659,562


In [119]:
converted_obj = pd.DataFrame()

for col in ['nombre_zona','ciudad','tipo_de_trabajo', 'nivel_laboral']:
    num_unique_values = len(fiuba_6_avisos_detalle_obj[col].unique())
    num_total_values = len(fiuba_6_avisos_detalle_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalle_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalle_obj[col]

In [120]:
print(mem_usage(fiuba_6_avisos_detalle_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_6_avisos_detalle_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

25.42 MB
0.06 MB


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  after removing the cwd from sys.path.


Unnamed: 0,before,after
object,9.0,
category,,1.0
category,,1.0
category,,1.0
category,,1.0


In [121]:
fiuba_6_avisos_detalle_int = fiuba_6_avisos_detalle.select_dtypes(include=['int']).copy()
fiuba_6_avisos_detalle_int.describe()

Unnamed: 0,idaviso,idpais
count,13534.0,13534.0
mean,1110816000.0,1.0
std,32358170.0,0.0
min,1585950.0,1.0
25%,1112222000.0,1.0
50%,1112279000.0,1.0
75%,1112339000.0,1.0
max,1112389000.0,1.0


In [122]:
fiuba_6_avisos_detalle_int = fiuba_6_avisos_detalle.select_dtypes(include=['int'])
converted_int = fiuba_6_avisos_detalle_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_6_avisos_detalle_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_6_avisos_detalle_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

0.21 MB
0.06 MB


Unnamed: 0,before,after
uint8,,1.0
uint32,,1.0
int64,2.0,


In [123]:
optimized_fiuba_6_avisos_detalle = fiuba_6_avisos_detalle.copy()

optimized_fiuba_6_avisos_detalle[converted_int.columns] = converted_int
optimized_fiuba_6_avisos_detalle[converted_obj.columns] = converted_obj

print(mem_usage(fiuba_6_avisos_detalle))
print(mem_usage(optimized_fiuba_6_avisos_detalle))

25.62 MB
22.34 MB


In [124]:
dtypes = converted_obj.dtypes

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

# rather than print all 161 items, we'll
# sample 10 key/value pairs from the dict
# and print it nicely using prettyprint

preview = first2pairs = {key:value for key,value in list(column_types.items())[:10]}
import pprint
pp = pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)

{   'ciudad': 'category',
    'nivel_laboral': 'category',
    'nombre_zona': 'category',
    'tipo_de_trabajo': 'category'}


In [21]:
optimized_fiuba_6_avisos_detalle.to_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalle-reduced.csv')

In [26]:
fiuba_6_avisos_detalle_missing_nivel_laboral = pd.read_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalle_missing_nivel_laboral.csv')

In [24]:
fiuba_6_avisos_detalle_missing_nivel_laboral.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 11 columns):
idaviso                 338 non-null int64
idpais                  338 non-null int64
titulo                  338 non-null object
descripcion             338 non-null object
nombre_zona             338 non-null object
ciudad                  28 non-null object
mapacalle               271 non-null object
tipo_de_trabajo         338 non-null object
nivel_laboral           3 non-null object
nombre_area             338 non-null object
denominacion_empresa    338 non-null object
dtypes: int64(2), object(9)
memory usage: 29.1+ KB


In [27]:
for dtype in ['int','object']:
    selected_dtype = fiuba_6_avisos_detalle_missing_nivel_laboral.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for int columns: 0.00 MB
Average memory usage for object columns: 0.07 MB


In [28]:
fiuba_6_avisos_detalle_missing_nivel_laboral_obj = fiuba_6_avisos_detalle_missing_nivel_laboral.select_dtypes(include=['object']).copy()
fiuba_6_avisos_detalle_missing_nivel_laboral_obj.describe()

Unnamed: 0,titulo,descripcion,nombre_zona,ciudad,mapacalle,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa
count,338,338,338,28,271,338,3,338,338
unique,332,336,6,9,64,5,1,82,102
top,Telefonista vendedores,"<p style="""">Requisitos:<br />Edad: 21 a 40 año...",Capital Federal,Buenos Aires,Capital Federal,Full-time,Senior / Semi-Senior,Ventas,Gestion Compartida S.A.
freq,2,2,268,9,107,308,3,28,28


In [29]:
converted_obj = pd.DataFrame()

for col in fiuba_6_avisos_detalle_missing_nivel_laboral_obj.columns:
    num_unique_values = len(fiuba_6_avisos_detalle_missing_nivel_laboral_obj[col].unique())
    num_total_values = len(fiuba_6_avisos_detalle_missing_nivel_laboral_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalle_missing_nivel_laboral_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalle_missing_nivel_laboral_obj[col]

In [30]:
print(mem_usage(fiuba_6_avisos_detalle_missing_nivel_laboral_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_6_avisos_detalle_missing_nivel_laboral_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

1.15 MB
1.04 MB


Unnamed: 0,before,after
object,9.0,2
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1


In [31]:
fiuba_6_avisos_detalle_missing_nivel_laboral_int = fiuba_6_avisos_detalle_missing_nivel_laboral.select_dtypes(include=['int']).copy()
fiuba_6_avisos_detalle_missing_nivel_laboral_int.describe()

Unnamed: 0,idaviso,idpais
count,338.0,338.0
mean,1112038000.0,1.0
std,6049318.0,0.0
min,1001284000.0,1.0
25%,1112416000.0,1.0
50%,1112431000.0,1.0
75%,1112455000.0,1.0
max,1112471000.0,1.0


In [32]:
fiuba_6_avisos_detalle_missing_nivel_laboral_int = fiuba_6_avisos_detalle_missing_nivel_laboral_int.select_dtypes(include=['int'])
converted_int = fiuba_6_avisos_detalle_missing_nivel_laboral_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_6_avisos_detalle_missing_nivel_laboral_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_6_avisos_detalle_missing_nivel_laboral_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

0.01 MB
0.00 MB


Unnamed: 0,before,after
uint8,,1.0
uint32,,1.0
int64,2.0,


In [33]:
optimized_fiuba_6_avisos_detalle_missing_nivel_laboral = fiuba_6_avisos_detalle_missing_nivel_laboral.copy()

optimized_fiuba_6_avisos_detalle_missing_nivel_laboral[converted_int.columns] = converted_int
optimized_fiuba_6_avisos_detalle_missing_nivel_laboral[converted_obj.columns] = converted_obj

print(mem_usage(fiuba_6_avisos_detalle_missing_nivel_laboral))
print(mem_usage(optimized_fiuba_6_avisos_detalle_missing_nivel_laboral))

1.15 MB
1.04 MB


In [34]:
optimized_fiuba_6_avisos_detalle_missing_nivel_laboral.to_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalle_missing_nivel_laboral-reduced.csv')

In [3]:
fiuba_6_avisos_detalleDesde15Abril = pd.read_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalleDesde15Abril.csv')

In [4]:
fiuba_6_avisos_detalleDesde15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13798 entries, 0 to 13797
Data columns (total 11 columns):
idaviso                 13798 non-null int64
idpais                  13798 non-null int64
titulo                  13798 non-null object
descripcion             13798 non-null object
nombre_zona             13798 non-null object
ciudad                  86 non-null object
mapacalle               1019 non-null object
tipo_de_trabajo         13798 non-null object
nivel_laboral           13798 non-null object
nombre_area             13798 non-null object
denominacion_empresa    13795 non-null object
dtypes: int64(2), object(9)
memory usage: 1.2+ MB


In [5]:
for dtype in ['int','object']:
    selected_dtype = fiuba_6_avisos_detalleDesde15Abril.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for int columns: 0.07 MB
Average memory usage for object columns: 2.57 MB


In [6]:
fiuba_6_avisos_detalleDesde15Abril_obj = fiuba_6_avisos_detalleDesde15Abril.select_dtypes(include=['object']).copy()
fiuba_6_avisos_detalleDesde15Abril_obj.describe()

Unnamed: 0,titulo,descripcion,nombre_zona,ciudad,mapacalle,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa
count,13798,13798,13798,86,1019,13798,13798,13798,13795
unique,12010,13134,15,37,591,9,5,182,2951
top,Analista Contable,"<p style=""""><strong><em style=""""><u><span styl...",Gran Buenos Aires,Buenos Aires,SARMIENTO 1937,Full-time,Senior / Semi-Senior,Ventas,RANDSTAD
freq,30,11,12603,16,48,12282,9157,1766,505


In [7]:
converted_obj = pd.DataFrame()

for col in fiuba_6_avisos_detalleDesde15Abril_obj.columns:
    num_unique_values = len(fiuba_6_avisos_detalleDesde15Abril_obj[col].unique())
    num_total_values = len(fiuba_6_avisos_detalleDesde15Abril_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalleDesde15Abril_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalleDesde15Abril_obj[col]

In [8]:
print(mem_usage(fiuba_6_avisos_detalleDesde15Abril_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_6_avisos_detalleDesde15Abril_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

43.87 MB
38.51 MB


Unnamed: 0,before,after
object,9.0,2
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1


In [9]:
fiuba_6_avisos_detalleDesde15Abril_int = fiuba_6_avisos_detalleDesde15Abril.select_dtypes(include=['int']).copy()
fiuba_6_avisos_detalleDesde15Abril_int.describe()

Unnamed: 0,idaviso,idpais
count,13798.0,13798.0
mean,1097216000.0,1.0
std,126150200.0,0.0
min,739260.0,1.0
25%,1112160000.0,1.0
50%,1112405000.0,1.0
75%,1112463000.0,1.0
max,1112515000.0,1.0


In [10]:
fiuba_6_avisos_detalleDesde15Abril_int = fiuba_6_avisos_detalleDesde15Abril_int.select_dtypes(include=['int'])
converted_int = fiuba_6_avisos_detalleDesde15Abril_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_6_avisos_detalleDesde15Abril_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_6_avisos_detalleDesde15Abril_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

0.21 MB
0.07 MB


Unnamed: 0,before,after
uint8,,1.0
uint32,,1.0
int64,2.0,


In [11]:
optimized_fiuba_6_avisos_detalleDesde15Abril = fiuba_6_avisos_detalleDesde15Abril.copy()

optimized_fiuba_6_avisos_detalleDesde15Abril[converted_int.columns] = converted_int
optimized_fiuba_6_avisos_detalleDesde15Abril[converted_obj.columns] = converted_obj

print(mem_usage(fiuba_6_avisos_detalleDesde15Abril))
print(mem_usage(optimized_fiuba_6_avisos_detalleDesde15Abril))

44.08 MB
38.57 MB


In [24]:
optimized_fiuba_6_avisos_detalleDesde15Abril.to_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalleDesde15Abril-reduced.csv')

In [13]:
fiuba_6_avisos_detalleHasta15Abril = pd.read_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalleHasta15Abril.csv')

In [14]:
fiuba_6_avisos_detalleHasta15Abril.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18299 entries, 0 to 18298
Data columns (total 11 columns):
idaviso                 18299 non-null int64
idpais                  18299 non-null int64
titulo                  18299 non-null object
descripcion             18299 non-null object
nombre_zona             18299 non-null object
ciudad                  64 non-null object
mapacalle               1282 non-null object
tipo_de_trabajo         18299 non-null object
nivel_laboral           18299 non-null object
nombre_area             18299 non-null object
denominacion_empresa    18293 non-null object
dtypes: int64(2), object(9)
memory usage: 1.5+ MB


In [15]:
for dtype in ['int','object']:
    selected_dtype = fiuba_6_avisos_detalleHasta15Abril.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for int columns: 0.09 MB
Average memory usage for object columns: 3.42 MB


In [16]:
fiuba_6_avisos_detalleHasta15Abril_obj = fiuba_6_avisos_detalleHasta15Abril.select_dtypes(include=['object']).copy()
fiuba_6_avisos_detalleHasta15Abril_obj.describe()

Unnamed: 0,titulo,descripcion,nombre_zona,ciudad,mapacalle,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa
count,18299,18299,18299,64,1282,18299,18299,18299,18293
unique,15337,16992,3,21,633,10,5,186,3279
top,Analista Contable,<p>En <strong>SOLUTIX</strong> seleccionamos <...,Gran Buenos Aires,Argentina,SARMIENTO 1937,Full-time,Senior / Semi-Senior,Ventas,RANDSTAD
freq,32,18,17016,22,98,16769,12713,2268,774


In [17]:
converted_obj = pd.DataFrame()

for col in fiuba_6_avisos_detalleHasta15Abril_obj.columns:
    num_unique_values = len(fiuba_6_avisos_detalleHasta15Abril_obj[col].unique())
    num_total_values = len(fiuba_6_avisos_detalleHasta15Abril_obj[col])
    if num_unique_values / num_total_values < 0.5:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalleHasta15Abril_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = fiuba_6_avisos_detalleHasta15Abril_obj[col]

In [18]:
print(mem_usage(fiuba_6_avisos_detalleHasta15Abril_obj))
print(mem_usage(converted_obj))

compare_obj = pd.concat([fiuba_6_avisos_detalleHasta15Abril_obj.dtypes,converted_obj.dtypes],axis=1)
compare_obj.columns = ['before','after']
compare_obj.apply(pd.Series.value_counts)

58.43 MB
51.28 MB


Unnamed: 0,before,after
object,9.0,2
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1
category,,1


In [19]:
fiuba_6_avisos_detalleHasta15Abril_int = fiuba_6_avisos_detalleHasta15Abril.select_dtypes(include=['int']).copy()
fiuba_6_avisos_detalleHasta15Abril_int.describe()

Unnamed: 0,idaviso,idpais
count,18299.0,18299.0
mean,1111665000.0,1.0
std,17552090.0,0.0
min,8725750.0,1.0
25%,1112257000.0,1.0
50%,1112336000.0,1.0
75%,1112403000.0,1.0
max,1112471000.0,1.0


In [20]:
fiuba_6_avisos_detalleHasta15Abril_int = fiuba_6_avisos_detalleHasta15Abril_int.select_dtypes(include=['int'])
converted_int = fiuba_6_avisos_detalleHasta15Abril_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(fiuba_6_avisos_detalleHasta15Abril_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([fiuba_6_avisos_detalleHasta15Abril_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

0.28 MB
0.09 MB


Unnamed: 0,before,after
uint8,,1.0
uint32,,1.0
int64,2.0,


In [21]:
optimized_fiuba_6_avisos_detalleHasta15Abril = fiuba_6_avisos_detalleHasta15Abril.copy()

optimized_fiuba_6_avisos_detalleHasta15Abril[converted_int.columns] = converted_int
optimized_fiuba_6_avisos_detalleHasta15Abril[converted_obj.columns] = converted_obj

print(mem_usage(fiuba_6_avisos_detalleHasta15Abril))
print(mem_usage(optimized_fiuba_6_avisos_detalleHasta15Abril))

58.70 MB
51.36 MB


In [23]:
optimized_fiuba_6_avisos_detalleHasta15Abril.to_csv('Data/Avisos-Detalle/fiuba_6_avisos_detalleHasta15Abril-reduced.csv')

# Verifico que no haya IDs nulos

In [27]:
fiuba_6_avisos_detalle['idaviso'].isnull().any()

False

In [31]:
fiuba_6_avisos_detalle_missing_nivel_laboral['idaviso'].isnull().any()

False

In [32]:
fiuba_6_avisos_detalleDesde15Abril['idaviso'].isnull().any()

False

In [34]:
fiuba_6_avisos_detalleHasta15Abril['idaviso'].isnull().any()

False

# Mergeo de CSVS de avisos detalle

In [35]:
path =r'/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Detalle'
allFiles = glob.glob(path + "/*.csv")
frame = pd.DataFrame()
list_ = []
for file_ in allFiles:
    print (file_)
    df = pd.read_csv(file_,low_memory=False)
    list_.append(df)
frame = pd.concat(list_)
frame.drop_duplicates(["idaviso"],inplace=True) #NO PUEDE HABER AVISOS REPETIDOS, por algo es unico un id
frame.to_csv('/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Detalle/Avisos-Detalle-merge.csv',sep=",",index=False)

/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Detalle/fiuba_6_avisos_detalleHasta15Abril-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Detalle/fiuba_6_avisos_detalleDesde15Abril-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Detalle/fiuba_6_avisos_detalle-reduced.csv
/home/lucio/Documentos/Datos/NaventDatosTP/Data/Avisos-Detalle/fiuba_6_avisos_detalle_missing_nivel_laboral-reduced.csv
