In [None]:
def get_size(obj, seen=None):
    """Recursively estimate the memory footprint of ``obj`` in bytes.

    Starts from ``sys.getsizeof(obj)`` and adds the sizes of the objects it
    refers to:

    * ``dict`` instances: every value and every key;
    * objects that expose ``__dict__``: their attribute dictionary (their
      elements are *not* visited, even if the object is also iterable);
    * other iterables, except ``str``/``bytes``/``bytearray`` and one-shot
      iterators: every element.

    One-shot iterators (anything with ``__next__``: generators, ``iter(...)``
    results, open files, ``map``/``zip`` objects, ...) are deliberately not
    walked. Iterating them would consume the caller's data, may never
    terminate, and the values they produce are not stored in memory anyway.

    Args:
        obj: The object to measure.
        seen: Set of ``id()`` values already counted. Callers normally leave
            this as ``None``; it is threaded through the recursion so that
            shared and self-referential objects are counted only once.

    Returns:
        The accumulated size in bytes, or ``0`` if ``obj`` was already
        counted through ``seen``.
    """
    # Imported locally so the function also works when this cell is run before
    # the notebook cell that imports ``sys``.
    import sys

    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully handle
    # self-referential objects.
    seen.add(obj_id)

    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(get_size(v, seen) for v in obj.values())
        size += sum(get_size(k, seen) for k in obj.keys())
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif (
        hasattr(obj, '__iter__')
        # Skip one-shot iterators: walking them would exhaust them.
        and not hasattr(obj, '__next__')
        and not isinstance(obj, (str, bytes, bytearray))
    ):
        size += sum(get_size(item, seen) for item in obj)
    return size

In [None]:
import pandas as pd
import pickle
import re
import sys
from sklearn.preprocessing import Binarizer

def clean_name(name):
    """Normalise an item name: blank out noise characters, then lowercase.

    Two substitution passes are applied in order, each replacing every match
    with a single space (so the string length is preserved):

    1. any character that is not a regex word character (``\\w``);
    2. the punctuation ``, : . _ - [ ]`` and any digit.

    Args:
        name: The raw item name (a ``str``).

    Returns:
        The lowercased string after both substitutions.
    """
    cleaned = name
    for pattern in (r'[^\w]', r'[,:._\-\[\]\d]'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.lower()


# Load the dataset and keep only the rows whose category_id is -1
# (presumably the unlabeled items to be predicted — TODO confirm).
test = pd.read_parquet('data_fusion_train.parquet')
# .copy() makes the filtered frame independent of the full one, so the column
# assignment below writes to a real frame rather than to a slice
# (avoids pandas' SettingWithCopyWarning).
test = test[test.category_id == -1].copy()
test['item_name'] = test['item_name'].apply(clean_name)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load artifacts that were produced by a trusted process.
# Context managers make sure both file handles are closed after loading.
with open('submit_names/tfidf', 'rb') as fh:
    freq = pickle.load(fh)  # presumably a fitted text vectorizer — TODO confirm
with open('submit_names/clf_task1', 'rb') as fh:
    clf = pickle.load(fh)

print('Load is completed...')

# Vectorize the cleaned names, then binarize the resulting matrix with a
# default-constructed Binarizer.
X_test = freq.transform(test.item_name)
onehot = Binarizer()
X_test = onehot.fit_transform(X_test)

print('Text vectorize is completed')

In [None]:
# Run the unpickled classifier on the binarized feature matrix built in the
# previous cell; `pred` holds whatever `clf.predict` returns for those rows.
pred = clf.predict(X_test)