In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from toshl_fixer.common import DATA_DIR, csv_name


In [16]:
def vectorize_desc(data):
    """Turn a collection of text documents into a dense TF-IDF feature table.

    A fresh ``TfidfVectorizer`` (default settings) is fitted on ``data`` and
    the resulting document-term matrix is converted to a dense DataFrame.

    Parameters
    ----------
    data : iterable of str
        The raw documents to vectorise (e.g. a Series of descriptions).

    Returns
    -------
    pandas.DataFrame
        One row per input document and one column per vocabulary term,
        holding the TF-IDF weights. The frame carries a default 0..n-1
        index, not the index of ``data``.
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(data)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back on old installs.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # toarray() yields a plain ndarray directly, avoiding the
    # np.matrix -> nested Python list round trip.
    return pd.DataFrame(vectors.toarray(), columns=feature_names)

def create_vector(data):
    """Build the model's feature table: TF-IDF of ``desc`` plus ``amount``.

    Parameters
    ----------
    data : pandas.DataFrame (or mapping of columns)
        Must provide a ``'desc'`` column of text and an ``'amount'`` column.

    Returns
    -------
    pandas.DataFrame
        The TF-IDF columns produced by ``vectorize_desc`` with an extra
        ``'amount'`` column, one row per row of ``data`` in the same order.
        If the vocabulary itself contains the term ``amount``, that TF-IDF
        column is overwritten by the monetary amount.
    """
    df = vectorize_desc(data['desc'])
    # vectorize_desc returns a frame with a fresh 0..n-1 index, so assigning
    # the Series directly would align on labels and produce NaN / shifted
    # amounts whenever `data` has a non-default index. Assign by position.
    df['amount'] = pd.Series(data['amount']).to_numpy()
    return df


In [17]:
# Load one month of transactions to fit on and another month to evaluate on.
training_data = pd.read_csv(
    DATA_DIR / csv_name(from_date="2019-12-01", to_date="2019-12-31")
)
# NOTE(review): November has 30 days; "2019-11-31" is passed to csv_name
# verbatim and presumably has to match an existing file name - confirm
# before changing it.
test_data = pd.read_csv(
    DATA_DIR / csv_name(from_date="2019-11-01", to_date="2019-11-31")
)
# Remember where the training rows end so the combined frame can be split
# back by position later.
train_len = training_data.shape[0]

# Stack both months into one frame with a fresh 0..n-1 index, replacing
# every missing value (in any column) with the literal string 'missing'.
data = pd.concat(
    [training_data, test_data], ignore_index=True, sort=False
).fillna('missing')

# The prediction target is the combined "category/tag" label; the separate
# source columns and the id are dropped afterwards.
data['cat/tag'] = data['category'].str.cat(data['tag'], sep='/')
data = data.drop(columns=['id', 'category', 'tag'])

In [18]:
# Vectorise training and test rows together so both halves end up with the
# same feature columns, then split back by position.
# NOTE: this means the TF-IDF vocabulary and weights are fitted on the test
# descriptions as well as the training ones.
vec = create_vector(data)
training_vec = vec.iloc[:train_len]
test_vec = vec.iloc[train_len:]
# .copy() makes these independent frames rather than slices of `data`, so
# adding a column to them later neither warns (SettingWithCopyWarning) nor
# depends on whether pandas returned a view.
training_data = data.iloc[:train_len].copy()
test_data = data.iloc[train_len:].copy()
test_data

Unnamed: 0,desc,amount,date,cat/tag
128,הפניקס אוטו חובה,-702.00,2019-12-01,Basics/Charity
129,WIND MOBILITY GMBH BERLIN DE,-35.50,2019-12-01,Transport/scooters
130,"טיב טעם יהודה המכבי ת""א",-110.33,2019-12-01,Basics/groceries
131,קיוסק יהודה מכבי 55 תל אב,-32.00,2019-12-01,Basics/groceries
132,רובין הוד,-20.00,2019-12-01,Leisure/apps
...,...,...,...,...
301,"זארה רמת אביב ת""א",-341.50,2019-11-01,Other/missing
302,"טיב טעם יהודה המכבי ת""א",-18.50,2019-11-01,Basics/groceries
303,"סופר זול בן גוריון בע""מ-ג",-30.00,2019-11-01,Other/missing
304,עמל.ערוץ יש 7,-11.55,2019-11-01,Health & Personal Care/missing


In [19]:
from sklearn import tree

# Features: the TF-IDF columns plus the amount; target: the combined
# "category/tag" label of each training transaction.
X = training_vec
Y = training_data['cat/tag']
# Pin random_state: the tree builder breaks ties between equally good splits
# at random, so without a fixed seed the fitted tree (and the predictions
# reported further down) can change from one run to the next.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X, Y)

In [22]:
# Predict every test row in one call instead of looping with iterrows().
# Passing the DataFrame itself keeps the column names the classifier was
# fitted with; predictions come back in row order, matching test_data.
# assign() returns a new frame, so nothing is written into a slice of `data`.
test_data = test_data.assign(label=clf.predict(test_vec))

# Show only the rows where the predicted label disagrees with the true one.
test_data[test_data['label'] != test_data['cat/tag']]

Unnamed: 0,desc,amount,date,cat/tag,label
137,ח-ן שרות סלקום,-205.49,2019-11-29,Home & Utilities/phone,Basics/groceries
138,AMZN MKTP US*G,-219.61,2019-11-29,Other/missing,Basics/groceries
139,AMZN Mktp US*G15RS21Z3,-219.61,2019-11-29,Other/missing,Basics/groceries
141,פייפר עודפים,-829.26,2019-11-29,Clothing/clothes,Education/missing
142,חופית YELLOW / פז,-275.04,2019-11-29,Transport/car,Other/missing
...,...,...,...,...,...
299,הפניקס אוטו חובה,-702.00,2019-11-01,unsorted/missing,Basics/Charity
300,אספרסו בר,-224.00,2019-11-01,Leisure/Eating out,Basics/groceries
301,"זארה רמת אביב ת""א",-341.50,2019-11-01,Other/missing,Clothing/clothes
304,עמל.ערוץ יש 7,-11.55,2019-11-01,Health & Personal Care/missing,Taxes & Insurance/bank fees
