In [None]:
import io
import itertools as it
import json
import re
from random import sample

import h5py
import numpy as np
import pandas as pd
from gensim.corpora import dictionary
from joblib import Parallel, delayed
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from pymorphy2 import MorphAnalyzer
from sklearn.svm import SVC
from tqdm import tqdm_notebook

In [None]:
from utils import get_text_processor

In [None]:
# Raw string so that `\s` reaches the regex engine exactly as written instead of
# relying on Python passing an unrecognised string escape through unchanged.
# The pattern matches '#' followed by any run of characters that are neither '#'
# nor whitespace; the run may be empty, so a lone '#' also matches.
hash_tags_regex = r'#[^#\s]*'
# gaps=False: the pattern describes the tokens themselves, not the separators.
hash_tags_tokenizer = RegexpTokenizer(hash_tags_regex, gaps=False)

# Callable applied to the hashtag-free text; its processing is defined in the
# project-local `utils` module.
text_processing = get_text_processor()

def get_tags_and_process_text(text):
    """Separate the hashtags of ``text`` from the rest of its content.

    Returns a 2-tuple ``(tags, processed)``:

    * ``tags`` -- list of substrings matched by ``hash_tags_tokenizer``;
    * ``processed`` -- list built from ``text_processing`` applied to ``text``
      with every ``hash_tags_regex`` match removed.
    """
    hashtags = hash_tags_tokenizer.tokenize(text)
    text_without_tags = re.sub(hash_tags_regex, '', text)
    processed = list(text_processing(text_without_tags))
    return hashtags, processed

In [None]:
def do_all_with_line(line):
    """Parse one JSON line and keep it only if its processed text is long enough.

    ``line`` is a JSON object with at least the keys ``'text'`` and ``'img_url'``.

    Returns ``None`` when the processed text has fewer than 20 items, otherwise
    ``(img_url, (tags, processed))`` where the inner tuple is the result of
    ``get_tags_and_process_text``.

    NOTE: a later cell of this notebook defines another function with the same
    name; whichever cell ran last is the one in effect.
    """
    record = json.loads(line)
    tags_and_tokens = get_tags_and_process_text(record['text'])
    processed_tokens = tags_and_tokens[-1]

    # Texts with fewer than 20 processed items are treated as trash.
    if len(processed_tokens) < 20:
        return None
    return record['img_url'], tags_and_tokens

In [None]:
from joblib import Parallel, delayed

import io

In [None]:
%%time
with open('../data/up_sample_gena.json') as income:
    # One delayed call per input line, spread over two workers; entries rejected
    # by do_all_with_line stay in `data` as None.
    jobs = (delayed(do_all_with_line)(raw) for raw in tqdm_notebook(income))
    data = Parallel(n_jobs=2)(jobs)

In [None]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
        io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    # The inner progress bar follows reading/processing; the outer one follows
    # the write loop over the collected results.
    pending = (delayed(do_all_with_line)(raw) for raw in tqdm_notebook(income))
    processed = Parallel(n_jobs=2)(pending)
    for item in tqdm_notebook(processed):
        if item is None:
            # Rejected by do_all_with_line.
            continue
        outcome.write(u'%s\n' % json.dumps(item))

In [None]:
import itertools as it

from tqdm import tqdm_notebook

In [None]:
%%time
with open('../data/users_photos.full.backup.json') as income,\
        io.open('../data/big_clean_data.json', 'w', encoding='utf8') as outcome:
    # Single-process variant: process lines lazily and write one JSON document
    # per kept line.
    results = tqdm_notebook(it.imap(do_all_with_line, income))
    for serialised in (json.dumps(item) for item in results if item is not None):
        outcome.write(u'%s\n' % serialised)

In [None]:
import io

In [None]:
import h5py

In [None]:
# Read back the cleaned documents, one JSON value per line.
# The file is written with encoding='utf8', so read it the same way rather than
# with the platform default, and build the list while the file is still open.
with io.open('../data/big_clean_data.json', encoding='utf8') as income:
    long_enough = [json.loads(raw) for raw in income]

In [None]:
# Number of records currently held in long_enough.
len(long_enough)

In [None]:
with open('../data/big_cleaned_andlenght-filtered.json') as income:
    df = pd.DataFrame(map(json.loads, income))

with h5py.File("../data/img_url2inception.backup.h5", 'r') as hdf5_inception_dreams:
    %time df['classes'] = df.img_url.apply(hdf5_inception_dreams.get).apply(np.array) # Aware! Random disc access!

df.dropna(axis=0, subset=['classes'], inplace=True)

In [None]:
# URLs of the rows remaining in df, as a set for fast membership tests.
interesting = set(df.img_url)

In [None]:
# Keep only the records whose first element (the image URL) is in `interesting`.
filtered_raw = [record for record in long_enough if record[0] in interesting]

In [None]:
# Number of records that survived the URL filter.
len(filtered_raw)

In [None]:
# Rebind long_enough to the filtered list; the unfiltered list is no longer
# reachable under this name after this cell runs.
long_enough = filtered_raw

In [None]:
def do_all_with_line(line):
    """Parse one JSON line and keep it only if its image URL is in ``interesting``.

    Returns the parsed document unchanged when ``doc['img_url']`` belongs to the
    notebook-level ``interesting`` set, otherwise ``None``.

    NOTE: this replaces the function of the same name defined in an earlier
    cell, which returned ``(img_url, (tags, processed))`` instead.
    """
    record = json.loads(line)
    if record['img_url'] in interesting:
        return record
    return None

In [None]:
from tqdm import tqdm_notebook
import itertools as it

In [None]:
%%time

# Collect every parsed document that do_all_with_line did not reject.
with open('../data/users_photos.full.backup.json') as income:
    res = [
        kept
        for kept in tqdm_notebook(it.imap(do_all_with_line, income))
        if kept is not None
    ]

In [None]:
# Rebuild the (img_url, (tags, processed_text)) pairs from the raw documents.
alt_long_enouth = [
    (raw_doc['img_url'], get_tags_and_process_text(raw_doc['text']))
    for raw_doc in tqdm_notebook(res)
]

In [None]:
# Rebind long_enough to the pairs rebuilt from the raw documents.
long_enough = alt_long_enouth

In [None]:
# Turn the (img_url, (tags, text)) pairs into a frame indexed by URL.
# The pairs are walked directly instead of subscripting zip(*long_enough), which
# built the transposed data twice, only works where zip returns a list, and
# raised IndexError on empty input.
pair_urls = [pair[0] for pair in long_enough]
pair_rows = [list(pair[-1]) for pair in long_enough]

long_enough_df = pd.DataFrame(pair_rows, index=pair_urls, columns=['tag', 'text'])

# Keep the first row for every URL; the resulting index is named 'index'.
long_enough_df = long_enough_df.reset_index().drop_duplicates('index').set_index('index')

In [None]:
# Build a gensim token <-> id dictionary from the processed token lists.
gd = dictionary.Dictionary(documents=long_enough_df.text)

# Prune tokens using filter_extremes' default thresholds (no arguments are passed).
gd.filter_extremes()

# Reassign ids so that no gaps remain after pruning.
gd.compactify()

In [None]:
# Hand-labelled URL sets, filled through add_to. Re-running this cell empties both.
goods, bads = set(), set()

In [None]:
def add_to(name, setting, what):
    """Add the whitespace-stripped ``what`` to the set ``setting`` and report it.

    Parameters
    ----------
    name : str
        Label used only in the printed message.
    setting : set
        Set that is mutated in place.
    what : str
        Value to add; leading and trailing whitespace is removed first.

    Prints the added value and the size of ``setting`` afterwards. Returns None.
    """
    label = what.strip()
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print('adding "%s"' % label)
    setting.add(label)
    print('length of %s is %i' % (name, len(setting)))

In [None]:
# Draw one random (img_url, (tags, text)) entry and print its URL and text items
# for manual review. `u` stays bound after the loop; the labelling cells pass it
# to add_to.
for u, d in sample(long_enough, 1):
    print u, ', '.join(d[-1])
    print 

In [None]:
# Label the URL currently held in `u` as bad.
add_to('bads', bads, u)

In [None]:
# Label the URL currently held in `u` as good.
add_to('goods', goods, u)

In [None]:
def to_vector(bow):
    """Expand a sparse bag-of-words into a dense list of counts.

    ``bow`` is an iterable of ``(token_id, count)`` pairs. The result has
    ``len(gd) + 1`` entries, where ``gd`` is the notebook-level dictionary;
    ids missing from ``bow`` get a count of 0.
    """
    counts = dict(bow)
    width = len(gd) + 1
    return [counts.get(token_id, 0) for token_id in range(width)]

In [None]:
# Linear-kernel SVM with probability estimates enabled and a fixed random seed.
model = SVC(kernel='linear', probability=True, random_state=42)

In [None]:
# Training set: every hand-labelled "bad" URL first (label 1), then every "good"
# one (label 0). Sorted lists replace the raw sets so that the row order is
# reproducible and .loc receives a list indexer; pd.concat replaces
# DataFrame.append.
bad_urls, good_urls = sorted(bads), sorted(goods)
train_texts = pd.concat([long_enough_df.loc[bad_urls, 'text'],
                         long_enough_df.loc[good_urls, 'text']])
train_vectors = np.array([to_vector(gd.doc2bow(tokens)) for tokens in train_texts])
train_labels = [1] * len(bad_urls) + [0] * len(good_urls)
model.fit(train_vectors, train_labels)

In [None]:
# Predict row by row: text -> bag-of-words -> dense vector -> single-row matrix.
# Each entry of preds is the array returned by model.predict for that one row.
preds = long_enough_df.text.apply(gd.doc2bow).apply(to_vector).apply(lambda x: model.predict(np.matrix(x)))

In [None]:
# Manually label this specific URL as good.
add_to('goods', goods, 'https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg')

In [None]:
# Manually label this specific URL as bad.
add_to('bads', bads, 'https://pp.vk.me/c836121/v836121322/1674e/mz9wrbfnuYU.jpg')

In [None]:
# Count the rows whose predicted label is below 0.5, i.e. label 0, the label the
# training cell assigns to `goods`.
sum(preds.apply(lambda x: x[0]) < 0.5)

In [None]:
# Show 10 rows drawn with replacement from those predicted as label 0.
long_enough_df.loc[preds.apply(lambda x: x[0]) < 0.5].sample(replace=True, n=10)

In [None]:
# Print the processed text items stored for one specific URL.
print ', '.join(long_enough_df.loc['https://pp.vk.me/c222/v222237/2ec/VuHMGBXxSJ4.jpg'].text)

In [None]:
# Persist the "bad" labels, one URL per line.
with open('../data/bads', 'w') as outcome:
    outcome.write(''.join('%s\n' % url for url in bads))

In [None]:
# Restore the "bad" labels from disk, dropping surrounding whitespace and newlines.
with open('../data/bads') as income:
    bads = {raw_line.strip() for raw_line in income}


In [None]:
# Persist the "good" labels, one URL per line.
with open('../data/goods', 'w') as outcome:
    outcome.write(''.join('%s\n' % url for url in goods))

In [None]:
# Restore the "good" labels from disk, dropping surrounding whitespace and newlines.
with open('../data/goods') as income:
    goods = {raw_line.strip() for raw_line in income}


In [None]:
import io

In [None]:
# Export the frame as JSON lines: the 'tag' and 'text' columns plus the URL from
# the index under 'img_url'.
# Dedicated loop names are used so that this cell no longer overwrites the
# notebook globals `u` (the URL the labelling cells pass to add_to) and `data`.
with io.open('../data/big_cleaned_andlenght-filtered.json', 'w', encoding='utf8') as outcome:
    for export_url, export_row in long_enough_df.iterrows():
        record = export_row.to_dict()
        record['img_url'] = export_url
        outcome.write(u'%s\n' % json.dumps(record))