#### Imports


In [None]:
import numpy as np 
import pandas as pd

import os

In [None]:
!pip install pymystem3



In [None]:
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

from scipy.sparse import hstack
from scipy.sparse import csr_matrix

try:
    # joblib is a standalone package; the sklearn.externals.joblib alias was
    # removed in scikit-learn 0.23.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the bundled copy.
    from sklearn.externals import joblib

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




True

#### Загрузка данных

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
 path = "/content/drive/MyDrive/Inprac_data/raw_data"

In [None]:
# Read the raw training table from the mounted drive folder.
pd_categorized_data = pd.read_csv(path + "/train.csv")
# Keep an independent copy of the label column; it is used as the target below.
target = pd_categorized_data.category_id.copy()
# Remove the label column from the frame in place (re-running this cell alone
# fails because the column is already gone).
del pd_categorized_data['category_id']


#### TF-IDF

In [None]:
stop_words = stopwords.words("russian")

def tokenize(text):
    """Split ``text`` on whitespace and drop stop words and punctuation tokens.

    ``text`` is either a string or a NumPy array of strings. Array elements
    are concatenated in order with no separator before splitting, so the last
    word of one element fuses with the first word of the next unless the
    elements carry their own whitespace.

    A token is dropped when it is in the module-level ``stop_words`` or when
    its stripped form is a substring of ``string.punctuation``.

    Returns the list of surviving tokens in their original order.
    """
    if isinstance(text, np.ndarray):
        text = ''.join(text)

    def _is_noise(word):
        return word in stop_words or word == " " or word.strip() in punctuation

    return [word for word in text.split() if not _is_noise(word)]


In [None]:
# Folder that holds the preprocessed query / description files.
path_to_prepr_data = "/content/drive/MyDrive/Inprac_data"
# Both files are read without a header row, so the text ends up in column 0.
prepr_queries_data = pd.read_csv(path_to_prepr_data + "/prepr_queries.csv", header = None)
# A multi-character separator requires the python engine; presumably '<<' never
# occurs in the text so each description stays in one column - TODO confirm.
prepr_descriptions_data = pd.read_csv(path_to_prepr_data + "/prepr_descriptions.csv", header = None,\
                                      sep ='<<', engine = 'python')
# Assemble one frame with the description, the query text and the label.
# Assignment aligns on the row index; assumes all three sources share the same
# row order as train.csv - TODO confirm.
prepr_queries_data['descr'] = prepr_descriptions_data[0]
prepr_queries_data['query'] = prepr_queries_data[0]
prepr_queries_data['category_id'] = target

#### Train

In [None]:
# 80/20 split with a fixed seed. Each sample is a 2-element array
# [query, descr]; tokenize() concatenates the two strings before splitting.
X_train, X_valid, y_train, y_valid = train_test_split(
    prepr_queries_data[['query', 'descr']].values, target, test_size=0.2, random_state=42)

#### Pipeline


In [None]:


# TF-IDF over the custom tokenizer; lowercase=False leaves the input untouched
# before tokenize() is called on it.
vectorizer = TfidfVectorizer(tokenizer=tokenize, lowercase=False)
# Linear SVM with default hyper-parameters.
model = LinearSVC()
# Chain vectorizer and classifier so they are fitted and pickled together.
pipeline = make_pipeline(vectorizer, model)


#### FIT


In [None]:
pipeline.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at 0x7fa0677b0050>,
                                 use_idf=True, vocabulary=None)),
                ('linearsvc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_in

In [None]:
# Predict categories for the held-out split.
prediction = pipeline.predict(X_valid)
# NOTE(review): accuracy_score is documented as (y_true, y_pred); the arguments
# are swapped here, which does not change the accuracy value.
# `acc` is later read by the heuristic as the classifier's accuracy.
acc = accuracy_score(prediction, y_valid)

In [None]:
print(acc)

0.8651229776107207


In [None]:

joblib.dump(pipeline, path + '/pipeline.pkl', compress=1)

['/content/drive/MyDrive/Inprac_data/raw_data/pipeline.pkl']

#### эвристика

In [None]:
# Reload the pipeline that was dumped to the drive folder above.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
with open(path + '/pipeline.pkl','rb') as g:
  class_model = joblib.load(g)

In [None]:
def get_class(resp):
  """Return the ``category_id`` attribute (column) of ``resp``."""
  category_ids = resp.category_id
  return category_ids


In [None]:
### 
# path_to_category = 
# def get_class(resp):
#   pd_categorized_data = pd.read_csv(path_to_category)
#   classes = pd_categorized_data.category_id.copy()

#   return classes.iloc[resp.index]

In [None]:
def ev_heurist(query, responses, accuracy=None):
  """Score how well a ranked result list matches the query's predicted category.

  The result at rank ``i`` (0-based) carries weight ``n - i`` for ``n``
  results. The score is the summed weight of the results whose category equals
  the category predicted for ``query``, divided by the maximum possible sum
  ``n * (n + 1) / 2``, then divided by the classifier accuracy and capped at 1.

  Parameters
  ----------
  query : input accepted by ``class_model.predict`` (a single query).
  responses : ranked results; passed to ``get_class`` to obtain their categories.
  accuracy : classifier accuracy used for the correction; when omitted the
      module-level ``acc`` is used.

  Returns
  -------
  A number in [0, 1]; 0.0 when ``responses`` is empty.
  """
  res_size = responses.shape[0]  # number of documents
  if res_size == 0:
    # No results: avoid 0/0, which would turn into the maximum score via min(1, nan).
    return 0.0

  if accuracy is None:
    # The original referenced self.accuracy, which does not exist in a plain function.
    accuracy = acc

  grades = np.arange(1, res_size + 1)[::-1]
  max_grade = res_size * (res_size + 1) / 2

  q_class_pred = class_model.predict(query)
  resp_class = get_class(responses)

  # Positional boolean mask: True where the result's category equals the prediction.
  is_eq = np.asarray(resp_class == q_class_pred * np.ones(res_size))

  score = grades[is_eq].sum() / max_grade

  return min(1, score * (1. / accuracy))


In [None]:
# Re-read the preprocessed queries and descriptions (same files as in the
# TF-IDF section) into a separate frame used by the heuristic experiments.
q_data = pd.read_csv(path_to_prepr_data + "/prepr_queries.csv", header = None)
# Multi-character separator, hence the python engine.
descr_data = pd.read_csv(path_to_prepr_data + "/prepr_descriptions.csv", header = None,\
                                      sep ='<<', engine = 'python')
# Column 0 keeps the query text; add the description and the label next to it.
q_data['descr'] = descr_data[0]
q_data['category_id'] = target

In [None]:
import random

real_size = q_data.shape[0]

# randrange excludes the upper bound. randint(0, real_size) could return
# real_size itself, which is one past the last valid iloc position.
idx = random.randrange(int(real_size))
# One-row Series holding the query text (column 0) of the sampled row.
rand_query = q_data.iloc[[idx]][0]
# Category the classifier predicts for the sampled query.
real_cat = class_model.predict(rand_query)[0]

# Candidate list: the first 10 rows of the table plus up to 10 rows whose
# label equals the predicted category, in random order.
arr = np.arange(10)
# Index labels are used as positions later on; this relies on q_data having
# the default 0..n-1 index.
ind_true_cat = np.array(q_data[q_data.category_id == real_cat].head(10).index)
indeces = np.append(arr, ind_true_cat)
np.random.shuffle(indeces)



In [None]:
ev_heurist(rand_query, q_data.iloc[indeces]) # half of the queries are non-matching

0.5095238095238095

In [None]:
arr_2 = np.arange(5)                      # a third of the queries are non-matching
# Same construction as before, with only the first 5 rows as filler.
indeces_2 = np.append(arr_2, ind_true_cat)
np.random.shuffle(indeces_2)
ev_heurist(rand_query, q_data.iloc[indeces_2])

0.7083333333333334

#### smart search

In [None]:
try:
    # joblib is a standalone package; the sklearn.externals.joblib alias was
    # removed in scikit-learn 0.23.
    import joblib
except ImportError:
    # Fallback for old environments that only ship the bundled copy.
    from sklearn.externals import joblib

# Please replace with the path to the original table that holds the category ids.
path_to_category = "/content/drive/MyDrive/sharing/datasets/avito/raw_data/train.csv"
# Location of the pickled classification pipeline.
path_to_class_model =  "/content/drive/MyDrive/sharing/models/pipeline_class.pkl"

In [None]:
# Load the original table and keep a copy of its category column; get_class()
# looks results up in `classes` by position.
pd_categorized_data = pd.read_csv(path_to_category)
classes = pd_categorized_data.category_id.copy()

In [None]:
# Sorry for the boilerplate: the model cannot be loaded without the tokenize
# function, which reads this stop-word list.
stop_words = stopwords.words("russian")

def tokenize(text):
    """Split ``text`` on whitespace and drop stop words and punctuation tokens.

    Per the author's note, the pickled pipeline needs a function with this
    name to be defined before it is loaded.

    ``text`` is either a string or a NumPy array of strings. Array elements
    are concatenated in order with no separator before splitting. A token is
    dropped when it is in the module-level ``stop_words`` or when its stripped
    form is a substring of ``string.punctuation``.

    Returns the list of surviving tokens in their original order.
    """
    if isinstance(text, np.ndarray):
        text = ''.join(text)

    def _is_noise(word):
        return word in stop_words or word == " " or word.strip() in punctuation

    return [word for word in text.split() if not _is_noise(word)]


In [None]:
# Load the pickled classification pipeline.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you trust.
with open(path_to_class_model,'rb') as g:
  class_model = joblib.load(g) # fails without tokenize


In [None]:
# Classifier accuracy used by ev_heurist to correct the raw score; the value
# matches the validation accuracy printed in the FIT section.
acc = 0.86512297 # hard-coded for now; in the general case this is a class method

In [None]:
def get_class(resp_ind):
  """Return the entries of the module-level ``classes`` at positions ``resp_ind``."""
  selected = classes.iloc[resp_ind]
  return selected

In [None]:
# Takes the query as text and the positional indices of the results.
def ev_heurist(query, responses, accuracy=None):
  """Score how well a ranked list of result indices matches the query's category.

  The result at rank ``i`` (0-based) carries weight ``n - i`` for ``n``
  results. The score is the summed weight of the results whose category equals
  the category predicted for ``query``, divided by the maximum possible sum
  ``n * (n + 1) / 2``, then divided by the classifier accuracy and capped at 1.

  Parameters
  ----------
  query : input accepted by ``class_model.predict`` (a single query).
  responses : array of result positions, in ranking order; passed to
      ``get_class`` to obtain the categories.
  accuracy : classifier accuracy used for the correction; when omitted the
      module-level ``acc`` is used.

  Returns
  -------
  A number in [0, 1]; 0.0 when ``responses`` is empty.
  """
  res_size = responses.shape[0]  # number of documents
  if res_size == 0:
    # No results: avoid 0/0, which would turn into the maximum score via min(1, nan).
    return 0.0

  if accuracy is None:
    accuracy = acc

  grades = np.arange(1, res_size + 1)[::-1]
  max_grade = res_size * (res_size + 1) / 2

  q_class_pred = class_model.predict(query)
  resp_class = get_class(responses)

  # Positional boolean mask: True where the result's category equals the prediction.
  is_eq = np.asarray(resp_class == q_class_pred * np.ones(res_size))

  score = grades[is_eq].sum() / max_grade

  return min(1, score * (1. / accuracy))


In [None]:
# Takes the query as text and the positional indices of the results.
# NOTE(review): rand_query and indeces come from the heuristic section above,
# so this cell only runs after those cells have been executed.
ev_heurist(rand_query, indeces)

0.6109783775264094