In [1]:
import re
from langdetect import detect
import cPickle
import numpy as np

In [3]:
def load_obj(name):
    """Unpickle and return the object stored in ``obj/<name>.pkl``.

    The path is relative to the current working directory.

    Note: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    path = 'obj/' + name + '.pkl'
    with open(path, 'rb') as handle:
        loaded = cPickle.load(handle)
    return loaded

def save_obj(obj, name):
    """Pickle ``obj`` into ``obj/<name>.pkl`` using the default protocol.

    The path is relative to the current working directory; the ``obj/``
    directory has to exist already because ``open`` does not create it.
    """
    path = 'obj/' + name + '.pkl'
    with open(path, 'wb') as handle:
        cPickle.dump(obj, handle)

In [4]:
from langdetect import detect

class FeatureGenerator:
    """Builds a numeric feature vector describing a single search query.

    ``generate_features`` returns a list with eight entries, in this order:

    0. number of words in the query
    1. number of characters in the query
    2. ``lm.get_prob(words)`` for the whole word list
    3. largest ``lm.get_word_prob(word)`` seen (starts at ``-1.0``, so that
       value is returned when the query has no words)
    4. smallest ``lm.get_word_prob(word)`` seen (starts at ``2.0``, so that
       value is returned when the query has no words)
    5. number of words that are not keys of ``lm.dict``
    6. ``1`` if the query contains any of ``, . ' ; ] [ ~``, else ``0``
    7. ``1`` if ``langdetect.detect`` reports ``'en'``, else ``0``

    The language model ``lm`` must provide ``get_prob(words)``,
    ``get_word_prob(word)`` and a ``dict`` attribute supporting ``in``.
    """

    # Same pattern text as the original ur"(?u)\w+" literal, spelled so that it
    # parses on both Python 2 and Python 3; compiled once instead of per call.
    _WORD_RE = re.compile(u"(?u)\\w+")

    # Characters whose presence flags the query as containing "bad" symbols.
    _BAD_CHARS = u",.';][~"

    def __init__(self, lm):
        """Remember the language model used to score queries and words."""
        self.lm = lm

    def generate_features(self, query):
        """Return the eight-element feature list for ``query``.

        ``query`` is expected to be a text (unicode) string; the callers in
        this notebook decode and lower-case it first.
        """
        words = self._WORD_RE.findall(query)

        x = [
            len(words),               # number of words
            len(query),               # number of characters
            self.lm.get_prob(words),  # probability of the whole query
        ]

        # Sentinels outside the range the probabilities are compared against;
        # they are kept unchanged when the query has no words.
        max_prob = -1.
        min_prob = 2.
        count_of_words_in_dict = 0
        for word in words:
            prob = self.lm.get_word_prob(word)
            if prob > max_prob:
                max_prob = prob
            if prob < min_prob:
                min_prob = prob
            # `in` replaces the deprecated dict.has_key().
            if word in self.lm.dict:
                count_of_words_in_dict += 1

        x.append(max_prob)  # maximum word probability
        x.append(min_prob)  # minimum word probability
        x.append(len(words) - count_of_words_in_dict)  # words missing from the dictionary

        # 1 if the query contains any "bad" character, otherwise 0.
        x.append(1 if any(ch in query for ch in self._BAD_CHARS) else 0)

        # Language detection is best effort: any failure (langdetect raises
        # on text it cannot analyse) is treated as "not English".
        # NOTE(review): langdetect is documented as non-deterministic unless
        # DetectorFactory.seed is set - consider seeding it for reproducibility.
        try:
            lang = 1 if detect(query) == 'en' else 0
        except Exception:
            lang = 0
        x.append(lang)  # language flag

        return x

In [5]:
# Load the previously pickled language model from obj/LanguageModel.pkl.
lm = load_obj("LanguageModel")

In [6]:
# Only the first line of the file is used; it holds the queries separated
# by the "<br><br>" marker.
with open("data/right_queries.txt") as f:
    content = f.readlines()

queries = content[0].split("<br><br>")

# The first 1000 entries are the queries labelled as correct.
correct_q = queries[:1000]

# NOTE(review): two entries are skipped between consecutive groups
# (presumably separators/headers in the data file) - confirm against the data.
remaining_q = queries[1002:]
need_fix = remaining_q[:1000]
need_split = remaining_q[1002:2002]
need_join = remaining_q[2004:3004]

# Concatenate into a new list. The original aliased bad_q to need_fix and
# extended it in place, which silently grew need_fix to hold all three groups.
bad_q = need_fix + need_split + need_join

In [35]:
from random import Random, random

# Sampling configuration.
SAMPLE_RATE = 0.2      # probability of keeping any given line
MAX_SAMPLES = 20000    # stop after this many sampled lines
SAMPLING_SEED = 42     # fixed seed so the sample is reproducible

# Private generator: reproducible without touching the global random state.
sampler = Random(SAMPLING_SEED)

# NOTE(review): the following cell resets X and y to empty lists, so as the
# notebook stands only `fg` from this cell is used afterwards - confirm intent.
X = []
y = []
fg = FeatureGenerator(lm)
i = 0
with open("data/queries_all.txt") as f:
    content = f.readlines()

for line in content:
    if sampler.random() >= SAMPLE_RATE:
        continue
    i += 1

    if i > MAX_SAMPLES:
        break

    line = line.decode("utf-8")
    line = line.lower()
    # Strip only the line terminator. The original line[:-1] removed a real
    # character when the last line had no trailing newline and left a stray
    # "\r" on CRLF files.
    line = line.rstrip(u"\r\n")
    # `parts` rather than `queries`, so the list built from
    # right_queries.txt is not overwritten by this loop.
    parts = line.split('\t')

    if len(parts) == 2:
        # Two tab-separated fields: the first is labelled 0 and the second 1
        # (presumably "original<TAB>corrected" - verify against the data).
        y.append(0)
        X.append(fg.generate_features(parts[0]))

        y.append(1)
        X.append(fg.generate_features(parts[1]))

    else:
        # Any other field count: the first field is labelled 1.
        y.append(1)
        X.append(fg.generate_features(parts[0]))

In [55]:
# Rebuild the data set from scratch out of the two labelled query groups.
X = []
y = []

# Label 1 marks the queries in correct_q, label 0 the queries in bad_q.
for label, group in ((1, correct_q), (0, bad_q)):
    for q in group:
        q = q.decode("utf-8").lower()
        y.append(label)
        X.append(fg.generate_features(q))

In [56]:
# Sanity check: the label list and the feature list should have the same length.
len(y), len(X)

(4000, 4000)

In [57]:
# Convert both lists to NumPy arrays so they can be indexed with the
# index arrays produced by the cross-validation folds.
X, y = np.asarray(X), np.asarray(y)

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import f1_score, accuracy_score

In [59]:
# 4-fold cross-validation of the gradient-boosting classifier.
# A fixed seed is passed to both the fold shuffling and the classifier so
# that the reported scores are reproducible from run to run.
CV_SEED = 42

f1 = []
acc = []
kf = KFold(len(y), n_folds=4, shuffle=True, random_state=CV_SEED)
for train_index, test_index in kf:
    y_train, y_test = y[train_index], y[test_index]
    X_train, X_test = X[train_index], X[test_index]

    gb = GradientBoostingClassifier(n_estimators=200, max_depth=None,
                                    loss='exponential', random_state=CV_SEED)
    gb.fit(X_train, y_train)

    y_predicted = gb.predict(X_test)
    # F1 is computed for label 0 as the positive class.
    f1.append(f1_score(y_test, y_predicted, pos_label=0))
    acc.append(accuracy_score(y_test, y_predicted))

# Mean scores over the folds; print(...) with a single argument behaves the
# same on Python 2 and Python 3.
print(sum(f1) / len(f1))
print(sum(acc) / len(acc))

0.890758156029
0.83475


In [62]:
# Final model, fitted on the complete data set.
# NOTE(review): this uses n_estimators=20 while the cross-validated model
# used n_estimators=200 - confirm the smaller model is intended.
gb = GradientBoostingClassifier(loss='exponential', max_depth=None, n_estimators=20).fit(X, y)
y_pred = gb.predict(X)
# Scored on the same data the model was fitted on, so this is an in-sample figure.
f1_score(y_true=y, y_pred=y_pred, pos_label=0)

0.99634551495016621

In [63]:
# Persist the fitted classifier to obj/classifier_input.pkl.
save_obj(gb, "classifier_input")