In [4]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import hstack, vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from pymorphy2 import MorphAnalyzer
# from nltk.tokenize import word_tokenize
from string import punctuation
from sklearn.preprocessing import StandardScaler
import re, pickle
# from sklearn.svm import SVR
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
punct = set(punctuation)
import scipy
morph = MorphAnalyzer()
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [5]:
def normalize(text):
    tokens = [x for x in tokenizer(text.lower()) if not set(x).issubset(punct)]
    norm = [morph.parse(token)[0].normal_form for token in tokens]
    return ' '.join(norm)

def normalize_pos(text):
    tokens = [x for x in tokenizer(text.lower()) if not set(x).issubset(punct)]
    norm = filter(bool, [morph.parse(token)[0].tag.POS for token in tokens])
    return ' '.join(norm)

def sum_of_idfs(text, idfs):
    tokens = [x for x in tokenizer(text.lower())]
    return np.log10(sum([idfs.get(token, 8) for token in tokens])+1)

def week(text):
    y,m,d = [int(x) for x in text.split('-')]
    date = datetime(y,m,d)
    return date.weekday()

In [6]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [7]:
params = train[['param_1', 'param_2', 'param_3']].fillna('nan').apply(lambda x: 
                                                        ' '.join([p for p in x]), axis=1)

descriptions = train.description.fillna('nan')
title = train.title

In [None]:
per_upper_descr = descriptions.apply(lambda x: len([j for j in x if j.isupper()])/len(x))
per_digit_descr = descriptions.apply(lambda x: len([j for j in x if j.isdigit()])/len(x))
per_excl_descr = descriptions.apply(lambda x: 
                                                len([j for j in x if j == '!'])/len(x))
per_punct_descr = descriptions.apply(lambda x: 
                                                len([j for j in x if not j.isalnum() and j != ' '])/len(x))

per_upper_title = title.apply(lambda x: len([j for j in x if j.isupper()])/len(x))
per_digit_title = title.apply(lambda x: len([j for j in x if j.isdigit()])/len(x))
per_punct_title = title.apply(lambda x: 
                                                len([j for j in x if not j.isalnum() and j != ' '])/len(x))
per_excl_title = title.apply(lambda x: 
                                                len([j for j in x if j == '!'])/len(x))




# mean_t = np.mean(train.title.apply(len))
# mean_d = np.mean(train.description.fillna('').apply(len))
# pd.concat([train.image_top_1.fillna(0), test.image_top_1.fillna(0)])

In [6]:
per_upper_descr_test = test.description.fillna(' ').apply(lambda x: len([j for j in x if j.isupper()])/len(x))
per_digit_descr_test = test.description.fillna(' ').apply(lambda x: len([j for j in x if j.isdigit()])/len(x))
per_excl_descr_test = test.description.fillna(' ').apply(lambda x: 
                                                len([j for j in x if j == '!'])/len(x))
per_punct_descr_test = test.description.fillna(' ').apply(lambda x: 
                                                len([j for j in x if not j.isalnum() and j != ' '])/len(x))

per_upper_title_test = test.title.fillna(' ').apply(lambda x: len([j for j in x if j.isupper()])/len(x))
per_digit_title_test = test.title.fillna(' ').apply(lambda x: len([j for j in x if j.isdigit()])/len(x))
per_punct_title_test = test.title.fillna(' ').apply(lambda x: 
                                                len([j for j in x if not j.isalnum() and j != ' '])/len(x))
per_excl_title_test = test.title.fillna(' ').apply(lambda x: 
                                                len([j for j in x if j == '!'])/len(x))

# mean_t = np.mean(train.title.apply(len))
# mean_d = np.mean(train.description.fillna('').apply(len))
# pd.concat([train.image_top_1.fillna(0), test.image_top_1.fillna(0)])

In [16]:
length_descr_train = descriptions.apply(lambda x: np.log10(len(x)+1)).values.reshape(-1,1)
length_title_train = title.apply(lambda x: np.log10(len(x)+1)).values.reshape(-1,1)

In [17]:
length_descr_test = test.description.fillna(' ').apply(lambda x: np.log10(len(x)+1)).values.reshape(-1,1)
length_title_test = test.title.fillna(' ').apply(lambda x: np.log10(len(x)+1)).values.reshape(-1,1)

In [18]:
length_descr_tokens_train = descriptions.apply(lambda x: np.log10(len(tokenizer(x))+1))
length_title_tokens_train = title.apply(lambda x: np.log10(len(tokenizer(x))+1))

In [90]:
length_descr_tokens_test = descriptions_test.apply(lambda x: np.log10(len(tokenizer(x))+1))
length_title_tokens_test = title_test.apply(lambda x: np.log10(len(tokenizer(x))+1))

In [20]:
length_descr_uniq_tokens_train = descriptions.apply(lambda x: np.log10(len(set(tokenizer(x)))+1))
length_title_uniq_tokens_train = title.apply(lambda x: np.log10(len(set(tokenizer(x)))+1))

In [91]:
length_descr_uniq_tokens_test = descriptions_test.apply(lambda x: np.log10(len(set(tokenizer(x)))+1))
length_title_uniq_tokens_test = title_test.apply(lambda x: np.log10(len(set(tokenizer(x)))+1))

In [None]:
# prices_mentioned = train.description.fillna(' ').apply(find_prices).values.reshape(-1,1)
# prices = train.price.fillna().apply(lambda x: np.log10(x+1)).values.reshape(-1,1)

In [None]:
# prices_mentioned_test = test.description.fillna(' ').apply(find_prices).values.reshape(-1,1)
# prices_test = test.price.fillna(1300).apply(lambda x: np.log10(x+1)).values.reshape(-1,1)

In [22]:
prices = train.price.apply(np.log10).fillna(0)
prices_test = test.price.apply(np.log10).fillna(0)

In [94]:
# length_param_train = params.apply(len).apply(np.log10).fillna(0)
length_param_test = params_test.apply(len).apply(np.log10).fillna(0)

# length_param_test = test[['param_1', 'param_2', 'param_3']].fillna('').apply(lambda x: 
#                                                         ' '.join([p for p in x]), axis=1).apply(len).apply(np.log10)

In [None]:
X_city_stat = np.load('matrix/X_city_stat.npy', )
X_city_stat_price = np.load('matrix/X_city_stat_price.npy', )
X_category_name_stat = np.load('matrix/X_category_name_stat.npy', )
X_category_name_stat_price = np.load('matrix/X_category_name_stat_price.npy', )
X_param_1_stat = np.load('matrix/X_param_1_stat.npy', )
X_param_1_stat_price = np.load('matrix/X_param_1_stat_price.npy', )
X_param_2_stat = np.load('matrix/X_param_2_stat.npy', )
X_param_2_stat_price = np.load('matrix/X_param_2_stat_price.npy', )
X_param_3_stat = np.load('matrix/X_param_3_stat.npy', )
X_param_3_stat_price = np.load('matrix/X_param_3_stat_price.npy', )

In [56]:
X_city_stat_test = np.load('matrix/X_city_stat_test.npy', )
X_city_stat_price_test = np.load('matrix/X_city_stat_price_test.npy', )
X_category_name_stat_test = np.load('matrix/X_category_name_stat_test.npy', )
X_category_name_stat_price_test = np.load('matrix/X_category_name_stat_price_test.npy', )
X_param_1_stat_test = np.load('matrix/X_param_1_stat_test.npy', )
X_param_1_stat_price_test = np.load('matrix/X_param_1_stat_price_test.npy', )
X_param_2_stat_test = np.load('matrix/X_param_2_stat_test.npy', )
X_param_2_stat_price_test = np.load('matrix/X_param_2_stat_price_test.npy', )
X_param_3_stat_test = np.load('matrix/X_param_3_stat_test.npy', )
X_param_3_stat_price_test = np.load('matrix/X_param_3_stat_price_test.npy', )

In [8]:
# ТОКЕНИЗАЦИЯ
stops = stopwords.words('russian')
rx = re.compile('\d+\,\d+|[\w]+(?:[\w\-/]+[\w])?')
rexc = re.compile('!!+')
rtir = re.compile('--+')
tokenizer = lambda x: [correct.get(m.lower().replace('ё','е'), 
                                  m.lower().replace('ё','е')) 
                       for string in re.sub('\n', ' ', x).split() 
            for m in rx.findall(re.sub('[_\xa0\t]', ' ', string))]

In [9]:
correct = {}
for line in open('corrections.txt'):
    line = line.rstrip('\n')
    mistake, cor = line.split('\t')
    correct[mistake] = cor

In [10]:
# ПАРАМЕТРЫ, ОПИСАНИЕ, ЗАГОЛОВОК

cv_params = CountVectorizer(max_features=10000, ngram_range=(1,2))

tf_descr = TfidfVectorizer(max_features=70000, ngram_range=(1,2),
                           tokenizer=tokenizer, sublinear_tf=True, min_df=5)
tf_title = TfidfVectorizer(max_features=20000, ngram_range=(1,2),
                                 tokenizer=tokenizer, sublinear_tf=True, min_df=5, )
# tf_descr.fit(descriptions)



In [116]:
del X_test

In [114]:
X_params_train = cv_params.fit_transform(params)
X_descr_train = tf_descr.fit_transform(descriptions)
X_title_train = tf_title.fit_transform(title)

In [11]:
tf_title_char = TfidfVectorizer(max_features=2000, ngram_range=(1,3), 
                                analyzer='char', lowercase=True, min_df=500, 
                                sublinear_tf=False)
X_title_char_train = tf_title_char.fit_transform(title)


In [92]:
X_title_char_test = tf_title_char.transform(title_test)

In [119]:
idfs_descr = {k:v for k,v in zip(tf_descr.get_feature_names(), tf_descr.idf_)}
idfs_title = {k:v for k,v in zip(tf_title.get_feature_names(), tf_title.idf_)}
# idfs_param = {k:v for k,v in zip(cv_params.get_feature_names(), cv_params.idf_)}

idfs_sum_title = descriptions.apply(lambda x: sum_of_idfs(x, idfs_title))
idfs_sum_descr = title.fillna(' ').apply(lambda x: sum_of_idfs(x, idfs_descr))
# idfs_sum_param = params.apply(lambda x: sum_of_idfs(x, idfs_param))

In [96]:

idfs_sum_title_test = descriptions_test.apply(lambda x: sum_of_idfs(x, idfs_title))
idfs_sum_descr_test = title_test.fillna(' ').apply(lambda x: sum_of_idfs(x, idfs_descr))
# idfs_sum_param = params.apply(lambda x: sum_of_idfs(x, idfs_param))

In [19]:
pos_params = params.apply(normalize_pos)
pos_descriptions = descriptions.apply(normalize_pos)
pos_title = title.apply(normalize_pos)

In [20]:
tf_params_pos = TfidfVectorizer(min_df=100, ngram_range=(1,2))
tf_title_pos = TfidfVectorizer(min_df=100, ngram_range=(1,2))
tf_descr_pos = TfidfVectorizer(min_df=100, ngram_range=(1,2))

X_pos_params_train = tf_params_pos.fit_transform(pos_params)
X_pos_descriptions_train = tf_descr_pos.fit_transform(pos_descriptions)
X_pos_title_train = tf_title_pos.fit_transform(pos_title)

In [21]:
norm_params = params.apply(normalize)
norm_descriptions = descriptions.apply(normalize)
norm_title = title.apply(normalize)

In [22]:
# tf_params_norm = TfidfVectorizer(min_df=100, ngram_range=(1,2))
tf_title_norm = TfidfVectorizer(min_df=100, max_features=5000)
tf_descr_norm = TfidfVectorizer(min_df=100, max_features=5000)

# X_norm_params_train = tf_params_norm.fit_transform(pos_params)
X_norm_descriptions_train = tf_descr_norm.fit_transform(norm_descriptions)
X_norm_title_train = tf_title_norm.fit_transform(norm_title)

In [26]:
# ITEM SEQ NUMBER as NUMBER

X_item_seq_number_n_train = train.item_seq_number.apply(np.log10).values.reshape(-1, 1)

In [27]:
# ITEM SEQ NUMBER as NUMBER

X_item_seq_number_n_test = test.item_seq_number.apply(np.log10).values.reshape(-1, 1)

In [120]:
stats = np.concatenate([length_descr_train,
                        length_title_train,
                        length_param_train.values.reshape(-1,1),
                        prices.values.reshape(-1,1),
                        length_descr_uniq_tokens_train.values.reshape(-1,1),
                        length_title_uniq_tokens_train.values.reshape(-1,1),
                        length_param_train.values.reshape(-1,1),
                        per_digit_descr.values.reshape(-1,1),
                        per_digit_title.values.reshape(-1,1),
                        per_excl_descr.values.reshape(-1,1),
                        per_excl_title.values.reshape(-1,1),
                        per_punct_descr.values.reshape(-1,1),
                        per_punct_title.values.reshape(-1,1),
                        per_upper_descr.values.reshape(-1,1),
                        per_upper_title.values.reshape(-1,1),
                        X_item_seq_number_n_train,
                        length_descr_tokens_train.values.reshape(-1,1),
                        length_title_tokens_train.values.reshape(-1,1),
                        idfs_sum_title.values.reshape(-1,1),
                        idfs_sum_descr.values.reshape(-1,1),
#                         X_city_stat.reshape(-1, 1),
#                         X_city_stat_price.reshape(-1, 1),
#                         X_category_name_stat.reshape(-1, 1),
#                         X_category_name_stat_price.reshape(-1, 1),
#                         X_param_1_stat.reshape(-1, 1),
#                         X_param_1_stat_price.reshape(-1, 1),
#                         X_param_2_stat.reshape(-1, 1),
#                         X_param_2_stat_price.reshape(-1, 1),
#                         X_param_3_stat.reshape(-1, 1),
#                         X_param_3_stat_price.reshape(-1, 1)
                        ], axis=1)



In [97]:
stats_test = np.concatenate([length_descr_test,
                        length_title_test,
                        length_param_test.values.reshape(-1,1),
                        prices_test.values.reshape(-1,1),
                        length_descr_uniq_tokens_test.values.reshape(-1,1),
                        length_title_uniq_tokens_test.values.reshape(-1,1),
                        length_param_test.values.reshape(-1,1),
                        per_digit_descr_test.values.reshape(-1,1),
                        per_digit_title_test.values.reshape(-1,1),
                        per_excl_descr_test.values.reshape(-1,1),
                        per_excl_title_test.values.reshape(-1,1),
                        per_punct_descr_test.values.reshape(-1,1),
                        per_punct_title_test.values.reshape(-1,1),
                        per_upper_descr_test.values.reshape(-1,1),
                        per_upper_title_test.values.reshape(-1,1),
                        X_item_seq_number_n_test,
                        length_descr_tokens_test.values.reshape(-1,1),
                        length_title_tokens_test.values.reshape(-1,1),
                        idfs_sum_title_test.values.reshape(-1,1),
                        idfs_sum_descr_test.values.reshape(-1,1),
#                         X_city_stat_test.reshape(-1, 1),
#                         X_city_stat_price_test.reshape(-1, 1),
#                         X_category_name_stat_test.reshape(-1, 1),
#                         X_category_name_stat_price_test.reshape(-1, 1),
#                         X_param_1_stat_test.reshape(-1, 1),
#                         X_param_1_stat_price_test.reshape(-1, 1),
#                         X_param_2_stat_test.reshape(-1, 1),
#                         X_param_2_stat_price_test.reshape(-1, 1),
#                         X_param_3_stat_test.reshape(-1, 1),
#                         X_param_3_stat_price_test.reshape(-1, 1)
                        ], axis=1)



In [47]:
from sklearn.preprocessing import PolynomialFeatures
pl = PolynomialFeatures(3, interaction_only=True)

In [121]:
stats[np.isinf(stats)] = -99

In [107]:
stats_test[np.isinf(stats_test)] = 0

In [None]:
stats_inter = pl.fit_transform(stats)

In [124]:
stats_inter.shape

(1503424, 496)

In [88]:
# ОПИСАНИЕ, ПАРАМЕТРЫ, ЗАГОЛОВОК ДЛЯ ТЕСТА

params_test = test[['param_1', 'param_2', 'param_3']].fillna('nan').apply(lambda x: 
                                                        ' '.join([p for p in x]), axis=1)

descriptions_test = test.description.fillna('nan')
title_test = test.title

In [89]:
X_params_test = cv_params.transform(params_test)

X_descr_test = tf_descr.transform(descriptions_test)

X_title_test = tf_title.transform(title_test)

In [29]:
# PARAMS CAT

lenc_param_1 = LabelEncoder()
int_enc = lenc_param_1.fit_transform(pd.concat([train.param_1.fillna(''), 
                                        test.param_1.fillna('') ]))
onehot_param_1 = OneHotEncoder(sparse=True)
int_param_1 = int_enc.reshape(len(int_enc), 1)

onehot_param_1.fit(int_param_1)

int_enc_train = lenc_param_1.transform(train.param_1.fillna(''))
X_param_1_train = onehot_param_1.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_param_1.transform(test.param_1.fillna(''))
X_param_1_test = onehot_param_1.transform(int_enc_test.reshape(len(int_enc_test), 1))

# PARAMS CAT

lenc_param_2 = LabelEncoder()
int_enc = lenc_param_2.fit_transform(pd.concat([train.param_2.fillna(''), 
                                        test.param_2.fillna('') ]))
onehot_param_2 = OneHotEncoder(sparse=True)
int_param_2 = int_enc.reshape(len(int_enc), 1)

onehot_param_2.fit(int_param_2)

int_enc_train = lenc_param_2.transform(train.param_2.fillna(''))
X_param_2_train = onehot_param_2.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_param_2.transform(test.param_2.fillna(''))
X_param_2_test = onehot_param_2.transform(int_enc_test.reshape(len(int_enc_test), 1))

# PARAMS CAT

lenc_param_3 = LabelEncoder()
int_enc = lenc_param_3.fit_transform(pd.concat([train.param_3.fillna(''), 
                                        test.param_3.fillna('') ]))
onehot_param_3 = OneHotEncoder(sparse=True)
int_param_3 = int_enc.reshape(len(int_enc), 1)

onehot_param_3.fit(int_param_3)

int_enc_train = lenc_param_3.transform(train.param_3.fillna(''))
X_param_3_train = onehot_param_3.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_param_3.transform(test.param_3.fillna(''))
X_param_3_test = onehot_param_3.transform(int_enc_test.reshape(len(int_enc_test), 1))



In [30]:
# # IMAGE TOP 1

lenc = LabelEncoder()
int_enc = lenc.fit_transform(pd.concat([train.image_top_1.fillna(0), 
                                        test.image_top_1.fillna(0)]))
onehot = OneHotEncoder(sparse=True)
int_top1 = int_enc.reshape(len(int_enc), 1)

onehot.fit(int_top1)

int_enc_train = lenc.transform(train.image_top_1.fillna(0))
X_top1_train = onehot.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc.transform(test.image_top_1.fillna(0))
X_top1_test = onehot.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [31]:
lenc_user_id.classes_

NameError: name 'lenc_user_id' is not defined

In [38]:
ids_set = set(train.user_id) & set(test.user_id)

In [40]:
len(ids_set)

67929

In [130]:
freqs = Counter(test.user_id) + Counter(train.user_id) 

In [136]:
vocab = set(lenc_user_id.classes_)

In [139]:
# def shrink(userid):
#     if userid in vocab:
        
user_id = train.user_id.apply(lambda x: str(freqs.get(x, 'shit')) if x not in ids_set else x)
user_id_test = test.user_id.apply(lambda x: str(freqs.get(x, 'shit')) if x not in ids_set else x)

In [146]:
conc = pd.concat([user_id, user_id_test], axis=0)

In [149]:
# USER ID

lenc_user_id = LabelEncoder()
int_enc = lenc_user_id.fit_transform(conc)
onehot_user_id = OneHotEncoder(sparse=True)
int_user_id = int_enc.reshape(len(int_enc), 1)

onehot_user_id.fit(int_user_id)

int_enc_train = lenc_user_id.transform(user_id)
X_user_id_train = onehot_user_id.transform(int_enc_train.reshape(len(int_enc_train), 1))


int_enc_test = lenc_user_id.transform(user_id_test)
X_user_id_test = onehot_user_id.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [None]:
X_user_id_train

In [98]:
# ITEM SEQ NUMBER

# lenc_item_seq_number = LabelEncoder()
# int_enc = lenc_item_seq_number.fit_transform(pd.concat([train.item_seq_number.fillna(0), 
#                                                test.item_seq_number.fillna(0)]))
# onehot_item_seq_number = OneHotEncoder(sparse=True)
# int_item_seq_number = int_enc.reshape(len(int_enc), 1)

# onehot_item_seq_number.fit(int_item_seq_number)

# int_enc_train = lenc_item_seq_number.transform(train.item_seq_number.fillna(0))
# X_item_seq_number_train = onehot_item_seq_number.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_item_seq_number.transform(test.item_seq_number.fillna(0))
X_item_seq_number_test = onehot_item_seq_number.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [43]:
# # USER TYPE
lenc_user_type = LabelEncoder()
int_enc = lenc_user_type.fit_transform(pd.concat([train.user_type.fillna(0), 
                                               test.user_type.fillna(0)]))
onehot_user_type = OneHotEncoder(sparse=True)
int_user_type = int_enc.reshape(len(int_enc), 1)

onehot_user_type.fit(int_user_type)

int_enc_train = lenc_user_type.transform(train.user_type.fillna(0))
X_user_type_train = onehot_user_type.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_user_type.transform(test.user_type.fillna(0))
X_user_type_test = onehot_user_type.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [44]:
# CATEGORY NAME

lenc_category_name = LabelEncoder()
int_enc = lenc_category_name.fit_transform(pd.concat([train.category_name.fillna(0), 
                                               test.category_name.fillna(0)]))
onehot_category_name = OneHotEncoder(sparse=True)
int_category_name = int_enc.reshape(len(int_enc), 1)

onehot_category_name.fit(int_category_name)

int_enc_train = lenc_category_name.transform(train.category_name.fillna(0))
X_category_name_train = onehot_category_name.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_category_name.transform(test.category_name.fillna(0))
X_category_name_test = onehot_category_name.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [45]:
# PARENT CATEGORY NAME

lenc_parent_category_name = LabelEncoder()
int_enc = lenc_parent_category_name.fit_transform(pd.concat([train.parent_category_name.fillna(0), 
                                               test.parent_category_name.fillna(0)]))
onehot_parent_category_name = OneHotEncoder(sparse=True)
int_parent_category_name = int_enc.reshape(len(int_enc), 1)

onehot_parent_category_name.fit(int_parent_category_name)

int_enc_train = lenc_parent_category_name.transform(train.parent_category_name.fillna(0))
X_parent_category_name_train = onehot_parent_category_name.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_parent_category_name.transform(test.parent_category_name.fillna(0))
X_parent_category_name_test = onehot_parent_category_name.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [46]:
# CITY

lenc_city = LabelEncoder()
int_enc = lenc_city.fit_transform(pd.concat([train.city, 
                                               test.city]))
onehot_city = OneHotEncoder(sparse=True)
int_city = int_enc.reshape(len(int_enc), 1)

onehot_city.fit(int_city)

int_enc_train = lenc_city.transform(train.city)
X_city_train = onehot_city.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_city.transform(test.city)
X_city_test = onehot_city.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [47]:
# REGION

lenc_region = LabelEncoder()
int_enc = lenc_region.fit_transform(pd.concat([train.region, 
                                               test.region]))
onehot_region = OneHotEncoder(sparse=True)
int_region = int_enc.reshape(len(int_enc), 1)

onehot_region.fit(int_region)

int_enc_train = lenc_region.transform(train.region)
X_region_train = onehot_region.transform(int_enc_train.reshape(len(int_enc_train), 1))

int_enc_test = lenc_region.transform(test.region)
X_region_test = onehot_region.transform(int_enc_test.reshape(len(int_enc_test), 1))

In [48]:
# Fasttext cluster

ft_cluster_train = pickle.load(open('ft_labels.pkl', 'rb'))
ft_cluster_test = pickle.load(open('ft_labels_test.pkl', 'rb'))

onehot_ft_cluster = OneHotEncoder(sparse=True)
int_ft_cluster_train = ft_cluster_train.reshape(len(ft_cluster_train), 1)
int_ft_cluster_test = ft_cluster_test.reshape(len(ft_cluster_test), 1)
# int_week_test = week_test.reshape(len(week_test), 1)

X_ft_cluster_train = onehot_ft_cluster.fit_transform(int_ft_cluster_train)
X_ft_cluster_test = onehot_ft_cluster.transform(int_ft_cluster_test)
# X_week_test = onehot_week.transform(int_week_test)



In [49]:
# WEEK

week_train = train.activation_date.apply(week)
week_test = test.activation_date.apply(week)

onehot_week = OneHotEncoder(sparse=True)
int_week_train = week_train.reshape(len(week_train), 1)
int_week_test = week_test.reshape(len(week_test), 1)

X_week_train = onehot_week.fit_transform(int_week_train)
X_week_test = onehot_week.transform(int_week_test)



In [50]:

has_price_train = train.price.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)
has_img_train = train.image.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)

has_price_test = test.price.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)
has_img_test = test.image.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)

In [51]:
has_descr_train = train.description.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)
has_param_1_train = train.param_1.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)

has_param_2_train = train.param_2.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)
has_param_3_train = train.param_3.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)

In [52]:
has_descr_test = test.description.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)
has_param_1_test = test.param_1.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)

has_param_2_test = test.param_2.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)
has_param_3_test = test.param_3.isna().apply(lambda x: 0 if x else 1).values.reshape(-1,1)

In [151]:
del X

In [153]:
# ОБУЧАЮЩАЯ ВЫБОРКА

X = hstack([X_params_train, X_descr_train, X_title_train,
#             X_norm_descriptions_train, X_norm_title_train,
#             X_pos_descriptions_train, X_pos_title_train, 
#             X_pos_params_train,
            X_user_type_train, X_user_id_train, X_top1_train,
            X_category_name_train, X_parent_category_name_train,
            X_city_train, X_region_train,
            X_week_train, has_price_train, has_img_train, has_descr_train,
            has_param_1_train,
            has_param_2_train,
            has_param_3_train,
            X_ft_cluster_train,
            X_param_1_train,
            X_param_2_train,
            X_param_3_train,
            X_title_char_train,
            X_item_seq_number_train,
#             X_city_stat.reshape(-1,1),
#             X_city_stat_price.reshape(-1,1),
#             X_category_name_stat.reshape(-1,1),
#             X_category_name_stat_price.reshape(-1,1),
#             X_param_1_stat.reshape(-1,1),
#             X_param_1_stat_price.reshape(-1,1),
#             X_param_2_stat.reshape(-1,1),
#             X_param_2_stat_price.reshape(-1,1),
#             X_param_3_stat.reshape(-1,1),
#             X_param_3_stat_price.reshape(-1,1),
#             X_params_char_train,
#             X_title_char_train,
            stats
#             X_descr_char_train
#             np.log10(user_mean_delay+1),
#             np.log10(user_placed_ads+1),
            
            ])


In [108]:
# ОБУЧАЮЩАЯ ВЫБОРКА

X_test = hstack([X_params_test, X_descr_test, X_title_test,
#             X_norm_descriptions_train, X_norm_title_train,
#             X_pos_descriptions_train, X_pos_title_train, 
#             X_pos_params_train,
            X_user_type_test, X_user_id_test, X_top1_test,
            X_category_name_test, X_parent_category_name_test,
            X_city_test, X_region_test,
            X_week_test, has_price_test, has_img_test, has_descr_test,
            has_param_1_test,
            has_param_2_test,
            has_param_3_test,
            X_ft_cluster_test,
            X_param_1_test,
            X_param_2_test,
            X_param_3_test,
             X_title_char_test,
            X_item_seq_number_test,
#             X_city_stat.reshape(-1,1),
#             X_city_stat_price.reshape(-1,1),
#             X_category_name_stat.reshape(-1,1),
#             X_category_name_stat_price.reshape(-1,1),
#             X_param_1_stat.reshape(-1,1),
#             X_param_1_stat_price.reshape(-1,1),
#             X_param_2_stat.reshape(-1,1),
#             X_param_2_stat_price.reshape(-1,1),
#             X_param_3_stat.reshape(-1,1),
#             X_param_3_stat_price.reshape(-1,1),
#             X_params_char_train,
#             X_title_char_train,
            stats_test
#             X_descr_char_train
#             np.log10(user_mean_delay+1),
#             np.log10(user_placed_ads+1),
            
            ])


In [60]:
y = train.deal_probability.values

In [118]:
del X_test

NameError: name 'X_test' is not defined

In [156]:
del X
import gc
gc.collect()

0

In [129]:
X.shape

(1503424, 207827)

In [155]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)

In [157]:
regr = Ridge(alpha=10, solver='sag', fit_intercept=False)
regr.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='sag', tol=0.001)

In [158]:
y_pred = regr.predict(X_valid)
y_preds = []
for j in y_pred:
    if j < 0:
        j = 0
    elif j > 1:
        j = 0.9
    y_preds.append(j)

print(np.sqrt(mean_squared_error(y_valid, y_preds)))

0.2251409026823776


In [43]:
from xgboost.sklearn import XGBRegressor  
import xgboost as xgb
import scipy.stats as st
from sklearn.cross_validation import train_test_split

In [None]:
dtrain = xgb.DMatrix(X, label=y)

In [113]:
params = {  
    "n_estimators": 10000,
    "max_depth": 10,
    "learning_rate": 0.3,
    "nthreads":3,
    "lambda":10,
    "colsample_bytree":0.9,
    "subsample":0.9,
    "objective":'reg:linear',
#     'booster':'gblinear'
#     "colsample_bytree": one_to_left,
#     "subsample": one_to_left,
#     "gamma": st.uniform(0, 10),
#     'reg_alpha': from_zero_positive,
#     "min_child_weight": from_zero_positive,
}

In [114]:
evallist = [(dtrain, 'train')]

In [115]:
num_round = 100
bst = xgb.train(params, dtrain, num_round, evallist)

[0]	train-rmse:0.353175
[1]	train-rmse:0.297997
[2]	train-rmse:0.265929
[3]	train-rmse:0.248351
[4]	train-rmse:0.238402
[5]	train-rmse:0.23329
[6]	train-rmse:0.230112
[7]	train-rmse:0.228307
[8]	train-rmse:0.226885
[9]	train-rmse:0.226282
[10]	train-rmse:0.225757
[11]	train-rmse:0.225372
[12]	train-rmse:0.224937
[13]	train-rmse:0.22436
[14]	train-rmse:0.224185
[15]	train-rmse:0.223909
[16]	train-rmse:0.223646
[17]	train-rmse:0.223451
[18]	train-rmse:0.223416
[19]	train-rmse:0.223224
[20]	train-rmse:0.223224
[21]	train-rmse:0.223031
[22]	train-rmse:0.222618
[23]	train-rmse:0.222618
[24]	train-rmse:0.222618
[25]	train-rmse:0.222565
[26]	train-rmse:0.222565
[27]	train-rmse:0.222565
[28]	train-rmse:0.222404
[29]	train-rmse:0.222404
[30]	train-rmse:0.222404
[31]	train-rmse:0.221899
[32]	train-rmse:0.221899
[33]	train-rmse:0.221899
[34]	train-rmse:0.221899
[35]	train-rmse:0.221831


KeyboardInterrupt: 

In [89]:
bst.save_model('xgb')

In [90]:
bst = xgb.Booster({'nthread': 4})  # init model
bst.load_model('xgb')

In [91]:
num_round = 100
bst = xgb.train(params, dtrain, num_round, evallist,  xgb_model=bst)

[0]	train-rmse:0.218404
[1]	train-rmse:0.218404
[2]	train-rmse:0.218349
[3]	train-rmse:0.218349
[4]	train-rmse:0.218298
[5]	train-rmse:0.218298
[6]	train-rmse:0.218295
[7]	train-rmse:0.218258
[8]	train-rmse:0.218258
[9]	train-rmse:0.218258
[10]	train-rmse:0.218258
[11]	train-rmse:0.218224
[12]	train-rmse:0.218215
[13]	train-rmse:0.218215
[14]	train-rmse:0.218215
[15]	train-rmse:0.218215
[16]	train-rmse:0.218176
[17]	train-rmse:0.218134
[18]	train-rmse:0.218075
[19]	train-rmse:0.218037
[20]	train-rmse:0.218037
[21]	train-rmse:0.21803
[22]	train-rmse:0.218027
[23]	train-rmse:0.217984
[24]	train-rmse:0.217984
[25]	train-rmse:0.217984
[26]	train-rmse:0.217984
[27]	train-rmse:0.217984
[28]	train-rmse:0.217866
[29]	train-rmse:0.217866
[30]	train-rmse:0.217866
[31]	train-rmse:0.217795
[32]	train-rmse:0.217747
[33]	train-rmse:0.217692
[34]	train-rmse:0.217682
[35]	train-rmse:0.21765
[36]	train-rmse:0.217638
[37]	train-rmse:0.217638
[38]	train-rmse:0.217605
[39]	train-rmse:0.2176
[40]	train-rms

In [83]:
dtest = xgb.DMatrix(X_test)

In [92]:
y_pred = bst.predict(dtest)

In [109]:
y_pred = regr.predict(X_test)

In [110]:
y_pred

array([0.17934903, 0.22425888, 0.1936817 , ..., 0.04652682, 0.35061284,
       0.10118181])

In [111]:
f = open('submission_41.txt', 'w')
f.write('item_id,deal_probability\n')
for i, proba in list(zip(test.item_id.values, y_pred)):
    if proba < 0:
        proba = 0
    elif proba > 1:
        proba = 0.9
    f.write(','.join([i, str(proba)]) + '\n')
f.close()

In [86]:
test.item_id.values

array(['6544e41a8817', '65b9484d670f', '8bab230b2ecd', ...,
       'a22a2eeb5dd2', 'ed7fbb0733c1', 'd374d332992f'], dtype=object)

In [88]:
y_pred

array([0.14005741, 0.2369861 , 0.18089867, ..., 0.03736338, 0.33610606,
       0.11328503], dtype=float32)

In [93]:
y_pred

array([0.13847348, 0.24325353, 0.19394764, ..., 0.03695092, 0.33129305,
       0.11287257], dtype=float32)

In [None]:
r