In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import pipeline,preprocessing,feature_extraction,metrics



In [2]:
train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

In [28]:
def lump_levels2(x, threshold_percentage, new_value):
    threshold = threshold_percentage * x.count()
    
    x = x.copy()
    value_counts = x.value_counts()
    label_with_freq = x.map(lambda val: value_counts[val])
    x[label_with_freq < threshold] = new_value
    return x

In [None]:
train_set.describe()

In [None]:
train_set[:5]

In [None]:
train_set.describe()

In [30]:
def preprocess(my_set, mappings = {}):
    user_ids = lump_levels2(my_set['user_id'].fillna(""), 0.000025,'other').astype('category')
    if ('user_id' not in mappings):
        user_ids_mapping = dict(enumerate(user_ids.cat.categories))
        user_ids_mapping_reverse = {v:k for k,v in user_ids_mapping.items()}
        mappings['user_id'] = user_ids_mapping_reverse
    
    regions = my_set['region'].astype('category')
    if ('region' not in mappings):
        regions_mapping = dict(enumerate(regions.cat.categories))
        regions_mapping_reverse = {v:k for k,v in regions_mapping.items()}
        mappings['region'] = regions_mapping_reverse

    cities = lump_levels2(my_set['city'].fillna(""), 0.0003,'other').astype('category')
    if ('city' not in mappings):
        cities_mapping = dict(enumerate(cities.cat.categories))
        cities_mapping_reverse = {v:k for k,v in cities_mapping.items()}
        mappings['city'] = cities_mapping_reverse

    parent_category_names = my_set['parent_category_name'].astype('category')
    if ('parent_category_name' not in mappings):
        parent_category_names_mapping = dict(enumerate(parent_category_names.cat.categories))
        parent_category_names_mapping_reverse = {v:k for k,v in parent_category_names_mapping.items()}
        mappings['parent_category_name'] = parent_category_names_mapping_reverse

    category_names = my_set['category_name'].astype('category')
    category_names_mapping = dict(enumerate(category_names.cat.categories))
    category_names_mapping_reverse = {v:k for k,v in category_names_mapping.items()}

    param_1 = my_set['param_1'].fillna("").astype('category')
    param_1_mapping = dict(enumerate(param_1.cat.categories))
    param_1_mapping_reverse = {v:k for k,v in param_1_mapping.items()}

    param_2 = my_set['param_2'].fillna("").astype('category')
    param_2_mapping = dict(enumerate(param_2.cat.categories))
    param_2_mapping_reverse = {v:k for k,v in param_2_mapping.items()}

    param_3 = lump_levels2(my_set['param_3'].fillna(""), 0.00005,'other').astype('category')
    param_3_mapping = dict(enumerate(param_3.cat.categories))
    param_3_mapping_reverse = {v:k for k,v in param_3_mapping.items()}

    text = my_set['title'].fillna("").str.cat(my_set['description'].fillna(""), sep=' ')
    text_length = text.map(lambda tt: len(tt))
    text_count = len(text.split())

    prices = np.log1p(my_set['price'].fillna(0))

    month = pd.to_datetime(my_set['activation_date'],format='%Y-%m-%d').map(lambda date: date.month)
    day = pd.to_datetime(my_set['activation_date'],format='%Y-%m-%d').map(lambda date: date.day)

    user_types = my_set['user_type'].fillna("").astype('category')
    user_types_mapping = dict(enumerate(user_types.cat.categories))
    user_types_mapping_reverse = {v:k for k,v in user_types_mapping.items()}

    df = pd.DataFrame()
    df['user_id'] = user_ids.cat.codes
    df['region'] = regions.cat.codes
    df['city'] = cities.cat.codes
    df['parent_category_name'] = parent_category_names.cat.codes
    df['category_name'] = category_names.cat.codes
    df['param_1'] = param_1.cat.codes
    df['param_2'] = param_2.cat.codes
    df['param_3'] = param_3.cat.codes
    df['text_length'] = text_length
    df['text_count'] = text_count
    df['text'] = text
    df['price'] = prices
    df['month'] = month
    df['day'] = day
    df['item_seq_number'] = train_set['item_seq_number'].fillna(-1)
    df['user_type'] = user_types.cat.codes
    df['image_top_1'] = train_set['image_top_1'].fillna(-1)
    

In [None]:
user_types_mapping

In [77]:
#region	city	parent_category_name	category_name	param_1	param_2	param_3	title	description	price	item_seq_number	activation_date	user_type	image	image_top_1


Y = train_set['deal_probability']
X = df
X[:3]

Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,month,day,image_top_1
0,675853,19,460,4,42,249,0,0,-0.004592,2,1,3,28,1008.0
1,173962,17,1300,2,22,122,0,0,-0.004552,19,1,3,26,692.0
2,440069,16,1276,0,2,84,0,0,-0.004537,9,1,3,20,3032.0


In [79]:
regressor_train = RandomForestRegressor(n_jobs=20,n_estimators=20)
regressor_train.fit(X, Y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=20,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [58]:
regressor = RandomForestRegressor(n_jobs=20,n_estimators=10)
scores = cross_val_score(regressor, X, Y, cv=3,scoring='neg_mean_squared_error')

In [59]:
print(scores.mean())
print(scores.std())

-0.05877895347531897
8.353597679938909e-05


In [80]:
user_id_t = test_set['user_id'].astype('category').cat.rename_categories(user_ids_mapping_reverse)
regions_t = test_set['region'].astype('category').cat.rename_categories(regions_mapping_reverse)
cities_t = test_set['city'].astype('category').cat.rename_categories(cities_mapping_reverse)
parent_category_name_t = test_set['parent_category_name'].astype('category').cat.rename_categories(parent_category_names_mapping_reverse)
category_name_t = test_set['category_name'].astype('category').cat.rename_categories(category_names_mapping_reverse)
param_1_t = test_set['param_1'].fillna("").astype('category').cat.rename_categories(param_1_mapping_reverse)
param_2_t = test_set['param_2'].fillna("").astype('category').cat.rename_categories(param_2_mapping_reverse)
param_3_t = test_set['param_3'].fillna("").astype('category').cat.rename_categories(param_3_mapping_reverse)
prices_t = preprocessing.scale(test_set['price'].fillna(0))
user_types_t = test_set['user_type'].fillna("").astype('category')

month_t = pd.to_datetime(test_set['activation_date'],format='%Y-%m-%d').map(lambda date: date.month)
day_t = pd.to_datetime(test_set['activation_date'],format='%Y-%m-%d').map(lambda date: date.day)

df_t = pd.DataFrame()
df_t['user_id'] = user_id_t.cat.codes
df_t['region'] = regions_t.cat.codes
df_t['city'] = cities_t.cat.codes
df_t['parent_category_name'] = parent_category_name_t.cat.codes
df_t['category_name'] = category_name_t.cat.codes
df_t['param_1'] = param_1_t.cat.codes
df_t['param_2'] = param_2_t.cat.codes
df_t['param_3'] = param_3_t.cat.codes
df_t['price'] = prices_t
df_t['item_seq_number'] = test_set['item_seq_number']
df_t['user_type'] = user_types_t.cat.codes
df_t['month'] = month_t
df_t['day'] = day_t
df_t['image_top_1'] = test_set['image_top_1'].fillna(0)

X_t = df_t
X_t[:3]

Unnamed: 0,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,month,day,image_top_1
0,262938,4,306,4,10,108,177,63,-0.050567,66,1,4,18,2020.0
1,55145,19,933,8,5,117,0,0,-0.04999,4,1,4,16,0.0
2,13649,12,147,0,2,310,0,0,-0.047682,15,1,4,17,2960.0


In [81]:
Y_t = regressor_train.predict(X_t)

In [88]:
df_out = pd.DataFrame()
df_out["item_id"] = test_set["item_id"]
df_out["deal_probability"] = Y_t
df_out.to_csv('output2.csv', index=False, index_label=False)

In [90]:
df_out.describe()

Unnamed: 0,deal_probability
count,508438.0
mean,0.199514
std,0.117772
min,0.0
25%,0.112679
50%,0.179613
75%,0.265493
max,0.975


TRY tf idf

In [66]:
cities_t

0          317
1          983
2          150
3         1305
4          242
5         1276
6         1107
7          592
8         1048
9         1199
10        1153
11         723
12        1525
13        1300
14         231
15        1244
16        1276
17         723
18        1048
19        1398
20         975
21         961
22        1083
23        1526
24        1063
25         975
26         975
27        1101
28        1153
29         723
          ... 
508408     965
508409     940
508410     317
508411     575
508412    1048
508413     592
508414    1098
508415    1300
508416     962
508417    1153
508418     975
508419     743
508420     723
508421    1674
508422     857
508423    1589
508424    1589
508425    1632
508426      66
508427    1153
508428     317
508429    1048
508430     652
508431     317
508432    1589
508433     645
508434    1526
508435     123
508436     949
508437     329
Name: city, Length: 508438, dtype: category
Categories (1644, object): [0, 1, 2, 3, .

In [98]:
tfidf = feature_extraction.text.TfidfVectorizer()
texttfidf = tfidf.fit_transform(text.fillna(""))

In [99]:
texttfidf

<1503424x667794 sparse matrix of type '<class 'numpy.float64'>'
	with 32874835 stored elements in Compressed Sparse Row format>

In [100]:
texttfidf.shape

(1503424, 667794)

In [101]:
text.describe()

count          1387148
unique         1375410
top       Платье Новое
freq                78
Name: title, dtype: object

In [102]:
texttfidf[0]

<1x667794 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [103]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)