**HACKATHON**
***
Data Description

train - обучающая выборка   
test - по которой строится лидерборд   
*.pckl - это полные файлы (запикленные датафреймы)
*.pckl - это pickle-файлы при открытии которых открываются в датафреймы   
train_sample.pckl.zip - первые 100000 строк трейна (запикленные датафреймы)    

*вас может удивить большое число данных, это сделано специально в реальном мире вам доступна куча данных, потому вам всегда надо решать какие из них важны а какие нет   
трейн и тест фактически продублированы CSV файлами в которых есть только часть столбцов (чтоб можно было считать кусочками)   
так же есть первые 100000 строк трейна запикленных отдельно   
использовать все данные не всегда обязательно, не исключено наличие мусора который вам не понадобится.*   

train_4_col_2.csv - кусок TRAIN, который содержит id, category, subcategory (строки все)   
train_4_col_3.csv - кусок TRAIN, который содержит id, fields (строки все)   

test_3_col_2.csv - кусок TEST, который содержит id, category, subcategory (строки все)   
test_3_col_3.csv - кусок TEST, который содержит id, fields (строки все)   

train_sample.pckl - кусок TRAIN, который содержит все поля, но только первые 100000 строк   
train_4_col.csv - кусок TRAIN, который содержит id, name, description, price (строки все)   
test_4_col.csv - кусок TEST, который содержит id, name, description (строки все)   

In [1]:
import pandas as pd
import numpy as np

import pickle
import nltk
import re
import csv
import reverse_geocoder as rg
import pymorphy2

from pymorphy2.tagset import OpencorporaTag
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_log_error 

from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


  from numpy.core.umath_tests import inner1d


In [2]:
ma = pymorphy2.MorphAnalyzer()
parse_results = {}
latin = OpencorporaTag('LATN')

def get_parse_result(word):
    word = word.lower()
    if not (word in parse_results):
        pv = ma.parse(word)
        for p in pv:
            if p.tag.POS in ['ADJF', 'NOUN', 'VERB'] or p.tag == latin:
                parse_results[word] = p.normal_form
                break

    if not (word in parse_results):
        parse_results[word] = None

    return parse_results[word]


# getting words
def getMeaningfullWords(text):
    meaning_words = []
    clean_text = re.sub('—.*', '', text)
    all_words = re.findall('[А-ЯЁа-яёA-Za-z]{3,}', clean_text)

    for word in all_words:
        parse_result = get_parse_result(word)
        if parse_result is not None:
            meaning_words.append(parse_result)

    return meaning_words

# adding stop-words:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')    
count_vect = TfidfVectorizer(
    tokenizer=getMeaningfullWords,
    stop_words=stop_words,
    smooth_idf=True,
    sublinear_tf=True,
    min_df=0.025,
    max_df=0.7,
    norm='l2')

def df_to_cv(df, test): 
    
    #  !!!!!! train dataset will use fit_transform, but test dataset should use just transform:
    if not test:
        matrix_tfidf = count_vect.fit_transform(df['name']).toarray()
    else:
        matrix_tfidf = count_vect.transform(df['name']).toarray()

    df = df.join(
        pd.DataFrame(matrix_tfidf, columns=count_vect.get_feature_names()),
        how='left')
    df = df.drop('name', axis=1)
    
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikhail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def preproc(data_frame, test):

    df = pd.DataFrame(data_frame)

    #   translate price to rubles:
    if not test:
        df['price'] = df['price'].map(lambda x: int(x / 100))
        df = df[df['price'] >= 0]
    
    #   drop ID's:
    df = df.drop('id', axis=1)

    #   True/False to 1/0 respectively:
    df['payment_available'] = df['payment_available'].map({False: 0, True: 1})

    #   parse coordinates and add cities with suburbs:
    coords = []
    for i, row in df.iterrows():
        coords.append(
            (row['location']['latitude'], row['location']['longitude']))
    coords = rg.search(coords)
    coords = pd.DataFrame(coords)
    coords = coords.rename(
        columns={'admin1': 'city', 'name': 'subcity'})
    df = df.join(coords[['subcity', 'lat', 'lon', 'city']])

    #   encode rare cities and suburbs into standalone groups
    df = df.apply(lambda x: x.mask(x.map(x.value_counts()) <
                                   110, 'other_city') if x.name == 'city' else x)
    df = df.apply(lambda x: x.mask(x.map(x.value_counts()) <= 10,
                                   'other_subcity') if x.name == 'subcity' else x)
    
    #  and encode cities and suburbs with their frequencies:
    df = df.apply(lambda x: x.map(x.value_counts())
                  if x.name == 'subcity' else x)
    df = df.apply(lambda x: x.map(x.value_counts()) if x.name == 'city' else x)

    #  clear descriptions from double spaces in words:
    df['description'] = df['description'].map(
        lambda x: x.lower().strip().replace("  ", " "))
    
    #  join names, description and subways (increasing significance):
    df['name'] = (df['name']+' ')*2
    df['name'] = df[['name', 'description']].apply(lambda x: ' '.join(x), axis=1)
    df['subway'] = df['subway'].fillna('').astype(str)
    df['name'] = df[['name', 'subway']].apply(lambda x: ' '.join(x), axis=1)

    #   replace description to his length:
    df['description'] = df['description'].map(lambda x: len(x))

    #   the same with images:
    df['images'] = df['images'].map(lambda x: len(x))

    #   amount of dictionaries with fields:
    df['fields'] = df['fields'].map(lambda x: len(x))
    
    #   drop columns with non-important data:
    df = df.drop(['can_buy', 'can_promote', 'contacts_visible', 'mortgage_available', 'delivery_available',
                  'fields', 'category', 'location', 'images', 'subway'], axis=1)

    return df.fillna(0)

***
# Testing #   

let's play with sample pickle

In [4]:
train_sample = pickle.load(open('data/train_sample.pckl', 'rb')).reset_index(drop=True)

In [5]:
#take a look around
train_sample.head()

Unnamed: 0,can_buy,can_promote,category,contacts_visible,date_created,delivery_available,description,fields,id,images,location,mortgage_available,name,payment_available,price,subcategory,subway
0,False,False,9,True,1492780671,False,,"[{'field': {'name': 'Женский гардероб', 'id': ...",3edeb34cf93f490ff760af85,"[{'id': '58fa06746c86cb4f22313452', 'num': 1, ...","{'latitude': 55.806888, 'longitude': 37.546077}",False,Сумка DG,True,199900,914,
1,False,False,22,True,1476824319,False,8-12 лет,"[{'field': {'name': 'Детский гардероб', 'id': ...",c98febd50dad3cc0ffc86085,"[{'id': '58068ccc04559f59bdbda92d', 'num': 1, ...","{'latitude': 55.692979, 'longitude': 37.872337}",False,Комплект,False,35000,2202,
2,False,False,22,True,1473004313,False,"На девочку 1,5 г,состояние хорошее","[{'field': {'name': 'Детский гардероб', 'id': ...",ade01e13912a46a99134cc75,"[{'id': '57cc42ecd53f3dcf17dc01c8', 'num': 1, ...","{'latitude': 55.639011, 'longitude': 37.349378}",False,Пальтишко демисезонное,False,30000,2204,
3,False,False,22,True,1476307221,False,"Размер-135mm, euro-22.5.Прочная, мягкая, не ск...","[{'field': {'name': 'Детский гардероб', 'id': ...",ab3e6941c11304c1519aef75,"[{'id': '580546528ae74be97723532e', 'num': 1, ...","{'latitude': 55.847334, 'longitude': 37.495834}",False,Attipas,True,80000,2209,
4,False,False,22,True,1503487787,False,,"[{'field': {'name': 'Детский гардероб', 'id': ...",252452a91c944a22c276d995,"[{'id': '599d66f3f235022f7411a535', 'num': 1, ...","{'latitude': 60.044826, 'longitude': 30.35546}",False,Жилет теплый,True,50000,2204,


In [6]:
#prepare the data
train_sample = train_sample.pipe(preproc, False).pipe(df_to_cv, False)

Loading formatted geocoded file...


In [9]:
# train on the part where price is existing:
part_df = train_sample[train_sample['price'] > 0]
X_part, y_part = part_df.drop('price', axis=1).values, part_df['price'].values
X, y = train_sample.drop('price', axis=1).values, train_sample['price'].values

In [10]:
X_part.shape

(97185, 44)

In [11]:
X.shape

(100000, 44)

# Using RandomForest #
'cos I like it I dunno why...

In [12]:
rfr = RandomForestRegressor(n_jobs=-1, random_state=123, n_estimators=50)
param_grid = {'n_estimators': range(100,101)}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
rs = RandomizedSearchCV(rfr, param_distributions=param_grid, n_iter=1, n_jobs=-1,cv=cv, scoring='neg_mean_squared_log_error', random_state=123)

In [13]:
# let's roll...
rs.fit(X_part, y_part)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
          error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=123, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=1, n_jobs=-1,
          param_distributions={'n_estimators': range(100, 101)},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_log_error',
          verbose=0)

In [14]:
rs.best_params_

{'n_estimators': 100}

In [16]:
best_model = rs.best_estimator_

# Read test_hack and then make the prediction

In [17]:
test_hack = pickle.load(open('data/test_hack.pckl', 'rb')).reset_index(drop=True)

In [18]:
predict_file = pd.read_csv('data/submit_Sample.csv', delimiter=',', encoding='utf8', index_col='id')

In [19]:
predict_file.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
285ea2e9935ccdeb8378c6a5,0
adfb73820bbb831257df6e95,0
783025601c36202f633fc6a5,0
2f0cd2d2e15dc90afd847f95,0
5c23a37902855a20172845a5,0


In [20]:
# parse and prepare test_hack
test_hack = test_hack.pipe(preproc, True).pipe(df_to_cv, True)

In [21]:
test_hack.head().T

Unnamed: 0,0,1,2,3,4
date_created,1517061944.0,1508310389.0,1517089590.0,1509378271.0,1515487857.0
description,28.0,122.0,634.0,0.0,145.0
payment_available,1.0,1.0,0.0,1.0,0.0
subcategory,603.0,203.0,116.0,1009.0,1104.0
subcity,9506.0,576.0,1090.0,1657.0,2281.0
lat,54.74306,59.73833,55.98028,55.80961,59.84167
lon,55.96779,30.08944,37.135,37.78739,30.25583
city,16248.0,42337.0,91690.0,87647.0,42337.0
весь,0.0,0.0,0.580176,0.0,0.0
год,0.0,0.671227,0.40946,0.0,0.0


In [22]:
test_hack.shape

(749525, 44)

In [23]:
# finally make a prediction
predict_file['price'] = best_model.predict(test_hack)

In [25]:
# cast prices
predict_file['price'] = predict_file['price'].map(lambda x: int(x))
# and finally save the result
predict_file.to_csv('predict-v4.csv')