### Preparation

In [None]:
import time
# Wall-clock start time; the final cell subtracts it to print the total notebook runtime.
notebookstart= time.time()

In [None]:
import numpy as np
import pandas as pd
import gc
import random

In [None]:
# Sci-Kit Learn Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from math import sqrt

# Light Gradient Boosting
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.cross_validation import KFold

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 
import re
import string

In [None]:
# Set parameters
SEED = 2018    # random seed reused by the fold splitter and the Ridge model
NFOLDS = 5     # number of cross-validation folds
# Seed Python's own RNG with the same value.
random.seed(SEED)

#### Define Functions (wrapper)

In [None]:
# define modeling function
class SklearnWrapper(object):
    """Thin adapter that gives an estimator a ``train`` / ``predict`` interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Value stored under ``'random_state'`` when ``seed_bool`` is true.
    params : dict or None, default None
        Keyword arguments for ``clf``. The dict is copied, so the caller's
        object is never modified; ``None`` means "no extra arguments".
    seed_bool : bool, default True
        Whether ``seed`` is injected as ``random_state``.
    """

    def __init__(self, clf, seed=0, params=None, seed_bool = True):
        # Work on a private copy so the caller's dict is not mutated and the
        # default params=None is usable.
        params = dict(params) if params is not None else {}
        if seed_bool:
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

In [None]:
# Out-of-fold (OOF) predictions for stacking.
# The method comes from the Kaggle kernel below and has been useful in several competitions.
# https://www.kaggle.com/mmueller/stacking-starter?scriptVersionId=390867
def get_oof(clf, x_train, y, x_test, folds=None):
    """Compute out-of-fold predictions on the training set and averaged test predictions.

    For every fold the model is trained on the fold's training rows, predicts
    the held-out rows (filling ``oof_train``) and predicts the whole test set;
    the per-fold test predictions are averaged.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, x_test : array-like or sparse matrix
        Feature matrices; they must support row indexing with integer arrays
        and expose ``shape``.
    y : array-like
        Targets in the same row order as ``x_train``. They are converted to a
        NumPy array so the fold indices are always applied positionally, even
        when ``y`` is a pandas Series with a non-integer index.
    folds : optional
        Either an iterable of ``(train_index, test_index)`` pairs or a splitter
        object with a ``split(X)`` method. Defaults to the notebook-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : tuple of ndarray
        Column vectors of shape ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    if folds is None:
        folds = kf  # notebook-level splitter
    if hasattr(folds, 'split'):
        # Splitter objects yield their index pairs through split().
        folds = folds.split(x_train)
    folds = list(folds)

    n_train, n_test = x_train.shape[0], x_test.shape[0]
    y_values = np.asarray(y)  # positional indexing regardless of y's index labels

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((len(folds), n_test))

    for i, (train_index, test_index) in enumerate(folds):
        print('\nFold {}'.format(i))
        clf.train(x_train[train_index], y_values[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions into a single vector.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# text cleaning
def cleanName(text):
    """Lower-case ``text``, strip a fixed set of symbols and collapse whitespace.

    Returns the cleaned string, or the sentinel ``"name error"`` when ``text``
    is not a string (for example ``None`` or a float NaN).
    """
    try:
        textProc = text.lower()
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        # split() + join collapses whitespace runs and trims both ends.
        return " ".join(textProc.split())
    except (AttributeError, TypeError):
        # Only "not a string" failures map to the sentinel; unrelated errors
        # propagate instead of being silently swallowed.
        return "name error"

In [None]:
# this competition uses RMSE score
def rmse(y, y0):
    """Root-mean-squared error between two equally long sequences ``y`` and ``y0``."""
    assert len(y) == len(y0)
    residual = y - y0
    return np.sqrt(np.mean(np.square(residual)))

In [None]:
# numerical to categorical preparation
# previously, during data cleaning, -999 was used to mark NA and other errors
def rep999(x):
    """Map the -999 sentinel to -1.

    Accepts a scalar, a pandas Series or a pandas DataFrame.
    ``DataFrame.apply`` passes one whole column (a Series) per call, and a
    plain ``if x == -999`` on a Series raises "truth value is ambiguous", so
    Series/DataFrame input is handled element-wise with ``replace``.

    Returns
    -------
    A new Series/DataFrame with every -999 replaced by -1, or for scalars
    ``-1`` when ``x == -999`` and ``x`` itself otherwise. Values that cannot
    be compared to -999 are returned unchanged.
    """
    if isinstance(x, (pd.Series, pd.DataFrame)):
        return x.replace(-999, -1)
    try:
        return -1 if x == -999 else x
    except (TypeError, ValueError):
        # The comparison is not meaningful for this value (e.g. an array); leave it as is.
        return x

#### Load Data

##### The only difference between lgb and lgbcat is that some numerical features are transformed into categorical features in lgbcat

In [None]:
# Raw training rows, indexed by item_id, with activation_date parsed as datetimes.
training = pd.read_csv(
    'train.csv',
    index_col="item_id",
    parse_dates=["activation_date"],
)
# Keep the item_id index so it can be re-attached after the feature frames are combined.
traindex = training.index
# New features were extracted separately and stored in another file.
newtrain = pd.read_csv('new_train_features.csv', header=0)

In [None]:
# Numerical to categorical transformation
newtrain = newtrain.apply(rep999)

# Bin edges per column; pd.cut buckets each column using these edges.
train_bin_edges = {
    "perform_white_analysis": range(-5, 100, 5),
    "perform_black_analysis": range(-5, 100, 5),
    "image_size": range(-100, 5000, 100),
    "average_pixel_width": range(-2, 50, 1),
    "get_blurrness_score": range(-200, 10000, 200),
    "rural": range(-10, 100, 10),
}
for col_name, edges in train_bin_edges.items():
    newtrain[col_name] = pd.cut(newtrain[col_name], edges)

# Colour averages are scaled by 100 and rounded to 2 decimals before being binned.
colour_cols = ["average_red", "average_green", "average_blue"]
newtrain[colour_cols] = newtrain[colour_cols].apply(lambda col: round(100 * col, 2))
for col_name in colour_cols:
    newtrain[col_name] = pd.cut(newtrain[col_name], range(-110, 110, 10))

# Population columns stay numeric: log-transformed after a shift of 2.
for col_name in ["city_population", "reg_Population"]:
    newtrain[col_name] = newtrain[col_name].apply(lambda x: np.log(x + 2))

In [None]:
# feature combination
# Both frames are aligned by row position (index reset), joined column-wise,
# and the item_id index is then put back.
training = pd.concat(
    [training.reset_index(drop=True), newtrain],
    axis=1,
)
training.index = traindex

In [None]:
# loading test data
# Raw test rows, indexed by item_id, with activation_date parsed as datetimes.
testing = pd.read_csv(
    'test.csv',
    index_col="item_id",
    parse_dates=["activation_date"],
)
# Keep the item_id index so it can be re-attached after the feature frames are combined.
testdex = testing.index
# New features were extracted separately and stored in another file.
newtest = pd.read_csv('new_test_features.csv', header=0)

In [None]:
# Numerical to categorical transformation
newtest = newtest.apply(rep999)

# Bin edges per column; pd.cut buckets each column using these edges.
test_bin_edges = {
    "perform_white_analysis": range(-5, 100, 5),
    "perform_black_analysis": range(-5, 100, 5),
    "image_size": range(-100, 5000, 100),
    "average_pixel_width": range(-2, 50, 1),
    "get_blurrness_score": range(-200, 10000, 200),
    "rural": range(-10, 100, 10),
}
for col_name, edges in test_bin_edges.items():
    newtest[col_name] = pd.cut(newtest[col_name], edges)

# Colour averages are scaled by 100 and rounded to 2 decimals before being binned.
colour_cols = ["average_red", "average_green", "average_blue"]
newtest[colour_cols] = newtest[colour_cols].apply(lambda col: round(100 * col, 2))
for col_name in colour_cols:
    newtest[col_name] = pd.cut(newtest[col_name], range(-110, 110, 10))

# Population columns stay numeric: log-transformed after a shift of 2.
for col_name in ["city_population", "reg_Population"]:
    newtest[col_name] = newtest[col_name].apply(lambda x: np.log(x + 2))

In [None]:
# feature combination
# Both frames are aligned by row position (index reset), joined column-wise,
# and the item_id index is then put back.
testing = pd.concat(
    [testing.reset_index(drop=True), newtest],
    axis=1,
)
testing.index = testdex

In [None]:
# data shape check
ntrain, ntest = training.shape[0], testing.shape[0]
# Separate the target from the features; the column is then removed from the frame in place.
y = training["deal_probability"].copy()
training.drop("deal_probability", axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

In [None]:
# k-fold modeling
# Shuffled, seeded K-fold splitter over the ntrain training rows; get_oof iterates it
# to obtain (train_index, test_index) pairs.
# NOTE(review): this is the legacy sklearn.cross_validation.KFold call form (row count
# first, n_folds=...). sklearn.model_selection.KFold takes n_splits= and is consumed via
# .split(X) — confirm which scikit-learn version this notebook is meant to run on.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

In [None]:
# training and testing concatenation
# Rows are stacked train-first, then test, so both share one feature pipeline.
dfs = [training, testing]
df = pd.concat(dfs, axis="index")

In [None]:
# Create Validation Index and Remove Dead Variables
# Date-based split: rows activated up to 2017-04-07 vs. rows activated from 2017-04-08 on.
# NOTE(review): neither index is read again in the visible notebook — confirm they are still needed.
is_train_period = df.activation_date <= pd.to_datetime('2017-04-07')
is_valid_period = df.activation_date >= pd.to_datetime('2017-04-08')
training_index = df.loc[is_train_period].index
validation_index = df.loc[is_valid_period].index
# These two columns are not used as model features.
df.drop(["activation_date", "image"], axis=1, inplace=True)

#### Model Preparation

In [None]:
# categorical variables encoding
print("\nEncode Variables")
# categorical1: numeric features that were bucketed with pd.cut; categorical2: raw categorical columns.
categorical1 = ["perform_white_analysis","perform_black_analysis","image_size","average_pixel_width","get_blurrness_score","average_red","average_green","average_blue","rural"]
categorical2 = ["descsentiment","titlesentiment","user_id","region","city","parent_category_name","category_name","user_type","image_top_1","param_1","param_2","param_3","reg_Time_zone"]
categorical=categorical1+categorical2
print("Encoding :",categorical)

# One encoder instance is refitted for every column; only the integer codes are kept.
lbl = preprocessing.LabelEncoder()
for col in categorical1:
    df[col] = lbl.fit_transform(df[col].astype(str))
for col in categorical2:
    # Missing values get an explicit 'Unknown' label. The previous
    # `df[col].fillna('Unknown')` discarded its result, so NaN was encoded as
    # the string 'nan'. Filling after the string cast avoids any dtype upcast.
    labels = df[col].astype(str).where(df[col].notna(), 'Unknown')
    df[col] = lbl.fit_transform(labels)

In [None]:
# tfidf setting
print("Term Frequency Inverse Document Frequency Stage")
russian_stop = set(stopwords.words('russian'))

# Keyword arguments that are unpacked into the description TfidfVectorizer.
tfidf_para = dict(
    stop_words=russian_stop,
    analyzer='word',
    token_pattern=r'\w{1,}',
    sublinear_tf=True,
    dtype=np.float32,
    norm='l2',
    smooth_idf=False,
)

In [None]:
# text cleaning
textfeats = ["description", "title"]

# Count punctuation characters in the raw description (before it is cleaned).
punct_chars = set(string.punctuation)
df['desc_punc'] = df['description'].apply(
    lambda text: sum(1 for ch in str(text) if ch in punct_chars)
)

# Normalise both text columns with cleanName.
for text_col in ("title", "description"):
    df[text_col] = df[text_col].apply(cleanName)

In [None]:
# text vectorizer
def get_col(col_name):
    """Return a function that picks ``col_name`` out of a record (mapping)."""
    def pick(record):
        return record[col_name]
    return pick

# TF-IDF over description uni-/bi-grams, capped at 17000 features.
description_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=17000,
    preprocessor=get_col('description'),
    **tfidf_para
)

# Raw counts over title uni-/bi-grams.
title_counts = CountVectorizer(
    ngram_range=(1, 2),
    stop_words=russian_stop,
    #max_features=7000,
    preprocessor=get_col('title'),
)

vectorizer = FeatureUnion([
    ('description', description_tfidf),
    ('title', title_counts),
])

In [None]:
# Start the vectorization timer; the elapsed time is printed after the transform step.
start_vect=time.time()
# Each row of df is passed as a dict record; the vectorizers' preprocessors pick their column from it.
vectorizer.fit(df.to_dict('records'))

In [None]:
# Text features for all rows of df (train rows first, then test rows).
ready_df = vectorizer.transform(df.to_dict('records'))
# Feature names of the text columns, in the same order as the columns of ready_df.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out() instead — confirm
# against the installed version.
tfvocab = vectorizer.get_feature_names()
print("Vectorization Runtime: %0.2f Minutes"%((time.time() - start_vect)/60))

In [None]:
# Drop Text Cols
# The raw text is now represented by ready_df, so the string columns leave the dense frame.
textfeats = ["description", "title"]
df.drop(textfeats, axis="columns", inplace=True)

#### Running Ridge

In [None]:
# parameters
# Keyword arguments passed to the Ridge constructor through SklearnWrapper.
ridge_params = dict(
    alpha=30.0,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto',
    random_state=SEED,
)

In [None]:
# Ridge regression on the text features only.
ridge = SklearnWrapper(clf=Ridge, seed = SEED, params = ridge_params)
# The first ntrain rows of ready_df are the training rows, the rest are the test rows.
ridge_oof_train, ridge_oof_test = get_oof(ridge, ready_df[:ntrain], y, ready_df[ntrain:])

In [None]:
# Ridge Results
# The ridge predictions are fed into lgb as an extra feature to improve performance.
ridge_mse = mean_squared_error(y, ridge_oof_train)
rms = sqrt(ridge_mse)
print('Ridge OOF RMSE: {}'.format(rms))

In [None]:
# creating new feature
# Stack the train OOF predictions on top of the test predictions, matching df's row order.
ridge_preds = np.concatenate((ridge_oof_train, ridge_oof_test), axis=0)
df['ridge_preds'] = ridge_preds

In [None]:
# Combine Dense Features with Sparse Text Features
n_train_rows = traindex.shape[0]  # rows of ready_df before this offset are training rows
X = hstack([
    csr_matrix(df.loc[traindex, :].values),
    ready_df[:n_train_rows],
])
testing = hstack([
    csr_matrix(df.loc[testdex, :].values),
    ready_df[n_train_rows:],
])
# Dense column names come first, matching the hstack order above.
# NOTE(review): re-running this cell prepends the dense column names again.
tfvocab = df.columns.tolist() + tfvocab
print("Feature Names Length: ",len(tfvocab))

### Start lgb

In [None]:
# parameters
# LightGBM training configuration: gradient-boosted trees, regression objective, RMSE metric.
lgbm_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    # max_depth=15,
    num_leaves=270,
    feature_fraction=0.5,
    bagging_fraction=0.75,
    bagging_freq=2,
    learning_rate=0.0175,
    verbose=0,
)

In [None]:
# LGB Dataset
# Full training matrix with its targets; the label-encoded columns are declared categorical.
lgtrain = lgb.Dataset(
    X,
    label=y,
    feature_name=tfvocab,
    categorical_feature=categorical,
)

In [None]:
# running lgb
# the process would take 3-4 hours
lgb_clf = lgb.train(
    params=lgbm_params,
    train_set=lgtrain,
    num_boost_round=2000,
    verbose_eval=100,
)

### output results

In [None]:
# Predict on the combined test matrix and index the result by the test item_ids.
lgpred = lgb_clf.predict(testing)
lgbmodel = pd.DataFrame(lgpred,columns=["deal_probability"],index=testdex)

# The probability should be between 0 and 1.
# Assign the clipped column back explicitly: an in-place clip on a column
# selection is not guaranteed to modify lgbmodel itself.
lgbmodel['deal_probability'] = lgbmodel['deal_probability'].clip(0.0, 1.0)
lgbmodel.to_csv("lgb.csv",index=True,header=True)

print("Notebook Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))

##### The output was used for model blending and was not submitted to Kaggle directly
##### Its validation RMSE is 0.01 higher than that of the original lgb model