In [54]:
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import grid_search
# import xgboost as xgb
# from xgboost.sklearn import XGBClassifier

In [55]:
zf = zipfile.ZipFile('./data/train_users_2.csv.zip')
df_train = pd.read_csv(zf.open('train_users_2.csv'))

zf = zipfile.ZipFile('./data/test_users.csv.zip')
df_test = pd.read_csv(zf.open('test_users.csv'))

In [56]:
zf = zipfile.ZipFile('./data/sessions.csv.zip')
df_session = pd.read_csv(zf.open('sessions.csv'))

In [57]:
zf = zipfile.ZipFile('./data/age_gender_bkts.csv.zip')
df_age_gender = pd.read_csv(zf.open('age_gender_bkts.csv'))

In [58]:
# zf = zipfile.ZipFile('./data/countries.csv.zip')
# df_countries = pd11.read_csv(zf.open('countries.csv'))

## Exploração dos dados



plt.figure(figsize=(20,8))
sns.barplot(x='mean_age', y='population_in_thousands', hue='gender', data=df_age_gender, ci=None)

## Preparação dos dados

##### Juntar dados de treinamento e teste.

In [59]:
# df_train = df_train.drop(['country_destination'], axis=1)

In [103]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [104]:
df_all.columns

Index([u'affiliate_channel', u'affiliate_provider', u'age',
       u'country_destination', u'date_account_created', u'date_first_booking',
       u'first_affiliate_tracked', u'first_browser', u'first_device_type',
       u'gender', u'id', u'language', u'signup_app', u'signup_flow',
       u'signup_method', u'timestamp_first_active'],
      dtype='object')

### Tratamento de Idiomas

In [105]:
df_all.language.unique()

array(['en', 'fr', 'de', 'es', 'it', 'pt', 'zh', 'ko', 'ja', 'ru', 'pl',
       'el', 'sv', 'nl', 'hu', 'da', 'id', 'fi', 'no', 'tr', 'th', 'cs',
       'hr', 'ca', 'is', '-unknown-'], dtype=object)

Substituindo gêneros "-unknown-" por "MISSING"

In [106]:
df_all.language.replace('-unknown-', 'MISSING_LANGUAGE', inplace=True)

### Tratamento de Gêneros

In [107]:
df_all.gender.unique()

array(['-unknown-', 'MALE', 'FEMALE', 'OTHER'], dtype=object)

Substituindo gêneros "-unknown-" por "MISSING"

In [108]:
df_all.gender.replace('-unknown-', 'MISSING_GENDER', inplace=True)

In [109]:
df_all[df_all.gender == 'MISSING_GENDER'].shape[0]

129480

### Tratamento de Idades

In [110]:
user_with_year_age_mask = df_all['age'] > 1000

In [111]:
df_all.loc[user_with_year_age_mask, 'age'] = 2015 - df_all.loc[user_with_year_age_mask, 'age']

In [112]:
df_all.loc[(df_all['age'] > 100) | (df_all['age'] < 18), 'age'] = -1

In [113]:
df_all['age'].fillna(-1, inplace=True)

In [114]:
bins = [-1, 21, 25, 30, 35, 40, 50, 60, 75, 100]
df_all['age_group'] = np.digitize(df_all['age'], bins, right=True)

In [115]:
df_all.head()

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,id,language,signup_app,signup_flow,signup_method,timestamp_first_active,age_group
0,direct,direct,-1.0,NDF,2010-06-28,,untracked,Chrome,Mac Desktop,MISSING_GENDER,gxn3p5htnn,en,Web,0,facebook,20090319043255,0
1,seo,google,38.0,NDF,2011-05-25,,untracked,Chrome,Mac Desktop,MALE,820tgsjxq7,en,Web,0,facebook,20090523174809,5
2,direct,direct,56.0,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,4ft3gnwmtx,en,Web,3,basic,20090609231247,7
3,direct,direct,42.0,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,bjjt8pjhuk,en,Web,0,facebook,20091031060129,6
4,direct,direct,41.0,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,MISSING_GENDER,87mebub9p4,en,Web,0,basic,20091208061105,6


### Apagando colunas

In [116]:
drop_features = [
    'age',
    'date_account_created',
    'date_first_booking',
    'timestamp_first_active',
    'affiliate_provider',
    'affiliate_channel',
    'first_affiliate_tracked',
    'first_browser'
]

df_all.drop(drop_features, axis=1, inplace=True)
df_all.columns

Index([u'country_destination', u'first_device_type', u'gender', u'id',
       u'language', u'signup_app', u'signup_flow', u'signup_method',
       u'age_group'],
      dtype='object')

### Adicionando colunas

In [117]:
total_seconds = df_session.groupby('user_id')['secs_elapsed'].sum()

In [118]:
num_devices = df_session.groupby('user_id')['device_type'].nunique()

In [119]:
distinct_sessions = df_session.groupby('user_id')['action'].nunique()

In [120]:
bigger_session = df_session.groupby('user_id')['secs_elapsed'].max()

In [121]:
def session_features(df):
    df['total_seconds'] = df['id'].apply(lambda x: total_seconds[x] if x in total_seconds else 0)
    df['num_devices'] = df['id'].apply(lambda x: num_devices[x] if x in num_devices else 0)
    df['distinct_sessions'] = df['id'].apply(lambda x: distinct_sessions[x] if x in distinct_sessions else 0)
    df['bigger_session'] = df['id'].apply(lambda x: bigger_session[x] if x in bigger_session else 0)
    return df

In [122]:
df_all.columns

Index([u'country_destination', u'first_device_type', u'gender', u'id',
       u'language', u'signup_app', u'signup_flow', u'signup_method',
       u'age_group'],
      dtype='object')

In [123]:
categoricals = ['first_device_type', 'gender', 'signup_method', 'signup_flow', 'language', 'signup_app']

df_all = pd.get_dummies(df_all, columns=categoricals, prefix='is')

In [124]:
df_all.head()

Unnamed: 0,country_destination,id,age_group,is_Android Phone,is_Android Tablet,is_Desktop (Other),is_Mac Desktop,is_Other/Unknown,is_SmartPhone (Other),is_Windows Desktop,...,is_pt,is_ru,is_sv,is_th,is_tr,is_zh,is_Android,is_Moweb,is_Web,is_iOS
0,NDF,gxn3p5htnn,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,NDF,820tgsjxq7,5,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,US,4ft3gnwmtx,7,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,other,bjjt8pjhuk,6,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,US,87mebub9p4,6,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### BACKUP df_all

In [125]:
bkp = df_all

In [126]:
df_all = bkp

In [127]:
df_all.set_index('id', inplace=True)

In [128]:
train_processed = df_all.loc[df_train['id']]

In [129]:
test_processed = df_all.loc[df_test['id']]

In [130]:
train_processed.shape[0]

213451

In [131]:
test_processed.shape[0]

62096

In [132]:
train_processed.columns

Index([u'country_destination', u'age_group', u'is_Android Phone',
       u'is_Android Tablet', u'is_Desktop (Other)', u'is_Mac Desktop',
       u'is_Other/Unknown', u'is_SmartPhone (Other)', u'is_Windows Desktop',
       u'is_iPad', u'is_iPhone', u'is_FEMALE', u'is_MALE',
       u'is_MISSING_GENDER', u'is_OTHER', u'is_basic', u'is_facebook',
       u'is_google', u'is_weibo', u'is_0', u'is_1', u'is_2', u'is_3', u'is_4',
       u'is_5', u'is_6', u'is_8', u'is_10', u'is_12', u'is_14', u'is_15',
       u'is_16', u'is_20', u'is_21', u'is_23', u'is_24', u'is_25',
       u'is_MISSING_LANGUAGE', u'is_ca', u'is_cs', u'is_da', u'is_de',
       u'is_el', u'is_en', u'is_es', u'is_fi', u'is_fr', u'is_hr', u'is_hu',
       u'is_id', u'is_is', u'is_it', u'is_ja', u'is_ko', u'is_nl', u'is_no',
       u'is_pl', u'is_pt', u'is_ru', u'is_sv', u'is_th', u'is_tr', u'is_zh',
       u'is_Android', u'is_Moweb', u'is_Web', u'is_iOS'],
      dtype='object')

In [133]:
y = train_processed['country_destination']
train_processed.drop(['country_destination'], axis=1, inplace=True)
X = train_processed.values

## Machine Learning

##### NDCG Scorer
A custom NDCG scorer from kaggle scripts will be used for training.

In [134]:
# def ndcg5_score(preds, dtrain):
#     labels = dtrain.get_label()
#     top = []

#     for i in range(preds.shape[0]):
#         top.append(np.argsort(preds[i])[::-1][:5])

#     mat = np.reshape(np.repeat(labels,np.shape(top)[1]) == np.array(top).ravel(),np.array(top).shape).astype(int)
#     score = np.mean(np.sum(mat/np.log2(np.arange(2, mat.shape[1] + 2)),axis = 1))
#     return 'ndcg5', score

In [142]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.
    
    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.
        
    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)

def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    lb = LabelBinarizer()
    #lb.fit(range(len(predictions) + 1))  ## original
    #lb.fit(range(predictions.shape[1] + 1))
    #T = lb.transform(ground_truth)
    
    T = lb.fit_transform(ground_truth) 

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)

# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.02, random_state=42)

### Random Forest Classifier

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
clf = RandomForestClassifier(n_estimators=25, random_state=101)

In [3]:
parameters = {'min_samples_split': [2, 20],
              'max_depth': [6, 12]
             }
reg = grid_search.GridSearchCV(clf, parameters, scoring=ndcg_scorer, cv=3)

NameError: name 'grid_search' is not defined

In [None]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# ye = le.fit_transform(y)

In [6]:
reg.fit(X, y)

NameError: name 'reg' is not defined

In [7]:
est = reg.best_estimator_
est

NameError: name 'reg' is not defined

In [4]:
reg.grid_scores_

NameError: name 'reg' is not defined

In [5]:
print 'Random Forest:\n', metrics.classification_report(y, est.predict(X))

Random Forest:


NameError: name 'y' is not defined

### XGBOOST


Train XGBoost classifier
XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework, which produces an ensemble of weak decision tree learners via additive training (boosting).

The advantages of Gradient Tree Boosting are:

Natural handling of data of mixed type (= heterogeneous features)
Predictive power
Robustness to outliers in output space (via robust loss functions)
XGBoost is 20x faster than scikit-learn's Gradient Boosting Classifier and was used in the winning solution of kaggle's Higgs competition.

We will use the sklearn interface for XGBoost to train our model.

https://github.com/kcbighuge/p5-capstone/blob/master/notebooks/p5-capstone-airbnb.ipynb

## Conclusão e resultados