In [10]:
%matplotlib inline

import numpy as np
import pandas as pd
import scipy as sp

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
                                                    
import warnings
warnings.filterwarnings('ignore')

np.random.seed(44)

In [27]:
def get_hash_data(train, test):
    df = pd.concat((train, test), axis=0, ignore_index=True)
    split_len = len(train)

    # TF-IDF Feature
    tfv = TfidfVectorizer(min_df=3)
#     tfv = CountVectorizer(min_df=1, binary=1)
    df = df[['phone_brand', 'device_model', 'app_labels']].astype(np.str).apply(
        lambda x: ' '.join(s for s in x), axis=1).fillna("Missing")
    df_tfv = tfv.fit_transform(df)

    train = df_tfv[:split_len, :]
    test = df_tfv[split_len:, :]
    return train, test

In [28]:
# load files
train = pd.read_csv('../processed/train_processed.csv')
test = pd.read_csv('../processed/test_processed.csv')

In [29]:
## Encode categorical variables to numerical variables

lbl = LabelEncoder()

lbl.fit(train.group)
y = lbl.transform(train.group)

In [30]:
train, test = get_hash_data(train, test)

## Split into training and test set

In [8]:
X_train, X_test, y_train, y_test = train_test_split(train, y, stratify=y, test_size=0.25, random_state=0)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(55983, 2045) (18662, 2045) (55983,) (18662,)


## Train the model

In [None]:
# tree = DecisionTreeClassifier(criterion='gini', max_depth=3, max_features='auto')
# tree.fit(X_train, y_train)

# tree = RandomForestClassifier(n_estimators=10, max_depth=7, max_features='auto', n_jobs=-1)
# tree.fit(X_train, y_train)

# tree = GradientBoostingClassifier()
# tree.fit(X_train, y_train)

# tree = XGBClassifier()
# tree.fit(X_train, y_train)

est = LogisticRegression(penalty='l1', C=10.)
est.fit(X_train, y_train)

print('Log Loss on the training set %f ' %(log_loss(y_train, est.predict_proba(X_train))))
print('Log Loss on the test set %f ' %(log_loss(y_test, est.predict_proba(X_test))))

## Cross Validation

In [None]:
skf = StratifiedKFold(y_train, n_folds=3, random_state=23)
cv_scores = cross_val_score(tree, X_train, y_train, scoring='log_loss', cv=skf, n_jobs=1)
print('Mean score %f and standard deviation %f '%(cv_scores.mean(), cv_scores.std()))

In [None]:
print('Score on unseen data %f '%(log_loss(y_test, tree.predict_proba(X_test))))

### Tuning Parameters

In [None]:
tuned_parameters = {
        'n_estimators': [100],
        'max_depth': [5, 6, 7, 8, 9, 10],
        'max_features': ['auto', 'sqrt']
    }

skf = StratifiedKFold(y=y_train, random_state=1)
clf = GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1), param_grid=tuned_parameters, cv=skf, scoring='log_loss')
clf.fit(X_train, y_train)

In [None]:
print clf.grid_scores_
print 
print clf.best_params_
print 
print clf.best_score_

In [None]:
print 'Log loss on unseen data %f '%(log_loss(y_test, clf.best_estimator_.predict_proba(X_test)))

In [20]:
# clf.best_estimator_.fit(X, y)

tree.fit(train, y)
predictions = tree.predict_proba(test)

In [21]:
submission = pd.read_csv('../data/sample_submission.csv'); submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
1,-1547860181818787117,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
2,7374582448058474277,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
3,-6220210354783429585,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833
4,-5893464122623104785,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833,0.0833


In [22]:
submission[submission.columns[1:]] = predictions

In [23]:
submission.head()

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.022274,0.039859,0.036342,0.065651,0.098476,0.07034,0.01524,0.073857,0.083236,0.12075,0.209848,0.164127
1,-1547860181818787117,0.017497,0.02054,0.022062,0.042982,0.086345,0.072651,0.048688,0.09205,0.067706,0.124762,0.182959,0.221757
2,7374582448058474277,0.07579,0.061242,0.042909,0.063825,0.072759,0.055896,0.109174,0.130521,0.071826,0.093065,0.117318,0.105676
3,-6220210354783429585,0.017497,0.02054,0.022062,0.042982,0.086345,0.072651,0.048688,0.09205,0.067706,0.124762,0.182959,0.221757
4,-5893464122623104785,0.07579,0.061242,0.042909,0.063825,0.072759,0.055896,0.109174,0.130521,0.071826,0.093065,0.117318,0.105676


In [24]:
submission.to_csv('../submissions/bag_of_words_decision_tree_2.184853.csv', index=False)