# Reference model
The first reference is a naive model that uses no information about the individual customer.
The probability that a test customer belongs to a given class is assumed to be that class's share of the training set: *n(customers in class) / n(all customers)*.

---

Evaluated with the logistic loss on the training set, the prediction scores are:
* Naive entries per class model: 
   - loss: **2.42786222642**

In [16]:
import os
import sys

import numpy as np
import pandas as pd
from pandas import DataFrame as df
import seaborn as sns

%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

In [3]:
# Use seaborn's 'ticks' style for every figure drawn in this notebook
sns.set_style(style='ticks')

In [262]:
# NOTE(review): machine-specific absolute path; nothing visible in this
# notebook imports from it -- confirm whether it is still needed.
tools_dir = "/home/mschlupp/pythonTools"
if tools_dir not in sys.path:  # keep sys.path free of duplicates when the cell is re-run
    sys.path.append(tools_dir)

# Working directory of the kernel; os.getcwd() is the plain-Python
# equivalent of the IPython-only `%pwd` magic.
tmp = os.getcwd()
# Both directories are strings ending in '/', because later cells build
# file names by plain string concatenation.
files_dir = tmp + "/files/"
solution_dir = tmp + '/predictions/'

In [5]:
# Load the labelled training devices and the unlabelled test devices
train, test = (
    pd.read_csv(files_dir + csv_name)
    for csv_name in ('gender_age_train.csv', 'gender_age_test.csv')
)

In [104]:
# Per-group count of the non-null entries in every remaining column
groups = (
    train
    .groupby('group')
    .count()
)

In [105]:
# Turn the per-group device counts into class frequencies (the naive probabilities).
# The counts are recomputed from `train` instead of dividing `groups.device_id`
# in place, so re-running this cell cannot normalise the column a second time.
groups['device_id'] = train.groupby('group')['device_id'].count() / len(train)

In [106]:
# show our naive prediction: one probability per group
groups['device_id']

group
F23-      0.067654
F24-26    0.056132
F27-28    0.041771
F29-32    0.062000
F33-42    0.074499
F43+      0.056186
M22-      0.100315
M23-26    0.128676
M27-28    0.072945
M29-31    0.097917
M32-38    0.126948
M39+      0.114957
Name: device_id, dtype: float64

In [17]:
# Allocate the prediction matrix: one row per training device, one column per group
prediction = np.zeros(shape=(len(train), len(groups)))

In [15]:
# The prediction is scored with log_loss, sklearn's
# logistic loss (cross-entropy) implementation.

# Encode the group names as numerical class labels first
labelEnc = LabelEncoder().fit(train.group)
true_group = labelEnc.transform(train.group)

In [146]:
# Empty frame with one column per group (kept so the name `dg` stays defined).
dg = df(columns=groups.index.values)
# One-row frame (row label 'device_id') holding the naive class probabilities,
# one column per group. DataFrame.append was removed in pandas 2.0, so the row
# is produced by transposing the Series instead of appending it to `dg`.
probs_per_group = groups.device_id.to_frame().T

In [164]:
# assign our probabilities to every row of the prediction array;
# NumPy broadcasts the single probability row across all rows, which
# replaces the per-row Python loop
prediction[:] = probs_per_group.values[0]

In [167]:
# Report the cross-entropy of the naive prediction against the true training classes
print("Logistic loss of our prediction is: ", log_loss(true_group, prediction), sep="\n")

Logistic loss of our prediction is: 
2.42786222642


# Prepare the submission
First create a matrix for the predictions of the test set.

In [194]:
# One row per test device, one column per group
prediction = np.zeros((len(test.device_id), len(groups.index.values)))
# every test device receives the same naive class probabilities;
# broadcasting fills all rows at once instead of looping row by row
prediction[:] = probs_per_group.values[0]

#### Now define a function that prepares a valid submission CSV
It takes the test dataset and the prediction matrix as input.

In [259]:
def prepareOutput(test, pred, label='talkingData'):
    '''
    Writes a valid submission file from the prediction matrix.
    The valid output must look like: 
    device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
    (id, probabilities)

    The file is written to files_dir as <label>_submission.csv. The column
    names are taken from the module-level labelEnc, which must already be fitted.

    Arguments:
    test  - the DataFrame with the device_id's to be tested
    pred  - the prediction matrix with
            pred.shape = (len(test.device_id), len(unique groups))
    label - prefix of the submission file
    
    Return:
    The merged submission dataset is returned.

    Raises:
    ValueError - if pred does not contain exactly one row per test device.
    '''
    p = pd.DataFrame(pred)
    # A row-count mismatch would otherwise be NaN-padded silently by the
    # column-wise concat below and end up in the submission file.
    if len(p) != len(test):
        raise ValueError(
            'pred has %d rows but test has %d device_ids' % (len(p), len(test))
        )
    # Column positions 0..k-1 are the encoded classes; map them back to group names.
    p.columns = labelEnc.inverse_transform(p.columns)
    # .values drops the index of test so both frames align on a fresh 0..n-1 index.
    i = pd.DataFrame({'device_id': test.device_id.values})
    merged = pd.concat([i, p], axis=1)
    merged.to_csv(files_dir+label+'_submission.csv', index=False)
    return merged

In [260]:
o = prepareOutput(test,prediction,'entriesPerClass')
%ls files/

app_events.csv                  label_categories.csv
app_labels.csv                  phone_brand_device_model.csv
entriesPerClass_submission.csv  phone_brand_device_model_engl.csv
events.csv                      sample_submission.csv
events_day_hour.csv             traintest_phone.csv
gender_age_test.csv             traintest_phone_evts.csv
gender_age_train.csv


In [261]:
# Peek at the first two rows of the submission
o.head(n=2)

Unnamed: 0,device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
0,1002079943728939269,0.067654,0.056132,0.041771,0.062,0.074499,0.056186,0.100315,0.128676,0.072945,0.097917,0.126948,0.114957
1,-1547860181818787117,0.067654,0.056132,0.041771,0.062,0.074499,0.056186,0.100315,0.128676,0.072945,0.097917,0.126948,0.114957


#### This worked. The output can be submitted to Kaggle.