In [1]:
import warnings
from sklearn import preprocessing
from xgboost.sklearn import XGBClassifier

import pandas as pd

import pylab as pl
import numpy as np

import matplotlib.pyplot as plt
from datetime import datetime

from sklearn import preprocessing
from sklearn import cross_validation

%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [2]:
def read_training_data(data_files):
    
    age_gender = pd.read_csv(data_files['age_gender'])
    countries = pd.read_csv(data_files['countries'])
    sessions = pd.read_csv(data_files['sessions'])
    train_users = pd.read_csv(data_files['users'])
    
    return age_gender, countries, sessions, train_users

#def

In [3]:
data_files = {'age_gender':'data/age_gender_bkts.csv',
'countries':'data/countries.csv',
'sessions':'data/sessions.csv',
'users':'data/train_users_2.csv'}

In [4]:
def clean_up(dataframe):

    dataframe.replace("-unknown-", np.nan, inplace = True)
    new_timestamp = dataframe.timestamp_first_active.astype(str)
    dates = pd.to_datetime(pd.Series([datetime.strptime(date, '%Y%m%d%H%M%S') for date in new_timestamp]))
    
    #easiness in using datetime objects
    dataframe['timestamp_first_active'] = dates
    dataframe['date_account_created'] = pd.to_datetime(dataframe.date_account_created)
    dataframe['date_first_booking'] = pd.to_datetime(dataframe.date_first_booking)
    dataframe.rename(columns = {'id':'user_id'}, inplace = True)
    df = dataframe.merge(sessions, on = 'user_id', how = 'left')
    user_ids = df['user_id']
    df_2 = df.drop('user_id', axis = 1)
    
    #sepparating in order to have numerical values for each
    
    #date_account_created
    df_2['dac_year'] = df_2.date_account_created.dt.year
    df_2['dac_month'] = df_2.date_account_created.dt.month
    df_2['dac_day'] = df_2.date_account_created.dt.day

    #timestamp_first_active
    df_2['tfa_year'] = df_2.timestamp_first_active.dt.year
    df_2['tfa_month'] = df_2.timestamp_first_active.dt.month
    df_2['tfa_day'] = df_2.timestamp_first_active.dt.day

    #date_first_booking
#     df_2['dfb_year'] = df_2.date_first_booking.dt.year
#     df_2['dfb_month'] = df_2.date_first_booking.dt.month
#     df_2['dfb_day'] = df_2.date_first_booking.dt.day
    df_2 = df_2.drop(['date_account_created','timestamp_first_active','date_first_booking'],axis = 1)
    
    if 'country_destination' in list(df_2.columns):

        labels = df_2.country_destination
        df_2 = df_2.drop('country_destination', axis = 1)
        
        print df_2.shape[0], user_ids.shape[0] 
        return df_2, labels, user_ids
        
    else:
        print df_2.shape[0], user_ids.shape[0] 
        return df_2, user_ids
    
    #if

#def

def encode_df(df_in,feature_list):
    
    #encoding all the non-numerical variables
    
    le = preprocessing.LabelEncoder()
    
    for item in feature_list:
        le.fit(df_in[item])
        encoded = le.transform(df_in[item])
        df_in[item] = encoded
    #for
    
    df_in = df_in.fillna(-1)
    
    return df_in

#def

def preprocessing_df(df_in,mode,normalizers=None):
    
    #
    # normalising the rest of the variables
    #
    
    if mode == 'train':
        scaler = preprocessing.MinMaxScaler().fit(df_in)
        df_in= scaler.transform(df_in)
        normalizers = scaler
        return df_in, normalizers
    elif mode == 'test':
        df_in = normalizers.transform(df_in)
        return df_in
    else:
        print 'Mode not defined {}'.format(mode)
    #if
    
#def

In [5]:
feature_list = ['gender','signup_method','language','affiliate_channel',
         'affiliate_provider','first_affiliate_tracked','affiliate_provider',
         'first_affiliate_tracked','signup_app','first_device_type','first_browser','action',
        'action_type','action_detail','device_type']

### Reading all the data
****

In [6]:
age_gender, countries, sessions, train_users = read_training_data(data_files)

### Preprocessing train data
****


In [7]:
train_set, train_labels, train_user_id = clean_up(train_users)

5677593 5677593


In [8]:
train_set = encode_df(train_set, feature_list)
train_set.shape[0]

5677593

In [9]:
train_set, normalizers = preprocessing_df(train_set,'train',normalizers=None)

In [10]:
encoded_dummies = pd.get_dummies(train_labels)
#encoded_dummies = pd.get_dummies(train_labels).as_matrix()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
train_labels_set = le.fit_transform(train_labels)
train_labels_set


array([ 7,  7, 10, ...,  7,  7,  7])

### Splitting Training Data - train and CV
****

In [15]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_set, train_labels_set, test_size=0.3, random_state=0)

In [16]:
print "training: %i, %i" % (X_train.shape[0],y_train.shape[0])
print "test: %i, %i" % (X_test.shape[0],y_test.shape[0])

training: 3974315, 3974315
test: 1703278, 1703278


### Training phase
****

In [14]:
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)

In [None]:
xgb.fit(X_train, y_train)

In [None]:
y_pred = xgb.predict(X_test)

In [None]:
xgb.score(X_test, y_test)

In [None]:
clf.estimator.

In [None]:
from sklearn import metrics

print metrics.f1_score(y_test, y_pred)
print metrics.accuracy_score(y_test, y_pred)

In [None]:
# submission = DataFrame(columns=["id", "country"])

# # sort countries according to most probable destination country 
# for key in country_df['country'].value_counts().index:
#     submission = pd.concat([submission, country_df[country_df["country"] == key]], ignore_index=True)

# Process Again
****

In [18]:
print train_users.shape[0], train_set.shape[0]

213451 5677593


### Reading Test Data
****

In [11]:
test_users = pd.read_csv('data/test_users.csv')
test_set, test_user_id = clean_up(test_users)


4995712 4995712


In [12]:
test_set = encode_df(test_set, feature_list)
test_set = preprocessing_df(test_set, 'test', normalizers = normalizers)
test_set.shape[0]

4995712

In [13]:
#training on the entire train set
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)
xgb.fit(train_set, train_labels_set)

#predict probabilities of each country


KeyboardInterrupt: 

In [None]:
y_pred = xgb.predict_proba(test_set)

In [None]:
y_pred.shape[0]

In [25]:
last_df = pd.concat([test_user_id,pd.DataFrame(y_pred, columns = encoded_dummies.columns)], axis = 1)
last_df = last_df.groupby('user_id').mean().reset_index()

last_df.head()


Unnamed: 0,user_id,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,US,other
0,0010k6l0om,0.002662,0.003294,0.003383,0.010376,0.14213,0.004775,0.006505,0.660242,0.005481,0.002781,0.126247,0.032122
1,0031awlkjq,0.001934,0.002873,0.003023,0.015783,0.022399,0.017558,0.024505,0.545665,0.005537,0.002944,0.331988,0.02579
2,00378ocvlh,0.003487,0.028285,0.006139,0.0198,0.037462,0.012328,0.029445,0.487668,0.007177,0.004101,0.318048,0.04606
3,0048rkdgb1,0.00133,0.001041,0.001195,0.129885,0.003299,0.001575,0.30682,0.44965,0.001371,0.001294,0.093213,0.009327
4,0057snrdpu,0.001897,0.002445,0.019717,0.005545,0.008332,0.003514,0.417869,0.364591,0.001758,0.001384,0.135747,0.0372


In [26]:
import operator

ids = []
result = {}
for index, row in last_df.iterrows():
    country_values = row[1:].to_dict()
    sorted_vals = sorted(country_values.items(), key=operator.itemgetter(1), reverse = True)[:5]
    result[row[0]]= sorted_vals

In [27]:
some_other_df = pd.DataFrame(result).T
some_other_df.head()

Unnamed: 0,0,1,2,3,4
0010k6l0om,"(NDF, 0.660241723061)","(FR, 0.142130047083)","(US, 0.126246646047)","(other, 0.0321221202612)","(ES, 0.0103762643412)"
0031awlkjq,"(NDF, 0.545664906502)","(US, 0.331988096237)","(other, 0.025790207088)","(IT, 0.0245048329234)","(FR, 0.0223994627595)"
00378ocvlh,"(NDF, 0.487668454647)","(US, 0.318047821522)","(other, 0.04605967924)","(FR, 0.0374619215727)","(IT, 0.0294449161738)"
0048rkdgb1,"(NDF, 0.449649810791)","(IT, 0.306819587946)","(ES, 0.129884615541)","(US, 0.0932128354907)","(other, 0.00932735390961)"
0057snrdpu,"(IT, 0.417868584394)","(NDF, 0.364591240883)","(US, 0.135746642947)","(other, 0.0371996089816)","(DE, 0.0197173804045)"


In [28]:
pd.DataFrame(some_other_df.unstack(0))\
.reset_index()\
.sort_values(['level_1','level_0'])[['level_1',0]].to_csv('something.csv', index = False)

### This returns a score of 84.53% accuracy on the website