In [6]:
import warnings
from sklearn import preprocessing

import pandas as pd

import pylab as pl
import numpy as np

import matplotlib.pyplot as plt
from datetime import datetime

from sklearn import preprocessing
from sklearn import cross_validation

%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [7]:
def read_training_data(data_files):
    
    age_gender = pd.read_csv(data_files['age_gender'])
    countries = pd.read_csv(data_files['countries'])
    sessions = pd.read_csv(data_files['sessions'])
    train_users = pd.read_csv(data_files['users'])
    
    return age_gender, countries, sessions, train_users

#def

In [8]:
data_files = {'age_gender':'data/age_gender_bkts.csv',
'countries':'data/countries.csv',
'sessions':'data/sessions.csv',
'users':'data/train_users_2.csv'}

In [9]:
def clean_up(dataframe):
    
    dataframe.replace("-unknown-", np.nan, inplace = True)
    new_timestamp = dataframe.timestamp_first_active.astype(str)
    dates = pd.to_datetime(pd.Series([datetime.strptime(date, '%Y%m%d%H%M%S') for date in new_timestamp]))
    
    #easiness in using datetime objects
    dataframe['timestamp_first_active'] = dates
    dataframe['date_account_created'] = pd.to_datetime(dataframe.date_account_created)
    dataframe['date_first_booking'] = pd.to_datetime(dataframe.date_first_booking)
    dataframe.rename(columns = {'id':'user_id'}, inplace = True)
    df = dataframe.merge(sessions, on = 'user_id', how = 'left')
    user_ids = df['user_id']
    df_2 = df.drop('user_id', axis = 1)
    
    #sepparating in order to have numerical values for each
    
    #date_account_created
    df_2['dac_year'] = df_2.date_account_created.dt.year
    df_2['dac_month'] = df_2.date_account_created.dt.month
    df_2['dac_day'] = df_2.date_account_created.dt.day

    #timestamp_first_active
    df_2['tfa_year'] = df_2.timestamp_first_active.dt.year
    df_2['tfa_month'] = df_2.timestamp_first_active.dt.month
    df_2['tfa_day'] = df_2.timestamp_first_active.dt.day

    #date_first_booking
    df_2['dfb_year'] = df_2.date_first_booking.dt.year
    df_2['dfb_month'] = df_2.date_first_booking.dt.month
    df_2['dfb_day'] = df_2.date_first_booking.dt.day
    df_2 = df_2.drop(['date_account_created','timestamp_first_active','date_first_booking'],axis = 1)
    
    if 'country_destination' in list(df_2.columns):

        labels = df_2.country_destination
        df_2 = df_2.drop('country_destination', axis = 1)
        
        return df_2, labels, user_ids
        
    else:
         
        return df_2, user_ids
    
    #if

#def

def encode_df(df_in,feature_list):
    
    #encoding all the non-numerical variables
    
    le = preprocessing.LabelEncoder()
    
    for item in feature_list:
        le.fit(df_in[item])
        encoded = le.transform(df_in[item])
        df_in[item] = encoded
    #for
    
    df_in = df_in.fillna(-1)
    
    return df_in

#def

def preprocessing_df(df_in,mode='train',normalizers=None):
    
    #
    # normalising the rest of the variables
    #
    
    if mode == 'train':
        normalizers = {}
        for item in list(df_in.columns):
            scaler = preprocessing.MinMaxScaler()
            df_in[item] = scaler.fit_transform(df_in[item])
            normalizers[item] = scaler
        #for
    elif mode == 'test':
        for item in list(df_in.columns):
            df_in[item] = normalizers[item].transform(df_in[item])
        #for
    else:
        print 'Mode not defined {}'.format(mode)
    #if
        
    return df_in, normalizers
    
#def

In [10]:
feature_list = ['gender','signup_method','language','affiliate_channel',
         'affiliate_provider','first_affiliate_tracked','affiliate_provider',
         'first_affiliate_tracked','signup_app','first_device_type','first_browser','action',
        'action_type','action_detail','device_type']

In [14]:
age_gender, countries, sessions, train_users = read_training_data(data_files)

In [15]:
import warnings
warnings.filterwarnings('ignore')

train_set, train_labels, train_user_id = clean_up(train_users)

In [16]:
train_set = encode_df(train_set, feature_list)

In [17]:
train_set, normalizers = preprocessing_df(train_set,mode='train',normalizers=None)

In [21]:
encoded_dummies = pd.get_dummies(train_labels)
#encoded_dummies = pd.get_dummies(train_labels).as_matrix()

In [22]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_set, encoded_dummies, test_size=0.3, random_state=0)

In [23]:
print "training: %i, %i" % (X_train.shape[0],y_train.shape[0])
print "test: %i, %i" % (X_test.shape[0],y_test.shape[0])

training: 3974315, 3974315
test: 1703278, 1703278


In [24]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [25]:
clf = OneVsRestClassifier(RandomForestClassifier(random_state=0))

In [26]:
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
          n_jobs=1)

In [27]:
clf.predict(X_test)

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [28]:
clf.score(X_test, y_test)

0.99241932321089099

In [None]:
x_test_probabilities = clf.predict_proba(X_test)

In [46]:
x_test_probabilities = pd.DataFrame(x_test_probabilities, columns = encoded_dummies.columns)
x_test_probabilities.head()

Unnamed: 0,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,US,other
0,0,0,0,0.0,0.0,0,0,0,0.0,0,1.0,0.0
1,0,0,0,0.4,0.1,0,0,0,0.1,0,0.7,0.3
2,0,0,0,0.0,0.0,0,0,1,0.0,0,0.0,0.0
3,0,0,0,0.0,0.0,0,0,0,0.0,0,1.0,0.0
4,0,0,0,0.0,0.0,0,0,1,0.0,0,0.0,0.0


In [48]:
X_test = X_test.drop('index', axis = 1)

In [49]:
X_test.head()

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,...,secs_elapsed,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,dfb_year,dfb_month,dfb_day
0,0.0,0.0,0,0.0,0.208333,0.714286,0.470588,1.0,0.666667,0.75,...,5.6e-05,1,0.363636,0.966667,1,0.363636,0.966667,0.999504,0.538462,0.65625
1,0.0,0.0,0,0.0,0.208333,0.285714,0.235294,1.0,0.666667,0.375,...,0.389233,1,0.363636,0.6,1,0.363636,0.6,0.999504,0.461538,0.625
2,0.0,0.0,0,0.96,0.208333,0.285714,0.235294,1.0,0.333333,0.5,...,0.000118,1,0.090909,0.7,1,0.090909,0.7,0.0,0.0,0.0
3,0.333333,0.014888,0,0.0,0.208333,0.285714,0.235294,1.0,0.666667,0.75,...,0.000434,1,0.363636,0.9,1,0.363636,0.9,1.0,0.384615,0.875
4,0.0,0.0,0,0.0,0.208333,0.714286,0.470588,0.571429,0.666667,0.375,...,4.1e-05,1,0.090909,0.7,1,0.090909,0.7,0.0,0.0,0.0


In [50]:
print X_test.shape[0]
print x_test_probabilities.shape[0]


1703278
1703278


In [54]:
df = pd.concat([X_test, x_test_probabilities], axis=1)

In [60]:
df['user_id'] = train_user_id
df_to_use = df[['user_id','AU','CA','DE','ES','FR','GB','IT','NDF','NL','PT','US','other']]
df_to_use.head()

Unnamed: 0,user_id,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,US,other
0,gxn3p5htnn,0,0,0,0.0,0.0,0,0,0,0.0,0,1.0,0.0
1,820tgsjxq7,0,0,0,0.4,0.1,0,0,0,0.1,0,0.7,0.3
2,4ft3gnwmtx,0,0,0,0.0,0.0,0,0,1,0.0,0,0.0,0.0
3,bjjt8pjhuk,0,0,0,0.0,0.0,0,0,0,0.0,0,1.0,0.0
4,87mebub9p4,0,0,0,0.0,0.0,0,0,1,0.0,0,0.0,0.0


In [99]:
df_averages = df_to_use.groupby('user_id').mean()


final_df = np.argsort(-df_averages).reset_index()
final_df.head()
# for user_id, row in df_averages.head().iterrows():
#     x = row.to_dict()
#     print {x for key,value in x.iteritems}



Unnamed: 0,user_id,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,US,other
0,0005ytdols,7,0,1,2,3,4,5,6,8,9,10,11
1,000guo2307,7,0,1,2,3,4,5,6,8,9,10,11
2,000wc9mlv3,7,0,1,2,3,4,5,6,8,9,10,11
3,0012yo8hu2,10,0,1,2,3,4,5,6,7,8,9,11
4,001357912w,7,0,1,2,3,4,5,6,8,9,10,11


In [122]:
submission = {}
for index, row in final_df.head().iterrows():
    country = {}
    country['AU']=row['AU']
    country['CA'] = row['CA']
    country['DE'] = row['DE']
    country['ES'] = row['ES']
    country['FR'] = row['FR']
    country['GB'] = row['GB']
    
    submission[row['user_id']] = country
    print submission

{'0005ytdols': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}}
{'0005ytdols': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '000guo2307': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}}
{'0005ytdols': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '000guo2307': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '000wc9mlv3': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}}
{'0005ytdols': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '0012yo8hu2': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 10, 'GB': 4, 'ES': 2}, '000guo2307': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '000wc9mlv3': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}}
{'0005ytdols': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '001357912w': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '0012yo8hu2': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 10, 'GB': 4, 'ES': 2}, '000guo2307': {'FR': 3, 'CA': 0, 'DE': 1, 'AU': 7, 'GB': 4, 'ES': 2}, '000wc9mlv3':

In [None]:
submission = DataFrame(columns=["id", "country"])

# sort countries according to most probable destination country 
for key in country_df['country'].value_counts().index:
    submission = pd.concat([submission, country_df[country_df["country"] == key]], ignore_index=True)

In [None]:
x_test_probs = clf.predict_proba(X_test)

In [None]:
np.argsort(-x_test_probs)[110:120]

In [None]:
y_test_pseudo = np.where(y_test)

In [None]:
np.histogram(y_test_pseudo[:][1])

In [None]:
print encoded_train_labels[1029]
print encoded_train_labels[46]
print len(encoded_train_labels[1029])

In [None]:
# import warnings
# warnings.filterwarnings('ignore')

# test_users = pd.read_csv('data/test_users.csv')

In [None]:
# test_set = clean_up(test_users)
# test_set = encode_df(test_set)
# test_set, _ = preprocessing_df(train_set,mode=='test',normalizers=normalizers)

# Process Again
****

In [None]:
print train_users.shape[0], train_set.shape[0]

In [None]:
test_users = pd.read_csv('data/test_users.csv')
test_set, test_user_id = clean_up(test_users)
test_set = encode_df(test_set)
test_set, _ = preprocessing_df(train_set, mode = 'test', normalizers = normalizers)

In [None]:
clf = OneVsRestClassifier(RandomForestClassifier(random_state=0))
clf.fit(train_set, encoded_dummies)
test_predictions = clf.predict_proba(test_set)

In [None]:
test_predictions.head()