In [5]:
import warnings
from sklearn import preprocessing

import pandas as pd

import pylab as pl
import numpy as np

import matplotlib.pyplot as plt
from datetime import datetime

from sklearn import preprocessing
from sklearn import cross_validation

%matplotlib inline
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [4]:
def read_training_data(data_files):
    """Load the four raw CSV tables used by the notebook.

    data_files maps the keys 'age_gender', 'countries', 'sessions' and
    'users' to CSV paths. The tables are read in that order and returned
    as a 4-tuple of DataFrames in the same order.
    """
    table_keys = ('age_gender', 'countries', 'sessions', 'users')
    return tuple(pd.read_csv(data_files[key]) for key in table_keys)

#def

In [6]:
# Path of every raw CSV, keyed by the names read_training_data looks up.
data_files = dict(
    age_gender='data/age_gender_bkts.csv',
    countries='data/countries.csv',
    sessions='data/sessions.csv',
    users='data/train_users_2.csv',
)

In [7]:
def clean_up(dataframe, sessions_df=None):
    """Build the per-session feature frame from a raw users table.

    Steps:
      1. replace the "-unknown-" placeholder with NaN;
      2. parse timestamp_first_active (YYYYmmddHHMMSS), date_account_created
         and date_first_booking into datetimes;
      3. rename ``id`` to ``user_id`` and left-join the session rows on it, so
         a user appears once per matching session row;
      4. split every date into <prefix>_year/_month/_day columns and drop the
         original date columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw users table. It is NOT modified; all work happens on a copy, so
        the calling cell can be re-run safely.
    sessions_df : pandas.DataFrame, optional
        Session rows containing a ``user_id`` column. Defaults to the
        notebook-level ``sessions`` frame.

    Returns
    -------
    (features, labels, user_ids) when ``country_destination`` is present,
    otherwise (features, user_ids). All are row-aligned with the merged frame.
    """
    if sessions_df is None:
        # Backward-compatible default: the global loaded by read_training_data.
        sessions_df = sessions

    # replace/rename without inplace=True return new frames, leaving the
    # caller's data untouched.
    users = dataframe.replace("-unknown-", np.nan)
    users = users.rename(columns={'id': 'user_id'})

    # Vectorised parsing keeps the frame's own index. Building a fresh Series
    # from a list would get a 0..n-1 index and misalign (yielding NaT) on any
    # frame whose index is not the default range.
    users['timestamp_first_active'] = pd.to_datetime(
        users['timestamp_first_active'].astype(str), format='%Y%m%d%H%M%S')
    users['date_account_created'] = pd.to_datetime(users['date_account_created'])
    users['date_first_booking'] = pd.to_datetime(users['date_first_booking'])

    merged = users.merge(sessions_df, on='user_id', how='left')
    user_ids = merged['user_id']
    features = merged.drop('user_id', axis=1)

    # Split each date into numeric year/month/day parts.
    # NOTE(review): date_first_booking is presumably only filled for users who
    # actually booked, which would make dfb_* a proxy for the label - TODO
    # confirm before relying on these columns as features.
    date_columns = (
        ('dac', 'date_account_created'),
        ('tfa', 'timestamp_first_active'),
        ('dfb', 'date_first_booking'),
    )
    for prefix, column in date_columns:
        parts = features[column].dt
        features[prefix + '_year'] = parts.year
        features[prefix + '_month'] = parts.month
        features[prefix + '_day'] = parts.day
    features = features.drop([column for _, column in date_columns], axis=1)

    # Sanity check: features and ids must have the same number of rows.
    print('{} {}'.format(features.shape[0], user_ids.shape[0]))

    if 'country_destination' in features.columns:
        labels = features['country_destination']
        return features.drop('country_destination', axis=1), labels, user_ids

    return features, user_ids
    
    #if

#def

def encode_df(df_in, feature_list, encoders=None):
    """Integer-encode the categorical columns and fill missing values with -1.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to encode. It is not modified; a copy is returned.
    feature_list : iterable of str
        Columns to encode. Repeated names are encoded once.
    encoders : dict, optional
        Maps column name -> ordered list of categories. Categories learned
        during this call are stored in it, and categories already present are
        reused. Passing the same dict for the train and the test frame keeps
        the integer codes consistent between them.

    Returns
    -------
    pandas.DataFrame
        Each encoded column holds the position of its value in the sorted
        category list. Missing values, and values absent from a reused
        category list, become -1. Remaining NaNs in other columns are also
        replaced by -1.
    """
    df_out = df_in.copy()
    done = set()

    for item in feature_list:
        if item in done:
            # Re-encoding an already encoded column is wasted work.
            continue
        done.add(item)

        if encoders is not None and item in encoders:
            categories = encoders[item]
        else:
            # Sorted unique non-null values: the same ordering a label encoder
            # uses, but with NaN left out so it maps to -1.
            categories = sorted(df_out[item].dropna().unique())
            if encoders is not None:
                encoders[item] = categories

        codes = pd.Categorical(df_out[item], categories=categories).codes
        df_out[item] = codes.astype('int64')
    #for

    return df_out.fillna(-1)

#def

def preprocessing_df(df_in, mode, normalizers=None):
    """Min-max scale the feature matrix.

    Parameters
    ----------
    df_in : array-like
        Numeric features to scale.
    mode : {'train', 'test'}
        'train' fits a new MinMaxScaler on df_in and applies it.
        'test' applies the already fitted scaler given in ``normalizers``.
    normalizers : fitted scaler, optional
        Required when mode is 'test'; ignored when mode is 'train'.

    Returns
    -------
    mode 'train': (scaled, scaler)
    mode 'test' : scaled

    Raises
    ------
    ValueError
        If mode is unknown, or mode is 'test' and no scaler was supplied.
        Failing here is clearer than returning None and breaking later when
        the caller unpacks or uses the result.
    """
    if mode == 'train':
        scaler = preprocessing.MinMaxScaler().fit(df_in)
        return scaler.transform(df_in), scaler
    elif mode == 'test':
        if normalizers is None:
            raise ValueError(
                "mode 'test' needs the scaler returned by a previous 'train' call")
        return normalizers.transform(df_in)
    else:
        raise ValueError('Mode not defined {}'.format(mode))
    #if
    
#def

In [8]:
# Categorical columns that encode_df turns into integer codes.
# Each column is listed once: 'affiliate_provider' and 'first_affiliate_tracked'
# used to appear twice, which made the encoder process them a second time.
feature_list = ['gender', 'signup_method', 'language', 'affiliate_channel',
                'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
                'first_device_type', 'first_browser', 'action', 'action_type',
                'action_detail', 'device_type']

### Reading all the data
****

In [9]:
# Load the four raw tables. `sessions` is also read as a global inside clean_up,
# so this cell has to run before any clean_up call.
age_gender, countries, sessions, train_users = read_training_data(data_files)

### Preprocessing train data
****


In [10]:
# train_users contains country_destination, so clean_up returns three values:
# the feature frame, the labels and the user ids, all with one row per
# user/session pair (hence far more rows than users).
train_set, train_labels, train_user_id = clean_up(train_users)

5677593 5677593


In [11]:
# Integer-encode the categorical columns of the training features.
# NOTE(review): encode_df is called separately for the train and the test
# frame and each call derives its own value -> code mapping, so the same
# category may receive different codes in the two frames - TODO confirm.
train_set = encode_df(train_set, feature_list)
# Row count check: should equal the count printed by clean_up.
train_set.shape[0]

5677593

In [12]:
# Fit the min-max scaler on the training features. train_set is rebound to the
# scaler's transformed output and `normalizers` keeps the fitted scaler so the
# test features can be scaled with the same parameters later.
train_set, normalizers = preprocessing_df(train_set,'train',normalizers=None)

In [13]:
# One-hot targets as a DataFrame; its column labels are reused further down to
# name the columns of the predicted probabilities.
encoded_dummies = pd.get_dummies(train_labels)

# The same targets as an indicator array, used to fit the final model.
# `preprocessing` comes from the import cell at the top of the notebook.
le = preprocessing.LabelBinarizer()
train_labels_set = le.fit_transform(train_labels)
train_labels_set


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Splitting Training Data - train and CV
****

In [None]:
# Hold out 30% of the rows (fixed seed) as a validation set; the targets are the
# one-hot frame built with pd.get_dummies.
# NOTE(review): train_set has one row per user/session pair after the merge in
# clean_up, so a row-wise split can presumably put rows of the same user on both
# sides - TODO confirm whether a split grouped by user is wanted.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_set, encoded_dummies, test_size=0.3, random_state=0)

In [None]:
# Row counts of features and targets on each side of the split; the two numbers
# on a line must agree.
print("training: %i, %i" % (X_train.shape[0], y_train.shape[0]))
print("test: %i, %i" % (X_test.shape[0], y_test.shape[0]))

### Training phase
****

In [19]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# One-vs-rest wrapper around a random forest; random_state=0 keeps runs repeatable.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=0))

In [None]:
# Fit on the 70% training part of the split.
clf.fit(X_train, y_train)

In [None]:
# Hard predictions for the hold-out rows; displayed only, not stored.
clf.predict(X_test)

In [None]:
# Score on the hold-out rows.
# NOTE(review): with indicator targets scikit-learn presumably reports subset
# accuracy here - TODO confirm for the installed version.
clf.score(X_test, y_test)

In [None]:
# submission = DataFrame(columns=["id", "country"])

# # sort countries according to most probable destination country 
# for key in country_df['country'].value_counts().index:
#     submission = pd.concat([submission, country_df[country_df["country"] == key]], ignore_index=True)

In [None]:
# Per-country probabilities for the hold-out rows: one row per X_test row.
y_pred = clf.predict_proba(X_test)
# The following cells format these probabilities as (id, country) rows.


In [None]:
# Build five (id, country) rows per hold-out row, most probable country first.
#
# y_pred has one row per X_test row, i.e. the 30% hold-out produced by
# train_test_split(..., test_size=0.3, random_state=0) above, not one row per
# entry of train_user_id. Repeating the split on the id column with the same
# arguments selects the same rows in the same order.
_, holdout_user_id = cross_validation.train_test_split(
    train_user_id.values, test_size=0.3, random_state=0)
assert len(holdout_user_id) == len(y_pred), 'ids and predictions are misaligned'

# Column j of y_pred belongs to encoded_dummies.columns[j], because the
# classifier was fitted on targets split from encoded_dummies. Indexing the
# labels directly replaces le.inverse_transform, which on a LabelBinarizer
# expects indicator rows rather than class indices.
top5 = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]

ids = np.repeat(holdout_user_id, 5).tolist()                  # list of ids
cts = np.asarray(encoded_dummies.columns)[top5].ravel().tolist()  # list of countries

In [None]:
# Two-column submission-style frame: each id paired with one candidate country.
id_country_pairs = np.column_stack((ids, cts))
sub = pd.DataFrame(id_country_pairs, columns=['id', 'country'])
sub.head()

# Process Again
****

In [None]:
# Raw user rows versus rows of the processed training matrix.
print('{} {}'.format(train_users.shape[0], train_set.shape[0]))

### Reading Test Data
****

In [14]:
# Load the users to predict for and run them through the same cleaning as the
# training users. The two-value unpacking matches clean_up's return for a frame
# without a country_destination column.
test_users = pd.read_csv('data/test_users.csv')
test_set, test_user_id = clean_up(test_users)


4995712 4995712


In [15]:
# Encode the categorical columns, then scale with the scaler fitted on the
# training features.
# NOTE(review): this encode_df call derives its value -> code mapping from the
# test frame alone, so codes may not match the ones used for training - TODO
# confirm.
test_set = encode_df(test_set, feature_list)
test_set = preprocessing_df(test_set, 'test', normalizers = normalizers)
# Row count check: should equal the count printed by clean_up for the test users.
test_set.shape[0]

4995712

In [20]:
#training on the entire train set
# Refit on the entire training set (no hold-out), using the indicator targets
# produced by the LabelBinarizer.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=0))
clf.fit(train_set, train_labels_set)

# Probability of each country for every test row (one row per user/session pair).
y_pred = clf.predict_proba(test_set)

In [21]:
# Number of prediction rows; should equal the test_set row count shown above.
y_pred.shape[0]

4995712

In [22]:
# Attach the user id to every prediction row (both sides are joined on their row
# index), then average the per-session probabilities into one row per user.
last_df = (
    pd.concat(
        [test_user_id, pd.DataFrame(y_pred, columns = encoded_dummies.columns)],
        axis = 1,
    )
    .groupby('user_id')
    .mean()
    .reset_index()
)

last_df

Unnamed: 0,user_id,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,US,other
0,0010k6l0om,0,0,0,0,0,0,0,1,0,0,0,0
1,0031awlkjq,0,0,0,0,0,0,0,1,0,0,0,0
2,00378ocvlh,0,0,0,0,0,0,0,1,0,0,0,0
3,0048rkdgb1,0,0,0,0,0,0,0,1,0,0,0,0
4,0057snrdpu,0,0,0,0,0,0,0,1,0,0,0,0
5,005v5uf4dh,0,0,0,0,0,0,0,1,0,0,0,0
6,0063bawn05,0,0,0,0,0,0,0,1,0,0,0,0
7,006ml14zc1,0,0,0,0,0,0,0,1,0,0,0,0
8,0075z9e9xv,0,0,0,0,0,0,0,1,0,0,0,0
9,00an0o6c07,0,0,0,0,0,0,0,1,0,0,0,0


In [25]:
import operator

# For every user keep the five (country, probability) pairs with the highest
# averaged probability, best first. `result` maps user_id -> that list.
ids = []
result = {}
by_probability = operator.itemgetter(1)
for _, user_row in last_df.iterrows():
    # The first entry of the row is user_id; the rest are the country columns.
    ranked = sorted(user_row[1:].to_dict().items(), key=by_probability, reverse = True)
    result[user_row[0]] = ranked[:5]

In [26]:
# Transpose so that each row is a user and columns 0..4 hold that user's
# (country, probability) pairs from best to fifth best.
some_other_df = pd.DataFrame(result).T
some_other_df.head()

Unnamed: 0,0,1,2,3,4
0010k6l0om,"(NDF, 1.0)","(FR, 0.0)","(NL, 0.0)","(PT, 0.0)","(CA, 0.0)"
0031awlkjq,"(NDF, 1.0)","(FR, 0.0)","(NL, 0.0)","(PT, 0.0)","(CA, 0.0)"
00378ocvlh,"(NDF, 1.0)","(FR, 0.0)","(NL, 0.0)","(PT, 0.0)","(CA, 0.0)"
0048rkdgb1,"(NDF, 1.0)","(FR, 0.0)","(NL, 0.0)","(PT, 0.0)","(CA, 0.0)"
0057snrdpu,"(NDF, 1.0)","(FR, 0.0)","(NL, 0.0)","(PT, 0.0)","(CA, 0.0)"


In [30]:
# Flatten to one row per (user, rank): after unstack + reset_index, 'level_0' is
# the rank (0 = most probable), 'level_1' the user id and column 0 the
# (country, probability) pair.
ranked = pd.DataFrame(some_other_df.unstack(0))\
.reset_index()\
.sort_values(['level_1','level_0'])

# Write only the country code under explicit id/country headers. Previously the
# file had the headers "level_1,0" and stringified (country, probability) tuples
# in the country column.
submission = pd.DataFrame({
    'id': ranked['level_1'].values,
    'country': [country for country, _ in ranked[0]],
}, columns=['id', 'country'])
submission.to_csv('something.csv', index = False)