In [1]:
#basic python packages for plotting and array management
import numpy as np
import matplotlib.pyplot as plt
import random

#for data import
import pandas as pd
from __future__ import division
import os
import sys
from importlib import reload
reload(sys)

#Preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

#Classifiers
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier

#Ensemble methods and crossvalidation
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier,RandomForestClassifier, VotingClassifier

#PostProcessing
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import time

# Loading DATA

In [2]:
customers = pd.read_csv("customers.csv")
products = pd.read_csv("products.csv")
X_train = pd.read_csv("X_train.csv")
X_test   = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv")

# Comprehension of the Data

In [3]:
print("We will train our algorithm based on a set of %d orders, each with %d features."%(X_train.shape[0],X_train.shape[1]))
print("Then we will test it on a set of %d orders with the same number of features."%(X_test.shape[0]))
print("\nThe customers set consists of %d labels:" %(customers.shape[1]))
print(customers.columns)
print("\nThe products set consists of %d labels:" %(products.shape[1]))
print(products.columns)
print("\nThe x_training set consists of %d labels:" %(X_train.shape[1]))
print(X_train.columns)
print("\nThe y_training set consists of %d labels:" %(y_train.shape[1]))
print(y_train.columns)
print("\nThere are exactly %d orders not returned and %d orders returned" %((y_train['ReturnQuantityBin'] == 0).sum(),(y_train['ReturnQuantityBin'] == 1).sum() ))

We will train our algorithm based on a set of 1067290 orders, each with 20 features.
Then we will test it on a set of 800468 orders with the same number of features.

The customers set consists of 5 labels:
Index(['CustomerId', 'CountryISOCode', 'BirthDate', 'Gender',
       'FirstOrderDate'],
      dtype='object')

The products set consists of 25 labels:
Index(['VariantId', 'GenderLabel', 'MarketTargetLabel', 'SeasonLabel',
       'SeasonalityLabel', 'BrandId', 'UniverseLabel', 'TypeBrand',
       'ProductId', 'ProductType', 'SupplierColor', 'ProductColorId',
       'MinSize', 'MaxSize', 'CalfTurn', 'UpperHeight', 'HeelHeight',
       'PurchasePriceHT', 'IsNewCollection', 'SubtypeLabel',
       'UpperMaterialLabel', 'LiningMaterialLabel', 'OutSoleMaterialLabel',
       'RemovableSole', 'SizeAdviceDescription'],
      dtype='object')

The x_training set consists of 20 labels:
Index(['OrderNumber', 'VariantId', 'LineItem', 'CustomerId',
       'OrderStatusLabel', 'OrderTypelabel', 'Seas

In [4]:
customers.head()

Unnamed: 0,CustomerId,CountryISOCode,BirthDate,Gender,FirstOrderDate
0,14089083.0,SE,1979-02-05 00:00:00,Femme,2013-03-16 23:00:05
1,12862066.0,FR,1982-08-04 00:00:00,Femme,2012-02-14 17:47:33
2,14791699.0,FR,1965-04-02 00:00:00,Femme,2013-10-04 23:10:42
3,10794664.0,FR,1966-04-09 00:00:00,Femme,2010-03-25 18:46:59
4,15268576.0,ES,1980-04-22 00:00:00,Femme,2014-03-19 10:48:39


In [11]:
products.head()

Unnamed: 0,VariantId,GenderLabel,MarketTargetLabel,SeasonLabel,SeasonalityLabel,BrandId,UniverseLabel,TypeBrand,ProductId,ProductType,...,UpperHeight,HeelHeight,PurchasePriceHT,IsNewCollection,SubtypeLabel,UpperMaterialLabel,LiningMaterialLabel,OutSoleMaterialLabel,RemovableSole,SizeAdviceDescription
0,728257.0,Homme,Classique,Automne/Hiver,Saisonnier,66.0,DÃ©tente,Standard,17267.0,Baskets,...,,0.0,30.5,0.0,Montantes,,,,False,Prenez une taille en dessous de votre pointure...
1,806356.0,Femme,ND,Automne/Hiver,Saisonnier,842.0,ND,Standard,30824.0,Baskets,...,0.0,0.0,43.0,0.0,Montantes,,,,True,Prenez votre pointure habituelle
2,768790.0,Femme,ND,Automne/Hiver,Reconduit,988.0,Ville,Standard,62475.0,Bottines et boots,...,6.0,3.0,54.9,0.0,Bout pointu,,,,False,Prenez votre pointure habituelle
3,515679.0,Femme,ND,Automne/Hiver,Saisonnier,769.0,Ville,Standard,43983.0,Escarpins,...,0.0,13.0,34.5,0.0,Bout rond,,,,False,Prenez votre pointure habituelle
4,1025246.0,Femme,ND,Automne/Hiver,Saisonnier,1244.0,ND,Standard,81493.0,Bottines et boots,...,8.0,4.0,43.76,0.0,Bout rond,,,,False,Prenez votre pointure habituelle


In [4]:
index = pd.DataFrame({'index':np.arange(X_train.shape[0])})
X_train['index']=index # Evite les duplicata
X_train.head()

Unnamed: 0,OrderNumber,VariantId,LineItem,CustomerId,OrderStatusLabel,OrderTypelabel,SeasonLabel,PayementModeLabel,CustomerTypeLabel,IsoCode,...,PricingTypeLabel,TotalLineItems,Quantity,UnitPMPEUR,OrderCreationDate,OrderShipDate,OrderNumCustomer,IsOnSale,BillingPostalCode,index
0,73521754,439729,1,12443972,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,Plein Tarif,2,1,5264,2011-10-26 12:10:48,2011-10-26 18:27:00,1,0.0,87000,0
1,73521754,440174,2,12443972,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,Plein Tarif,2,1,5264,2011-10-26 12:10:48,2011-10-26 18:27:00,1,0.0,87000,1
2,73525226,494501,1,12443958,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,Plein Tarif,1,1,1317,2011-10-26 12:11:38,2011-10-26 17:48:00,1,0.0,77700,2
3,73529009,439590,1,12443946,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,Plein Tarif,2,1,564,2011-10-26 12:13:09,2011-10-26 17:59:00,1,0.0,44600,3
4,73529009,559476,2,12443946,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,Plein Tarif,2,1,37,2011-10-26 12:13:09,2011-10-26 17:59:00,1,0.0,44600,4


In [5]:
index = pd.DataFrame({'index':np.arange(y_train.shape[0])})
y_train['index']=index # Evite les duplicata, va servir pour le join avec X
y_train.head()

Unnamed: 0,OrderNumber,LineItem,ReturnQuantity,ReturnQuantityBin,index
0,73521754,1,0.0,0.0,0
1,73521754,2,0.0,0.0,1
2,73525226,1,1.0,1.0,2
3,73529009,1,1.0,1.0,3
4,73529009,2,1.0,1.0,4


In [13]:
X_Product = pd.merge(X_train,products,on='VariantId')
X_Product = X_Product.sort_values(by='index')
X_Product = X_Product[:100000] #Reduce computing
X_Product.head()

Unnamed: 0,OrderNumber,VariantId,LineItem,CustomerId,OrderStatusLabel,OrderTypelabel,SeasonLabel_x,PayementModeLabel,CustomerTypeLabel,IsoCode,...,UpperHeight,HeelHeight,PurchasePriceHT,IsNewCollection,SubtypeLabel,UpperMaterialLabel,LiningMaterialLabel,OutSoleMaterialLabel,RemovableSole,SizeAdviceDescription
0,73525226,494501,1,12443958,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,12.0,8.0,15.5,0.0,Bout rond,,,,False,Prenez votre pointure habituelle
5,73529009,559476,2,12443946,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,5.0,9.0,37.0,0.0,Bout rond,Cuir,,,False,Prenez votre pointure habituelle
12,73546547,536093,1,12443986,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,0.0,3.0,39.110001,0.0,Bout rond,Cuir,,,False,Prenez votre pointure habituelle
13,73572913,523440,1,10657544,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Fidélisé,FR,...,0.0,9.0,49.619999,0.0,,,,,False,"ModÃ¨le Ã©troit, convient aux pieds fins"
15,73609935,497021,2,12444010,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Nouveau,FR,...,8.0,8.0,21.9,0.0,Talon compensÃ©,,,,False,Prenez votre pointure habituelle


In [14]:
y_Product = pd.merge(y_train,X_Product,on="index")
y_Product = y_Product.sort_values(by='index')
y_Product = y_Product[:100000]
y_Product.head()

Unnamed: 0,OrderNumber_x,LineItem_x,ReturnQuantity,ReturnQuantityBin,index,OrderNumber_y,VariantId,LineItem_y,CustomerId,OrderStatusLabel,...,UpperHeight,HeelHeight,PurchasePriceHT,IsNewCollection,SubtypeLabel,UpperMaterialLabel,LiningMaterialLabel,OutSoleMaterialLabel,RemovableSole,SizeAdviceDescription
0,73525226,1,1.0,1.0,2,73525226,494501,1,12443958,Expédié,...,12.0,8.0,15.5,0.0,Bout rond,,,,False,Prenez votre pointure habituelle
1,73529009,2,1.0,1.0,4,73529009,559476,2,12443946,Expédié,...,5.0,9.0,37.0,0.0,Bout rond,Cuir,,,False,Prenez votre pointure habituelle
2,73546547,1,0.0,0.0,7,73546547,536093,1,12443986,Expédié,...,0.0,3.0,39.110001,0.0,Bout rond,Cuir,,,False,Prenez votre pointure habituelle
3,73572913,1,1.0,1.0,14,73572913,523440,1,10657544,Expédié,...,0.0,9.0,49.619999,0.0,,,,,False,"ModÃ¨le Ã©troit, convient aux pieds fins"
4,73609935,2,1.0,1.0,20,73609935,497021,2,12444010,Expédié,...,8.0,8.0,21.9,0.0,Talon compensÃ©,,,,False,Prenez votre pointure habituelle


In [5]:
print(type(X_train))
C = products.corr()
#print((C))

<class 'pandas.core.frame.DataFrame'>


In [28]:
index = pd.DataFrame({'index':np.arange(X_test.shape[0])})
X_test['index']=index # Evite les duplicata
X_test_Product = pd.merge(X_test,products,on='VariantId')
X_test_Product = X_test_Product.sort_values(by='index')
X_test_Product.head()

Unnamed: 0,OrderNumber,VariantId,LineItem,CustomerId,OrderStatusLabel,OrderTypelabel,SeasonLabel_x,PayementModeLabel,CustomerTypeLabel,IsoCode,...,UpperHeight,HeelHeight,PurchasePriceHT,IsNewCollection,SubtypeLabel,UpperMaterialLabel,LiningMaterialLabel,OutSoleMaterialLabel,RemovableSole,SizeAdviceDescription
0,90621016,1043714,1,16121040,Expédié,DIRECT,Automne/Hiver,Klarna_Invoice,Fidélisé,DE,...,3.0,2.0,32.7,0.0,Bout rond,,,,False,Prenez votre pointure habituelle
7,90641377,1300698,1,16271314,Expédié,DIRECT,Automne/Hiver,PayPal,Nouveau,DE,...,7.0,3.0,35.900002,0.0,Bout rond,,,,True,Prenez une taille en dessous de votre pointure...
9,90652456,1392714,1,16251703,Expédié,DIRECT,Automne/Hiver,BankTransfer_IBAN,Nouveau,ES,...,,2.0,23.98,0.0,Montantes,,,,False,Prenez votre pointure habituelle
12,90653392,542478,3,12503589,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Fidélisé,FR,...,,0.0,18.5,0.0,Basses,,,,False,
22,90653392,1324493,1,12503589,Expédié,DIRECT,Automne/Hiver,Carte bancaire,Fidélisé,FR,...,,0.0,16.75,0.0,Basses,,,,False,Prenez votre pointure habituelle


# Defining a feature transformation

In [11]:
features = ["OrderNumber", "VariantId", "LineItem", "CustomerId",
       "OrderStatusLabel", "OrderTypelabel", "SeasonLabel",
       "PayementModeLabel", "CustomerTypeLabel", "IsoCode", "DeviceTypeLabel",
       "PricingTypeLabel", "TotalLineItems", "Quantity", "UnitPMPEUR",
       "OrderCreationDate", "OrderShipDate", "OrderNumCustomer", "IsOnSale",
       "BillingPostalCode"]

In [17]:
F = ["OrderCreationDate","OrderNumber","VariantId", "CustomerId","OrderCreationDate","OrderShipDate","BillingPostalCode","index","MinSize", "MaxSize", "CalfTurn", "UpperHeight"]

In [18]:
def funk_mask(d,masked_features):
    " Defining a simple mask over the input data "
    columns_ext = masked_features
    X1 = d.loc[:,[xx for xx in d.columns if xx not in columns_ext]]
    print("1")
    g = lambda x: x.replace(",",".")
    print("2")
    X1.UnitPMPEUR = map(np.float64,(map(g,X1.UnitPMPEUR)))
    print("3")
    columns2bin = [x for x in X1.columns if X1[x].dtype == np.dtype('O')]
    print("4")
    X2 = pd.get_dummies(X1.loc[:,columns2bin])
    print("5")
    X1 = X1.loc[:,[xx for xx in X1.columns if xx not in columns2bin]]
    print("6")
    res = pd.concat([X1,X2],axis=1)
    print("7")
    res = res.fillna(0)
    return(res)

# Applying the mask

In [29]:
x1 = funk_mask(X_Product,F)
x2 = funk_mask(X_test_Product,F)
seleckt_columns = np.intersect1d(x1.columns,x2.columns)
x1 = x1.loc[:,seleckt_columns]
x2 = x2.loc[:,seleckt_columns]

1
2
3
4
5
6
7
1
2
3
4
5
6
7


# Supervised learning : Logistic regression model

In [30]:
clf = RandomForestClassifier(n_estimators=10)
clf.fit(x1.iloc[:50000], y_Product.ReturnQuantityBin[:50000])
y_tosubmit = clf.predict_proba(x2.loc[:,x1.columns])
score_forest = np.mean(cross_val_score(clf,x1.iloc[:50000], y_Product.ReturnQuantityBin[:50000],cv=10))
print(score_forest)

0.753480246025


In [31]:
print(y_tosubmit[:,1].shape)
np.savetxt('y_pred.txt', y_tosubmit[:,1], fmt='%f')

(566372,)


# Score of our prediction : on the train

In [23]:
yres = clf.predict_proba(x1.loc[:100000,x1.columns])
roc_auc_score(y_Product.ReturnQuantityBin.iloc[:100000],yres[:,1])

# Submission to the system
np.savetxt('y_pred.txt', y_tosubmit[:,1], fmt='%f')

ValueError: Found input variables with inconsistent numbers of samples: [100000, 67335]