In [1]:
#basic python packages for plotting and array management
import numpy as np
import matplotlib.pyplot as plt
import random

#for data import
import pandas as pd
from __future__ import division
import os
import sys
from importlib import reload
reload(sys)

#Preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

#Classifiers
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier

#Ensemble methods and crossvalidation
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier,RandomForestClassifier, VotingClassifier

#PostProcessing
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import time

# Loading DATA

In [22]:
# Read the five CSV tables from the current working directory, in the same
# order as before: customers, products, train features, test features, train labels.
customers, products, X_train, X_test, y_train = (
    pd.read_csv(csv_name)
    for csv_name in (
        "customers.csv",
        "products.csv",
        "X_train.csv",
        "X_test.csv",
        "y_train.csv",
    )
)

# Understanding the data

In [26]:
# Report the size of the train/test order tables.
n_train_orders, n_train_features = X_train.shape
n_test_orders = X_test.shape[0]
print("We will train our algorithm based on a set of %d orders, each with %d features."%(n_train_orders, n_train_features))
print("Then we will test it on a set of %d orders with the same number of features."%(n_test_orders))

# For every table: number of columns, then the column names.
for message, table in (
    ("\nThe customers set consists of %d labels:", customers),
    ("\nThe products set consists of %d labels:", products),
    ("\nThe x_training set consists of %d labels:", X_train),
    ("\nThe y_training set consists of %d labels:", y_train),
):
    print(message % (table.shape[1]))
    print(table.columns)

# Class balance of the target column (0 = not returned, 1 = returned).
return_flag = y_train['ReturnQuantityBin']
n_not_returned = (return_flag == 0).sum()
n_returned = (return_flag == 1).sum()
print("\nThere are exactly %d orders not returned and %d orders returned" %(n_not_returned, n_returned))

We will train our algorithm based on a set of 1067290 orders, each with 20 features.
Then we will test it on a set of 800468 orders with the same number of features.

The customers set consists of 5 labels:
Index(['CustomerId', 'CountryISOCode', 'BirthDate', 'Gender',
       'FirstOrderDate'],
      dtype='object')

The products set consists of 25 labels:
Index(['VariantId', 'GenderLabel', 'MarketTargetLabel', 'SeasonLabel',
       'SeasonalityLabel', 'BrandId', 'UniverseLabel', 'TypeBrand',
       'ProductId', 'ProductType', 'SupplierColor', 'ProductColorId',
       'MinSize', 'MaxSize', 'CalfTurn', 'UpperHeight', 'HeelHeight',
       'PurchasePriceHT', 'IsNewCollection', 'SubtypeLabel',
       'UpperMaterialLabel', 'LiningMaterialLabel', 'OutSoleMaterialLabel',
       'RemovableSole', 'SizeAdviceDescription'],
      dtype='object')

The x_training set consists of 20 labels:
Index(['OrderNumber', 'VariantId', 'LineItem', 'CustomerId',
       'OrderStatusLabel', 'OrderTypelabel', 'Seas

In [27]:
print(type(X_train))
# Pearson correlation is only defined for numeric data. Older pandas silently
# dropped the text columns of `products`; pandas >= 2.0 raises on them instead
# (numeric_only now defaults to False). Selecting numeric and boolean columns
# explicitly gives the same matrix on both old and new versions.
C = products.select_dtypes(include=["number", "bool"]).corr()
print(C)

<class 'pandas.core.frame.DataFrame'>
                 VariantId   BrandId  ProductId  ProductColorId   MinSize  \
VariantId         1.000000 -0.016766   0.783185        0.990998 -0.019901   
BrandId          -0.016766  1.000000   0.102005       -0.010289  0.071875   
ProductId         0.783185  0.102005   1.000000        0.791826 -0.012969   
ProductColorId    0.990998 -0.010289   0.791826        1.000000 -0.018809   
MinSize          -0.019901  0.071875  -0.012969       -0.018809  1.000000   
MaxSize          -0.019533  0.071157  -0.013154       -0.018699  0.996423   
CalfTurn          0.308736  0.173388   0.611093        0.314068  0.013149   
UpperHeight       0.090000  0.119543   0.224235        0.092973  0.024932   
HeelHeight       -0.086091  0.178323   0.012220       -0.081672  0.109033   
PurchasePriceHT  -0.036490 -0.031441  -0.008504       -0.034820  0.078680   
IsNewCollection   0.404488 -0.003701   0.316906        0.389347 -0.015657   

                  MaxSize  CalfTurn  

# Defining a feature transformation

In [24]:
# The 20 order-level column names, one per line.
# NOTE(review): appears to mirror X_train.columns; the list is not referenced
# by any other cell visible in this notebook -- confirm whether it is still needed.
features = [
    "OrderNumber",
    "VariantId",
    "LineItem",
    "CustomerId",
    "OrderStatusLabel",
    "OrderTypelabel",
    "SeasonLabel",
    "PayementModeLabel",
    "CustomerTypeLabel",
    "IsoCode",
    "DeviceTypeLabel",
    "PricingTypeLabel",
    "TotalLineItems",
    "Quantity",
    "UnitPMPEUR",
    "OrderCreationDate",
    "OrderShipDate",
    "OrderNumCustomer",
    "IsOnSale",
    "BillingPostalCode",
]

In [7]:
# Columns removed from the order tables before modelling (passed to funk_mask
# as `masked_features`). "OrderCreationDate" used to be listed twice; the
# duplicate is dropped -- the list is only used for membership tests, so the
# masking result is unchanged.
F = ["OrderCreationDate","OrderNumber","VariantId", "CustomerId","OrderShipDate","BillingPostalCode"]

In [None]:
def funk_mask(d,masked_features):
    """Build a purely numeric feature table from a raw order table.

    Steps, in order:
      1. drop every column listed in ``masked_features``;
      2. if a non-numeric ``UnitPMPEUR`` column remains, parse it as float64
         after replacing the decimal comma with a dot ("12,5" -> 12.5);
      3. one-hot encode every remaining object-dtype column with
         ``pd.get_dummies``;
      4. replace all remaining missing values with 0.

    Parameters
    ----------
    d : pandas.DataFrame
        Input table. It is not modified.
    masked_features : collection of str
        Column names to exclude. Names absent from ``d`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Non-object columns first (original order), followed by the dummy columns.

    Raises
    ------
    ValueError
        If a ``UnitPMPEUR`` entry cannot be parsed as a number.
    """
    price_column = "UnitPMPEUR"

    kept_columns = [xx for xx in d.columns if xx not in masked_features]
    # Work on an explicit copy so the column assignment below can never write
    # into (or warn about) the caller's frame.
    X1 = d.loc[:, kept_columns].copy()

    # Only parse the price when it is present and still text: the column may
    # have been masked, or already read as numeric.
    if price_column in X1.columns and not pd.api.types.is_numeric_dtype(X1[price_column]):
        # Vectorised replacement for the former map(np.float64, map(...)):
        # a lazy `map` object is not a valid column value on Python 3, and the
        # string accessor leaves missing prices as NaN instead of crashing.
        as_text = X1[price_column].str.replace(",", ".", regex=False)
        X1[price_column] = pd.to_numeric(as_text).astype(np.float64)

    columns2bin = [x for x in X1.columns if X1[x].dtype == np.dtype('O')]
    if columns2bin:
        X2 = pd.get_dummies(X1.loc[:, columns2bin])
        X1 = X1.loc[:, [xx for xx in X1.columns if xx not in columns2bin]]
        res = pd.concat([X1, X2], axis=1)
    else:
        # Nothing to encode: skip get_dummies on an empty column selection.
        res = X1

    return res.fillna(0)

# Applying the mask

In [8]:
# Encode the train and test tables the same way, then keep only the columns
# produced for both. One-hot encoding can create a column on one side only
# when a category value never occurs on the other.
x1, x2 = (funk_mask(raw_orders, F) for raw_orders in (X_train, X_test))

# np.intersect1d returns the shared names sorted, so both frames end up with
# identical columns in identical order.
seleckt_columns = np.intersect1d(x1.columns, x2.columns)
x1, x2 = x1.loc[:, seleckt_columns], x2.loc[:, seleckt_columns]

# Supervised learning : Random forest model

In [13]:
# Fit on a fixed-size head of the training table to keep the run time short.
N_TRAIN_ROWS = 50000
# Fixed seed: without it every run grows different trees, so neither the
# cross-validation score nor the submitted probabilities are reproducible.
SEED = 0

x_fit = x1.iloc[:N_TRAIN_ROWS]
y_fit = y_train.ReturnQuantityBin.iloc[:N_TRAIN_ROWS]

clf = RandomForestClassifier(n_estimators=10, random_state=SEED)
clf.fit(x_fit, y_fit)

# Class probabilities for every test order, with columns in the order used for fitting.
y_tosubmit = clf.predict_proba(x2.loc[:,x1.columns])

# 10-fold cross-validated score on the same subset (the estimator's default
# scorer; cross_val_score fits clones, so `clf` itself stays as fitted above).
score_forest = np.mean(cross_val_score(clf, x_fit, y_fit, cv=10))
print(score_forest)

0.780679934205


# Score of our prediction on the training set

In [66]:
# ROC AUC on the head of the training table.
# Features and labels are now cut with the same positional slice. The previous
# label-based `.loc[:100000]` is end-inclusive (100001 rows) and only matched
# the labels through a compensating `.iloc[:100001]`, which silently breaks as
# soon as the index is not the default 0..n-1 range.
# NOTE: the first 50000 of these rows were used to fit `clf`, so this score is
# optimistic and is not an estimate of performance on unseen orders.
n_eval = 100000
yres = clf.predict_proba(x1.iloc[:n_eval])
roc_auc_score(y_train.ReturnQuantityBin.iloc[:n_eval], yres[:,1])

# Submission to the system
# Second column of predict_proba = probability of the second class in
# clf.classes_ (the "returned" label 1 when the target is coded 0/1).
# Written one value per line with six decimals.
returned_proba = y_tosubmit[:, 1]
np.savetxt('y_pred.txt', returned_proba, fmt='%f')