In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, RobustScaler, Normalizer, LabelEncoder, LabelBinarizer
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.ensemble  import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.exceptions import UndefinedMetricWarning

# Silence the warning categories that would otherwise flood the cell outputs.
# Filters are registered in the same order as before: Future, Deprecation,
# then scikit-learn's UndefinedMetricWarning.
for _ignored_category in (FutureWarning, DeprecationWarning, UndefinedMetricWarning):
    warnings.filterwarnings(action='ignore', category=_ignored_category)

# --- Load -------------------------------------------------------------------
data_training = pd.read_csv('input_train.csv')
# The label file is ';'-separated; `sep` is passed by keyword because recent
# pandas versions no longer accept it positionally.
label = pd.read_csv('challenge_output_data_training_file_prediction_of_transaction_claims_status.csv', sep=';')
data_test = pd.read_csv('input_test.csv')

# Attach the CLAIM_TYPE label to every training row that has one.
all_data_training = data_training.merge(label, on='ID', how='inner')
claim_order = sorted(all_data_training['CLAIM_TYPE'].unique())

# --- Rebalance --------------------------------------------------------------
# To reduce the class imbalance of the training set, rows whose CLAIM_TYPE is
# '-' are dropped when any of the columns below is missing.
# NOTE(review): '-' is presumably the "no claim" class -- confirm.
_columns_required_for_no_claim = [
    'BUYER_BIRTHDAY_DATE',
    'SELLER_SCORE_AVERAGE',
    'SHIPPING_MODE',
    'SHIPPING_PRICE',
    'PRICECLUB_STATUS',
    'SELLER_SCORE_COUNT',
]
_incomplete_no_claim = (
    (all_data_training['CLAIM_TYPE'] == '-')
    & all_data_training[_columns_required_for_no_claim].isna().any(axis=1)
)
all_data_training = all_data_training[~_incomplete_no_claim]

# --- Filter unwanted outliers -----------------------------------------------
# Comparisons against NaN are False, so rows with a missing birthday or seller
# score are kept here.
_birthday = all_data_training['BUYER_BIRTHDAY_DATE']
_seller_score = all_data_training['SELLER_SCORE_AVERAGE']
_outlier = (
    ((_birthday >= 0) & (_birthday < 1940))
    | (_birthday >= 2000)
    | ((_seller_score >= 0) & (_seller_score < 40))
)
all_data_training = all_data_training[~_outlier]

# --- Encoding ---------------------------------------------------------------
# factorize() returns (integer codes, unique labels); the codes are the model
# target and are positionally aligned with the rows of all_data_training.
factor = pd.factorize(all_data_training['CLAIM_TYPE'])
target_number = factor[0]
target_label = factor[1]
# Code -> label decoder. Built from however many classes factorize() found
# instead of assuming there are exactly 8 of them.
target_vectoriser = np.vectorize(dict(enumerate(target_label)).get)
lb = LabelBinarizer()
all_data_training = all_data_training.drop(["CLAIM_TYPE"], axis=1)

  from numpy.core.umath_tests import inner1d


# PIPELINE

In [2]:
def pre_traitement(data):
    """Turn a raw transactions frame into an all-numeric/boolean feature frame.

    The input frame is left untouched: every transformation is applied to a
    copy, so the function can safely be called more than once on the same frame.

    Steps, in order:
      1. BUYING_DATE is split on '/', and rebuilt as the integer
         ``<second part><first part zero-padded to 2>`` (missing -> -1).
         NOTE(review): presumably 'month/year' -> YYYYMM -- confirm.
      2. Missing numeric values become -1, a missing WARRANTIES_PRICE becomes
         'NON_WARRANTIES', every other missing value becomes 'MISSING'.
      3. Columns are cast to compact integer / boolean dtypes.
      4. Rare levels are merged into 'OTHERS' (SELLER_COUNTRY outside the 10
         most frequent, PRODUCT_TYPE outside the 5 most frequent,
         SELLER_DEPARTMENT > 98, BUYER_DEPARTMENT > 97).
      5. Nominal columns are one-hot encoded, ordinal columns are mapped to
         integer ranks.
      6. ID, WARRANTIES_FLG and the raw nominal columns are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame holding every column referenced in the body.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features.

    Raises
    ------
    ValueError / TypeError
        From ``astype`` when a column still holds a value that cannot be cast
        (e.g. an ordinal level missing from its mapping, or a 'MISSING'
        department).

    Notes
    -----
    The indicator columns depend on the levels observed in ``data`` (including
    the per-frame "most frequent" cut-offs), so two frames processed separately
    are not guaranteed to end up with the same columns.
    """
    # Work on a copy so the caller's frame is never modified.
    data = data.copy()

    # Fixing structural errors: rebuild BUYING_DATE as a sortable integer.
    # Missing dates are filled *before* the integer cast (a NaN cannot be cast).
    date_parts = data['BUYING_DATE'].str.split('/')
    buying_date = date_parts.str[1] + date_parts.str[0].str.zfill(2)
    data['BUYING_DATE'] = pd.to_numeric(buying_date).fillna(-1).astype('int32')

    # Numerical features: -1 marks a missing value.
    for column in ('REGISTRATION_DATE', 'BUYER_BIRTHDAY_DATE', 'SELLER_SCORE_AVERAGE'):
        data[column] = data[column].fillna(-1)

    # WARRANTIES_PRICE: an empty cell is not a missing value, it means
    # "no warranty".
    data['WARRANTIES_PRICE'] = data['WARRANTIES_PRICE'].fillna('NON_WARRANTIES')

    # Categorical features: whatever is still missing gets an explicit level.
    data = data.fillna('MISSING')

    # Data type conversion
    for column in ('BUYER_BIRTHDAY_DATE', 'SELLER_SCORE_AVERAGE', 'REGISTRATION_DATE',
                   'BUYER_DEPARTMENT', 'SELLER_DEPARTMENT'):
        data[column] = data[column].astype(np.int16)
    # Built-in bool: the np.bool alias no longer exists in current NumPy.
    for column in ('CARD_PAYMENT', 'COUPON_PAYMENT', 'RSP_PAYMENT', 'WALLET_PAYMENT'):
        data[column] = data[column].astype(bool)

    # Sparse classes: merge infrequent levels into 'OTHERS'.
    top10 = data['SELLER_COUNTRY'].value_counts().head(10).index
    data['SELLER_COUNTRY'] = data['SELLER_COUNTRY'].where(
        data['SELLER_COUNTRY'].isin(top10), 'OTHERS')

    # NOTE(review): the two department thresholds differ (98 vs 97) -- confirm
    # this is intentional.
    for column, threshold in (('SELLER_DEPARTMENT', 98), ('BUYER_DEPARTMENT', 97)):
        department = data[column]
        # Switch to object dtype explicitly so the 'OTHERS' string can sit next
        # to the integer department codes.
        data[column] = department.astype(object).mask(department > threshold, 'OTHERS')

    top5 = data['PRODUCT_TYPE'].value_counts().head(5).index
    data['PRODUCT_TYPE'] = data['PRODUCT_TYPE'].where(
        data['PRODUCT_TYPE'].isin(top5), 'OTHERS')

    ####### Nominal #######
    nominal_columns = ['SHIPPING_MODE', 'SELLER_COUNTRY', 'PRODUCT_TYPE',
                       'PRODUCT_FAMILY', 'BUYER_DEPARTMENT', 'SELLER_DEPARTMENT']
    indicator_frames = [pd.get_dummies(data[column], prefix=column)
                        for column in nominal_columns]
    data = pd.concat([data] + indicator_frames, axis=1)

    ####### Ordinal #######
    # A level absent from its mapping becomes NaN and makes the int16 cast fail.
    ordinal_ranks = {
        'PRICECLUB_STATUS': {'MISSING':0,'UNSUBSCRIBED':1, 'REGULAR': 2, 'SILVER':3, 'GOLD':4, 'PLATINUM':5},
        'SHIPPING_PRICE': {'MISSING':0,'<1':1, '1<5': 2, '5<10':3, '10<20':4, '>20':5},
        'WARRANTIES_PRICE': {'NON_WARRANTIES':0, '<5':1, '5<20': 2, '20<50':3, '50<100':4, '100<500':5},
        'PURCHASE_COUNT': {'<5':1, '5<20': 2, '20<50':3, '50<100':4, '100<500':5, '>500':6},
        'SELLER_SCORE_COUNT': {'MISSING':0,'<100':1, '100<1000': 2, '1000<10000':3, '10000<100000':4, '100000<1000000':5},
        'ITEM_PRICE': {'<10':0, '10<20': 1, '20<50':2, '50<100':3, '100<500':4, '500<1000':5, '1000<5000':6, '>5000':7},
    }
    for column, ranks in ordinal_ranks.items():
        data[column] = data[column].map(ranks).astype(np.int16)

    # Remove unused features (ID, WARRANTIES_FLG) and the nominal columns that
    # have just been replaced by their indicator columns.
    data = data.drop(['ID', 'WARRANTIES_FLG'] + nominal_columns, axis=1)

    return data

# Machine Learning

In [3]:
def _print_split_report(split_name, y_true, y_pred):
    """Print weighted F1, weighted ROC AUC and the per-class report of one split.

    The ROC AUC is computed from hard class predictions (one-hot encoded), not
    from predicted probabilities.
    """
    # Learn the one-hot layout from the true labels only and reuse it for the
    # predictions. Fitting a second time on the predictions would yield a
    # different column layout whenever some class is never predicted.
    lb.fit(y_true)
    y_true_onehot = lb.transform(y_true)
    y_pred_onehot = lb.transform(y_pred)

    print(split_name)
    print('f1_score:      ', metrics.f1_score(y_true, y_pred, average='weighted'),
          '\nroc_auc_score: ', metrics.roc_auc_score(y_true_onehot, y_pred_onehot, average='weighted'), '\n')
    # The report is printed with the readable class names rather than the codes.
    print(classification_report(target_vectoriser(y_true), target_vectoriser(y_pred)))


def run_classifier(classifier, data_training, data_validate):
    """Fit `classifier`, print train/test metrics and build the submission frame.

    The training frame is split 80/20 (stratified on the module-level
    `target_number`, fixed random_state), -1 placeholders are replaced by the
    column means learned on the training part, and the classifier is fitted
    with 'balanced' per-sample weights.

    Parameters
    ----------
    classifier : estimator
        Must expose ``fit(X, y, sample_weight=...)`` and ``predict(X)``.
    data_training : pandas.DataFrame
        Preprocessed training features, row-aligned with `target_number`.
    data_validate : pandas.DataFrame
        Preprocessed features to predict; it has to carry the same columns, in
        the same order, as `data_training`, and be row-aligned with the
        module-level `data_test` (whose ID column labels the predictions).

    Returns
    -------
    pandas.DataFrame
        Two columns: 'ID' and the predicted 'CLAIM_TYPE' label.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data_training.values, target_number,
        test_size=0.20, random_state=0, stratify=target_number)
    # NOTE(review): these weights are applied on top of any class_weight the
    # classifier was configured with -- confirm the double re-weighting is
    # intended.
    w = compute_sample_weight(class_weight='balanced', y=y_train)

    # -1 is the "missing" placeholder; the means come from the training part
    # only, so nothing from the held-out rows leaks into the imputation.
    imputer = Imputer(strategy='mean', missing_values=-1)
    imputer.fit(X_train)
    X_train_imputed = imputer.transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    classifier.fit(X_train_imputed, y_train, sample_weight=w)

    _print_split_report('TRAIN', y_train, classifier.predict(X_train_imputed))
    _print_split_report('TEST', y_test, classifier.predict(X_test_imputed))

    # Predict the submission set and decode the class codes back to labels.
    X_validate_imputed = imputer.transform(data_validate)
    score = classifier.predict(X_validate_imputed)
    df = pd.DataFrame(columns=['ID', 'CLAIM_TYPE'])
    df['ID'] = data_test['ID']
    df['CLAIM_TYPE'] = target_vectoriser(score)

    return df
    

In [4]:
data_t = pre_traitement(all_data_training)
# The indicator columns are derived from the values present in each frame, so
# the test frame may miss some training columns, carry extra ones, or order
# them differently. Align it on the training layout: absent indicators are
# filled with 0 and columns unknown to the model are discarded.
data_v = pre_traitement(data_test).reindex(columns=data_t.columns, fill_value=0)

In [5]:
%%time
classifier = RandomForestClassifier(n_estimators=200,max_depth=50, class_weight="balanced_subsample", random_state=0)
df=run_classifier(classifier, data_t, data_v)
df.to_csv("data_v_RF.csv", index=False, sep=";")

TRAIN
f1_score:       0.9822475378248254 
roc_auc_score:  0.9857010783109489 

                          precision    recall  f1-score   support

                       -       1.00      0.99      1.00     12233
                 DAMAGED       0.99      0.96      0.98      4638
               DIFFERENT       0.99      0.97      0.98      3342
                    FAKE       0.07      1.00      0.14        98
            NOT_RECEIVED       1.00      0.95      0.97     11321
SELLER_CANCEL_POSTERIORI       1.00      0.98      0.99     10791
               UNDEFINED       0.98      0.97      0.97      3159
              WITHDRAWAL       1.00      0.97      0.98      5462

             avg / total       0.99      0.97      0.98     51044

TEST
f1_score:       0.47115683277337045 
roc_auc_score:  0.6874489982591161 

                          precision    recall  f1-score   support

                       -       0.59      0.87      0.70      3059
                 DAMAGED       0.36      0.20 

In [6]:
%%time
classifier =  LogisticRegression(solver='lbfgs', multi_class='multinomial')
df=run_classifier(classifier, data_t, data_v)
df.to_csv("data_v_LR.csv", index=False, sep=";")

TRAIN
f1_score:       0.14853978578108878 
roc_auc_score:  0.5255049045825522 

                          precision    recall  f1-score   support

                       -       0.32      0.53      0.40     12233
                 DAMAGED       0.10      0.02      0.03      4638
               DIFFERENT       0.11      0.06      0.08      3342
                    FAKE       0.00      0.56      0.01        98
            NOT_RECEIVED       0.24      0.04      0.07     11321
SELLER_CANCEL_POSTERIORI       0.23      0.05      0.08     10791
               UNDEFINED       0.08      0.07      0.07      3159
              WITHDRAWAL       0.17      0.04      0.06      5462

             avg / total       0.22      0.16      0.15     51044

TEST
f1_score:       0.14104471719014652 
roc_auc_score:  0.5208552036561523 

                          precision    recall  f1-score   support

                       -       0.31      0.52      0.39      3059
                 DAMAGED       0.11      0.02