# Electricity and Gas Consumption - Fraud Detection
# By Mohamed Eltayeb

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold

from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.float_format', lambda x: '%.3f' % x)
plt.rcParams["figure.figsize"] = (12, 8)
pd.set_option('display.max_columns', None)

# Utils

In [None]:
# Plot the feature importances of a fitted model
def plotImp(model, X, num=30, fig_size=(60, 30)):
    """Chart the highest-ranked features and return the complete ranking.

    model    : fitted estimator exposing ``feature_importances_``.
    X        : DataFrame whose ``columns`` line up with those importances.
    num      : number of top-ranked features shown in the chart.
    fig_size : matplotlib figure size passed to ``plt.figure``.

    The chart is saved as 'cb_importances-01.png' and then shown.
    Returns a DataFrame with the columns 'Value' and 'Feature', ordered by
    'Value' from largest to smallest.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_,
                                'Feature': X.columns})
    # Rank once; the same ordering feeds both the chart and the return value.
    ranked = feature_imp.sort_values(by="Value", ascending=False)

    plt.figure(figsize=fig_size)
    # NOTE: sns.set changes seaborn's global settings, not just this figure.
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=ranked[0:num])
    plt.title('Catboost Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('cb_importances-01.png')
    plt.show()
    return ranked

In [None]:
# Reduce Memory Usage
def reduce_memory_usage(df):
    """Shrink the columns of ``df`` to smaller dtypes, in place, and return ``df``.

    Rules applied per column:
      * object / pandas string columns become ``category``;
      * signed and unsigned NumPy integer columns are narrowed to the smallest
        integer type of the same signedness that holds their min and max;
      * NumPy float columns are narrowed to float16, else float32, when their
        min and max fit (this trades precision for memory);
      * every other dtype (bool, datetime, timedelta, category, nullable
        extension types, ...) is left untouched.

    Bounds are inclusive, so a column whose extreme value equals a type limit
    (e.g. 127) still fits that type. Columns whose min/max are NaN (empty or
    all-missing) or infinite are not narrowed.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        dtype = df[col].dtype

        if dtype.name == 'object' or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric columns are narrowed. Anything else
        # (category, bool, datetimes, extension dtypes) keeps its dtype.
        if not isinstance(dtype, np.dtype) or dtype.kind not in 'iuf':
            continue

        if dtype.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif dtype.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        else:
            candidates, limits_of = float_types, np.finfo

        c_min = df[col].min()
        c_max = df[col].max()

        # Candidates are ordered from narrowest to widest; take the first fit.
        # NaN or infinite extremes fail every comparison, leaving the column as is.
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate) != dtype:
                    df[col] = df[col].astype(candidate)
                break

    return df

# Read the training and testing data


In [None]:
# Load the four CSV inputs: the client tables and their invoice tables.
train_df, test_df, invoice_train, invoice_test = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ("client_train.csv", "client_test.csv",
                     "invoice_train.csv", "invoice_test.csv")
)

# Add Time-related Features

In [None]:
# Order every invoice table by client, then by invoice date, renumbering rows from 0.
sort_keys = ['client_id', 'invoice_date']
invoice_train = invoice_train.sort_values(sort_keys).reset_index(drop=True)
invoice_test = invoice_test.sort_values(sort_keys).reset_index(drop=True)

# Each date column together with the tables that carry it.
date_columns = (
    ('invoice_date', (invoice_train, invoice_test)),
    ('creation_date', (train_df, test_df)),
)

for Date, frames in date_columns:
    for dataset in frames:
        # Parse to datetime64[ns], then replace the column by numeric parts.
        stamps = dataset[Date].astype('datetime64[ns]')
        dataset[Date] = stamps
        # Nanosecond integer scaled by 1e-9, i.e. the timestamp in seconds.
        dataset[f'{Date}_Date_Int'] = stamps.astype(np.int64) * 1e-9
        dataset[f'{Date}_Day'] = stamps.dt.day
        dataset[f'{Date}_Month'] = stamps.dt.month
        dataset[f'{Date}_Year'] = stamps.dt.year
        dataset.drop(Date, inplace=True, axis=1)

# Only the invoice tables are downcast here.
invoice_train = reduce_memory_usage(invoice_train)
invoice_test = reduce_memory_usage(invoice_test)

## Correct Some Errors in the Data

In [None]:
# Cleaned status code for every known raw value, keyed by the raw value's text
# form. Keying on text means the same entry matches whether the CSV column was
# parsed as integers or as strings.
counter_statue_map = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, 'A': 0,
                      '769': 5, '618': 5, '269375': 5, '46': 5, '420': 5}


def _clean_counter_statue(raw):
    """Return the cleaned status as a string; 'nan' for missing or unknown values."""
    if pd.isna(raw):
        return 'nan'
    key = str(raw).strip()
    try:
        # 5, 5.0, '5' and ' 5 ' all collapse to the key '5'.
        key = str(int(float(key)))
    except (ValueError, OverflowError):
        # Non-numeric text such as 'A' is looked up unchanged.
        pass
    cleaned = counter_statue_map.get(key)
    return 'nan' if cleaned is None else str(cleaned)


# Apply the same cleaning to train and test so that both end up with the same
# string labels (previously only the training invoices were remapped).
for dataset in [invoice_train, invoice_test]:
    # astype(object) undoes a possible 'category' dtype so that the function is
    # called on every element, missing values included.
    dataset['counter_statue'] = dataset['counter_statue'].astype(object).map(_clean_counter_statue)
train_df['target'] = train_df['target'].astype(int)

## Store The IDs

In [None]:
# Keep the test client ids for the submission file; 'client_id' is dropped
# from test_df further down.
ID = test_df.loc[:, 'client_id']

# Feature Engineering

## Store Feature Names to Be Used for Aggregations Later

In [None]:
# Key column(s) the invoices are grouped by when aggregating per client.
Aggs_based = ['client_id']

# Invoice columns aggregated with numerical statistics only.
Aggs_num = [f'consommation_level_{level}' for level in range(1, 5)]
Aggs_num.append('months_number')

# Invoice columns that are label-encoded and additionally aggregated with
# mode / nunique.
Aggs_cat = [
    'reading_remarque',
    'counter_coefficient',
    'tarif_type',
    'counter_number',
    'counter_statue',
    'counter_code',
    'old_index',
    'new_index',
    'counter_type',
]
Aggs_cat.extend(f'invoice_date_{part}' for part in ('Date_Int', 'Day', 'Month', 'Year'))

## Difference Between New_index and Old_index

In [None]:
# Per-invoice difference between the new and the old index reading.
for dataset in (invoice_train, invoice_test):
    dataset['NewDiffOld'] = dataset['new_index'].sub(dataset['old_index'])
# Register the new column with the categorical aggregation list.
Aggs_cat.append('NewDiffOld')

## Region Bins

In [None]:
def _region_bin(code):
    """Bucket a region code: 1 when <= 100, 3 when >= 300, otherwise 2."""
    if code <= 100:
        return 1
    if code >= 300:
        return 3
    return 2


for dataset in (train_df, test_df):
    dataset['region_bins'] = dataset['region'].apply(_region_bin)

## Months Since the Account Was Created

In [None]:
# Number of months from the account's creation month up to December 2021
# (the reference year 2022 is hard-coded). The column must carry the same
# name in train and test: the training column used to be spelled
# 'MonthSinceAccounCreationt', which left the feature misaligned between
# the two tables.
for dataset in [train_df, test_df]:
    dataset['MonthSinceAccounCreation'] = (2022 - dataset['creation_date_Year'])*12 - dataset['creation_date_Month']

## Interactions

In [None]:
# Arithmetic interactions between counter_code and counter_number, built the
# same way for the train and the test invoices.
for dataset in (invoice_train, invoice_test):
    code, number = dataset['counter_code'], dataset['counter_number']
    dataset['counter_code_number_add'] = code.add(number)
    dataset['counter_code_number_sub'] = code.sub(number)
    dataset['counter_code_number_prod'] = code.mul(number)
    # True division; a zero counter_number is not special-cased here.
    dataset['counter_code_number_div'] = code.div(number)

# The four interaction columns take part in the numerical aggregations.
Aggs_num.extend(
    f'counter_code_number_{suffix}' for suffix in ('add', 'sub', 'prod', 'div')
)

## Label Encoding for Some of the Categorical Features

In [None]:
# Encode every categorical invoice feature as integer labels. The encoder is
# refitted per feature on the stacked train + test values so that both tables
# share a single label space.
le = LabelEncoder()
df = pd.concat([invoice_train, invoice_test])
for f in Aggs_cat:
    le.fit(df[f])
    for dataset in (invoice_train, invoice_test):
        dataset[f] = le.transform(dataset[f])

## Add Aggregations (Numerical)

In [None]:
def Agg(Feature):
    """Add per-key summary statistics of an invoice column to the client tables.

    For every key in ``Aggs_based`` the invoices are grouped by that key and
    the mean, median, std, min, max and sum of ``Feature`` are mapped onto
    ``train_df`` (from ``invoice_train``) and ``test_df`` (from
    ``invoice_test``), followed by the range (max - min). New columns are
    named '{Feature}_Agg_{key}_{stat}'. Clients without any invoice get NaN.
    Both client tables are modified in place; nothing is returned.
    """
    stat_names = ('mean', 'median', 'std', 'min', 'max', 'sum')
    # Pair each client table with its invoice table explicitly instead of
    # comparing whole DataFrames to find out which one is being processed.
    for client, dataset in ((train_df, invoice_train), (test_df, invoice_test)):
        for feat_1 in Aggs_based:
            # Group once and derive every statistic from the same grouping.
            grouped = dataset.groupby(feat_1)[Feature]
            for stat in stat_names:
                per_key = dict(getattr(grouped, stat)())
                client[f'{Feature}_Agg_{feat_1}_{stat}'] = client[feat_1].map(per_key)
            client[f'{Feature}_Agg_{feat_1}_range'] = (
                client[f'{Feature}_Agg_{feat_1}_max'] - client[f'{Feature}_Agg_{feat_1}_min']
            )


for feat in tqdm(Aggs_num + Aggs_cat):
    Agg(feat)

## Add Aggregations (Categorical)

In [None]:
def Agg(Feature):
    """Add the per-key mode and distinct count of an invoice column to the client tables.

    For every key in ``Aggs_based`` the invoices are grouped by that key; the
    most frequent value of ``Feature`` (the smallest one on ties, as
    ``Series.mode`` returns its result sorted) and the number of distinct
    values are mapped onto ``train_df`` (from ``invoice_train``) and
    ``test_df`` (from ``invoice_test``). New columns are named
    '{Feature}_Agg_{key}_mode' and '{Feature}_Agg_{key}_nunique'.
    Both client tables are modified in place; nothing is returned.
    """
    def first_mode(values):
        # An empty or all-missing group has no mode; report NaN instead of raising.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    # Pair each client table with its invoice table explicitly instead of
    # comparing whole DataFrames to find out which one is being processed.
    for client, dataset in ((train_df, invoice_train), (test_df, invoice_test)):
        for feat_1 in Aggs_based:
            grouped = dataset.groupby(feat_1)[Feature]
            client[f'{Feature}_Agg_{feat_1}_mode'] = client[feat_1].map(dict(grouped.agg(first_mode)))
            client[f'{Feature}_Agg_{feat_1}_nunique'] = client[feat_1].map(dict(grouped.nunique()))


for feat in tqdm(Aggs_cat):
    Agg(feat)

## Drop Client_id

In [None]:
# The identifier is no longer needed as a column of the client tables.
for dataset in (train_df, test_df):
    dataset.drop(columns=['client_id'], inplace=True)

# Encoding

##### Label Encoding 

In [None]:
# Integer-encode whatever object / category columns remain in the client
# tables; the encoder is refitted per column on the stacked train + test
# values so that both tables share a single label space.
feats = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
le = LabelEncoder()
df = pd.concat([train_df, test_df])
for f in feats:
    print(f)
    le.fit(df[f])
    for dataset in (train_df, test_df):
        dataset[f] = le.transform(dataset[f])

##### One-Hot Encoding

In [None]:
# One-hot encode 'region' and 'disrict' on the stacked train + test rows so
# that both tables receive the same indicator columns. The original columns
# are kept next to their indicators.
feats = ['region','disrict']

# Remember what the concat below would otherwise lose: where the training
# rows end, and the dtype of 'target' (stacking with the target-less test
# rows introduces NaN and turns the column into float).
n_train = train_df.shape[0]
target_dtype = train_df['target'].dtype

df = pd.concat([train_df,test_df])
for feat in feats:
    # prefix= yields the '<feat>_<value>' column names directly.
    OHE_cols = pd.get_dummies(df[feat], prefix=feat)
    df = pd.concat([df,OHE_cols],axis=1)

# Split by position into independent frames (no in-place edits on slices).
train_df = df.iloc[:n_train].copy()
train_df['target'] = train_df['target'].astype(target_dtype)
test_df = df.iloc[n_train:].drop('target',axis=1)

## Dropping Duplicate and Constant Features

In [None]:
print('Features Before Dropping: ', train_df.shape)

# Drop Duplicate Features
# A column is a duplicate when it equals (values and dtype, per Series.equals)
# a column that comes before it in train_df. Each pair is compared once, and
# 'target' itself is never dropped because test_df has no such column.
cols = list(train_df.columns)
dup = []
for position, feat_1 in enumerate(tqdm(cols)):
    if feat_1 in dup:
        continue
    for feat_2 in cols[position + 1:]:
        if feat_2 == 'target' or feat_2 in dup:
            continue
        if train_df[feat_1].equals(train_df[feat_2]):
            dup.append(feat_2)
train_df.drop(columns=dup, inplace=True)
test_df.drop(columns=dup, inplace=True)

# Drop Constant Features
# nunique() ignores missing values, so "<= 1" covers single-valued columns as
# well as columns that are entirely missing, in either table.
constant = [
    feat for feat in tqdm(test_df.columns)
    if train_df[feat].nunique() <= 1 or test_df[feat].nunique() <= 1
]
train_df.drop(columns=constant, inplace=True)
test_df.drop(columns=constant, inplace=True)

print('Features After Dropping: ', train_df.shape)

## Missing Values

In [None]:
# Impute missing values with column medians learned on the training table
# only, and apply those same medians to the test table so that both tables
# are filled with identical values per column. Median entries for columns a
# table does not have (e.g. 'target' in test_df) are ignored by fillna.
train_medians = train_df.median(numeric_only=True)
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)

# Modeling

In [None]:
# CatBoost hyper-parameters, one per line.
cb_params = dict(
    depth=8,
    iterations=5000,
    learning_rate=0.0164391346853785,
    task_type='GPU',  # NOTE(review): presumably needs a GPU at fit time; confirm
    reg_lambda=21.97780539780917,
    verbose=0,
)
cb = CatBoostClassifier(random_state=42, **cb_params)

## Validation:

In [None]:
print('Validating...')

# Plain arrays: every column except the label, and the label itself.
X = train_df.drop(columns=['target']).values
y = train_df['target'].values

# 5-fold stratified cross-validation; the same estimator object is refitted
# on each fold and scored by ROC AUC on the held-out part.
splitter = StratifiedKFold(n_splits=5)
scores = []
for fold, (train_index, test_index) in enumerate(splitter.split(X, y)):
    X_Train, y_Train = X[train_index], y[train_index]
    X_Test, y_Test = X[test_index], y[test_index]
    cb.fit(X_Train, y_Train)
    y_pred = cb.predict_proba(X_Test)[:, 1]
    fold_auc = roc_auc_score(y_Test, y_pred)
    scores.append(fold_auc)
    print(fold_auc)

print("\nMean:",np.mean(scores),"\nSTD: ", np.std(scores))

## Show the Feature Importances

In [None]:
# Importances of `cb` as it was last fitted, matched to the feature columns.
imps = plotImp(cb, train_df.drop(columns=['target']))

In [None]:
# Remove every feature whose importance is exactly 0 from both tables.
zero_importance = imps['Value'] == 0
useless_features = imps.loc[zero_importance, 'Feature'].values
for dataset in (train_df, test_df):
    dataset.drop(columns=useless_features, inplace=True)

## Inference

In [None]:
# Refit on the complete training table, then score the test table.
X = train_df.drop(columns=['target'])
y = train_df['target']
cb.fit(X, y)

# Probability of the second class (column 1 of predict_proba).
fraud_probability = cb.predict_proba(test_df)[:, 1]
test_df['target'] = fraud_probability

# Write the stored client ids next to their predicted probabilities.
submission = pd.DataFrame({"ID": ID, "Target": test_df['target'].values})
submission.to_csv('FraudDetectionSubmission.csv', index=False)