# **Task 3 - Feature Engineering**

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Add the parent directory to the system path 
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))

# Importing modules from scripts 

In [4]:
from scripts.feature_engineering import *
from scripts.logger import setup_logger

In [5]:

#logger
logger = setup_logger('feature_engineering_logger', '../logs/FE.log')

# Load the data

In [6]:
data = pd.read_csv('../data/data.csv')
logger.info("Data loaded successfully.")
data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'])

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   TransactionId         95662 non-null  object             
 1   BatchId               95662 non-null  object             
 2   AccountId             95662 non-null  object             
 3   SubscriptionId        95662 non-null  object             
 4   CustomerId            95662 non-null  object             
 5   CurrencyCode          95662 non-null  object             
 6   CountryCode           95662 non-null  int64              
 7   ProviderId            95662 non-null  object             
 8   ProductId             95662 non-null  object             
 9   ProductCategory       95662 non-null  object             
 10  ChannelId             95662 non-null  object             
 11  Amount                95662 non-null  float64            
 12  Valu

In [9]:
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0


# Summary statistics

In [11]:
data.describe()

Unnamed: 0,CountryCode,Amount,Value,PricingStrategy,FraudResult
count,95662.0,95662.0,95662.0,95662.0,95662.0
mean,256.0,6717.846,9900.584,2.255974,0.002018
std,0.0,123306.8,123122.1,0.732924,0.044872
min,256.0,-1000000.0,2.0,0.0,0.0
25%,256.0,-50.0,275.0,2.0,0.0
50%,256.0,1000.0,1000.0,2.0,0.0
75%,256.0,2800.0,5000.0,2.0,0.0
max,256.0,9880000.0,9880000.0,4.0,1.0


# **1.** Aggregate Features

In [12]:
data = create_aggregate_features(data)
print("\nAfter creating aggregate features:")
print(data[['CustomerId', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionCount', 'StdDevTransactionAmount']].head())
logger.info("creating aggregate features on Amount")


After creating aggregate features:
        CustomerId  TotalTransactionAmount  AverageTransactionAmount  \
0  CustomerId_4406               109921.75                923.712185   
1  CustomerId_4406               109921.75                923.712185   
2  CustomerId_4683                 1000.00                500.000000   
3   CustomerId_988               228727.20               6019.136842   
4   CustomerId_988               228727.20               6019.136842   

   TransactionCount  StdDevTransactionAmount  
0               119              3042.294251  
1               119              3042.294251  
2                 2                 0.000000  
3                38             17169.241610  
4                38             17169.241610  


# **2.** Extract Features

In [13]:
data = extract_time_features(data)
print("\nAfter extracting time features:")
print(data[['CustomerId', 'TransactionStartTime', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear']].head())
logger.info("extracted features from TransactionStartTime ")


After extracting time features:
        CustomerId      TransactionStartTime  TransactionHour  TransactionDay  \
0  CustomerId_4406 2018-11-15 02:18:49+00:00                2              15   
1  CustomerId_4406 2018-11-15 02:19:08+00:00                2              15   
2  CustomerId_4683 2018-11-15 02:44:21+00:00                2              15   
3   CustomerId_988 2018-11-15 03:32:55+00:00                3              15   
4   CustomerId_988 2018-11-15 03:34:21+00:00                3              15   

   TransactionMonth  TransactionYear  
0                11             2018  
1                11             2018  
2                11             2018  
3                11             2018  
4                11             2018  


In [14]:
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,PricingStrategy,FraudResult,TotalTransactionAmount,AverageTransactionAmount,TransactionCount,StdDevTransactionAmount,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,2,0,109921.75,923.712185,119,3042.294251,2,15,11,2018
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,109921.75,923.712185,119,3042.294251,2,15,11,2018
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,2,0,1000.0,500.0,2,0.0,2,15,11,2018
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,2,0,228727.2,6019.136842,38,17169.24161,3,15,11,2018
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,228727.2,6019.136842,38,17169.24161,3,15,11,2018


In [16]:
data.shape

(95662, 24)

# **3.** Encode Categorical Variables

In [17]:
categorical_columns = ['ProductCategory', 'ChannelId', 'PricingStrategy','CustomerId','ProviderId']
logger.info("categorical features are encoded using onehot encoder")
print("\nEncoding categorical features...")
df_encoded = encode_categorical_features(data, categorical_columns)
df_encoded.head()


Encoding categorical features...


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CurrencyCode,CountryCode,ProductId,Amount,Value,TransactionStartTime,...,CustomerId_CustomerId_993,CustomerId_CustomerId_994,CustomerId_CustomerId_996,CustomerId_CustomerId_998,ProviderId_ProviderId_1,ProviderId_ProviderId_2,ProviderId_ProviderId_3,ProviderId_ProviderId_4,ProviderId_ProviderId_5,ProviderId_ProviderId_6
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,UGX,256,ProductId_10,1000.0,1000,2018-11-15 02:18:49+00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,UGX,256,ProductId_6,-20.0,20,2018-11-15 02:19:08+00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,UGX,256,ProductId_1,500.0,500,2018-11-15 02:44:21+00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,UGX,256,ProductId_21,20000.0,21800,2018-11-15 03:32:55+00:00,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,UGX,256,ProductId_6,-644.0,644,2018-11-15 03:34:21+00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
