In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Location of the raw transactions CSV.
# NOTE(review): absolute, machine-specific path; this cell will not run on
# another machine without editing it.
file_path = 'C:/Users/nejat/AIM Projects/week6 data/data.csv'
df = load_data(file_path)
# Preview the first rows of the loaded frame.
print(df.head())  

         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    ChannelId   Amount  Value  TransactionStart

In [7]:
def create_aggregate_features(df):
    """Attach per-customer aggregate columns to ``df`` and return it.

    Rows are grouped by ``CustomerId``; every row receives its group's value
    for each of the following columns:

    - ``total_transaction_amount``: sum of ``Amount``
    - ``average_transaction_amount``: mean of ``Amount``
    - ``transaction_count``: count of ``TransactionId``
    - ``std_transaction_amount``: standard deviation of ``Amount``, with
      missing results replaced by 0

    The columns are written onto the frame that was passed in; that same
    object is returned.
    """
    # (new column, source column, groupby aggregation)
    plain_aggregates = (
        ('total_transaction_amount', 'Amount', 'sum'),
        ('average_transaction_amount', 'Amount', 'mean'),
        ('transaction_count', 'TransactionId', 'count'),
    )
    for new_col, source_col, agg_name in plain_aggregates:
        df[new_col] = df.groupby('CustomerId')[source_col].transform(agg_name)

    # The std result is NaN for some groups (e.g. a single transaction);
    # those are mapped to 0.
    spread = df.groupby('CustomerId')['Amount'].transform('std')
    df['std_transaction_amount'] = spread.fillna(0)
    return df

# Add the per-customer aggregate columns, then preview them next to CustomerId.
df = create_aggregate_features(df)
print("Aggregate Features Added:")
print(df[['CustomerId', 'total_transaction_amount', 'average_transaction_amount', 'transaction_count', 'std_transaction_amount']].head())


Aggregate Features Added:
        CustomerId  total_transaction_amount  average_transaction_amount  \
0  CustomerId_4406                 109921.75                  923.712185   
1  CustomerId_4406                 109921.75                  923.712185   
2  CustomerId_4683                   1000.00                  500.000000   
3   CustomerId_988                 228727.20                 6019.136842   
4   CustomerId_988                 228727.20                 6019.136842   

   transaction_count  std_transaction_amount  
0                119             3042.294251  
1                119             3042.294251  
2                  2                0.000000  
3                 38            17169.241610  
4                 38            17169.241610  


In [8]:
def extract_date_features(df):
    """Parse ``TransactionStartTime`` and derive calendar columns from it.

    The ``TransactionStartTime`` column is replaced by the result of
    ``pd.to_datetime``. Four columns are then added from its ``.dt``
    accessor: ``transaction_hour``, ``transaction_day``,
    ``transaction_month`` and ``transaction_year``.

    The frame that was passed in is modified and returned.
    """
    df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

    parsed = df['TransactionStartTime'].dt
    for component in ('hour', 'day', 'month', 'year'):
        df[f'transaction_{component}'] = getattr(parsed, component)
    return df

# Parse TransactionStartTime and add the hour/day/month/year columns, then preview them.
df = extract_date_features(df)
print("\nDate Features Extracted:")
print(df[['TransactionStartTime', 'transaction_hour', 'transaction_day', 'transaction_month', 'transaction_year']].head())



Date Features Extracted:
       TransactionStartTime  transaction_hour  transaction_day  \
0 2018-11-15 02:18:49+00:00                 2               15   
1 2018-11-15 02:19:08+00:00                 2               15   
2 2018-11-15 02:44:21+00:00                 2               15   
3 2018-11-15 03:32:55+00:00                 3               15   
4 2018-11-15 03:34:21+00:00                 3               15   

   transaction_month  transaction_year  
0                 11              2018  
1                 11              2018  
2                 11              2018  
3                 11              2018  
4                 11              2018  


In [11]:
def encode_categorical_features(df, categorical_cols):
    """One-hot encode ``categorical_cols`` with ``pd.get_dummies``.

    If any requested column is absent from ``df``, the absent names are
    printed and ``df`` is returned untouched (no column is encoded).
    Otherwise a new frame is returned in which each listed column is
    replaced by its dummy columns, with the first level of each dropped
    (``drop_first=True``).
    """
    absent = [name for name in categorical_cols if name not in df.columns]
    if not absent:
        return pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    print(f"The following columns are missing: {absent}")
    return df

# Columns to one-hot encode.
categorical_columns = ['ProductCategory', 'CurrencyCode', 'ProviderId']

# Rebind df to whatever encode_categorical_features returns.
df = encode_categorical_features(df, categorical_columns)

print("\nCategorical Features Encoded:")
print(df.head())


Categorical Features Encoded:
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId  CountryCode     ProductId    ChannelId   Amount  Value  \
0  CustomerId_4406          256  ProductId_10  ChannelId_3   1000.0   1000   
1  CustomerId_4406          256   ProductId_6  ChannelId_2    -20.0     20   
2  CustomerId_4683          256   ProductId_1  ChannelId_3    500.0    500   
3   CustomerId_988          256  ProductId_21  ChannelId_3  20000.0  21800   
4   CustomerId_988          256   ProductId_6  ChannelId_2   -644.0    644   

   ... ProductCategory_ot

In [10]:
# NOTE(review): this cell is labelled In [10] but sits after In [11]; the listing
# printed below still contains ProductCategory, CurrencyCode and ProviderId, so it
# was presumably executed before the encoding cell. Confirm and reorder.
print("Available columns:", df.columns)



Available columns: Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'total_transaction_amount', 'average_transaction_amount',
       'transaction_count', 'std_transaction_amount', 'transaction_hour',
       'transaction_day', 'transaction_month', 'transaction_year'],
      dtype='object')


In [13]:
def handle_missing_values(df, strategy_numeric='mean', strategy_categorical='most_frequent'):
    """Fill missing values in ``df`` column-by-dtype and return the frame.

    Columns are split by dtype and handled as follows:

    - numeric columns: ``SimpleImputer(strategy=strategy_numeric)``
    - object columns: ``SimpleImputer(strategy=strategy_categorical)``
    - datetime columns (timezone-naive and timezone-aware): forward fill

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified and the same object is returned.
    strategy_numeric : str, default 'mean'
        Passed as ``strategy`` to the imputer for numeric columns.
    strategy_categorical : str, default 'most_frequent'
        Passed as ``strategy`` to the imputer for object columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the imputed values written back.

    Notes
    -----
    Forward fill cannot fill a missing value that has no earlier non-missing
    value in its column, so leading gaps in datetime columns remain.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    # 'datetime' on its own only matches timezone-naive columns; 'datetimetz'
    # is required to also select timezone-aware ones (e.g. values parsed with
    # a +00:00 offset), which would otherwise be skipped silently.
    datetime_cols = df.select_dtypes(include=['datetime', 'datetimetz']).columns

    if len(numeric_cols) > 0:
        numeric_imputer = SimpleImputer(strategy=strategy_numeric)
        df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

    if len(categorical_cols) > 0:
        categorical_imputer = SimpleImputer(strategy=strategy_categorical)
        df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

    for col in datetime_cols:
        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on df[col]: the in-place call operates on
        # a column selection and is not guaranteed to update df, and the
        # method= argument of fillna is deprecated in favour of ffill().
        df[col] = df[col].ffill()

    return df

# Impute using the mean strategy for numeric columns and the most-frequent
# strategy for object columns.
df = handle_missing_values(df, strategy_numeric='mean', strategy_categorical='most_frequent')

# Per-column count of nulls remaining after imputation.
print("\nMissing Values Handled:")
print(df.isnull().sum()) 


Missing Values Handled:
TransactionId                         0
BatchId                               0
AccountId                             0
SubscriptionId                        0
CustomerId                            0
CountryCode                           0
ProductId                             0
ChannelId                             0
Amount                                0
Value                                 0
TransactionStartTime                  0
PricingStrategy                       0
FraudResult                           0
total_transaction_amount              0
average_transaction_amount            0
transaction_count                     0
std_transaction_amount                0
transaction_hour                      0
transaction_day                       0
transaction_month                     0
transaction_year                      0
ProductCategory_data_bundles          0
ProductCategory_financial_services    0
ProductCategory_movies                0
ProductCategory

In [14]:
def normalize_standardize_features(df, columns, method='normalize'):
    """Rescale ``columns`` of ``df`` with a fitted scikit-learn scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are rescaled; it is modified and returned.
    columns : list
        Column labels to fit the scaler on and overwrite.
    method : str, default 'normalize'
        ``'normalize'`` uses ``MinMaxScaler``; ``'standardize'`` uses
        ``StandardScaler``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'normalize'`` nor ``'standardize'``.
    """
    if method not in ('normalize', 'standardize'):
        raise ValueError("Method must be 'normalize' or 'standardize'")

    # Only the scaler that was asked for is instantiated.
    scaler = MinMaxScaler() if method == 'normalize' else StandardScaler()

    scaled_values = scaler.fit_transform(df[columns])
    df[columns] = scaled_values
    return df

# Columns to rescale; method='standardize' selects StandardScaler inside the helper.
numeric_columns = ['Amount', 'Value', 'total_transaction_amount', 'average_transaction_amount']
df = normalize_standardize_features(df, numeric_columns, method='standardize')
print("\nNumerical Features Standardized:")
print(df[numeric_columns].head())


Numerical Features Standardized:
     Amount     Value  total_transaction_amount  average_transaction_amount
0 -0.046371 -0.072291                  0.170118                   -0.067623
1 -0.054643 -0.080251                  0.170118                   -0.067623
2 -0.050426 -0.076352                  0.165122                   -0.072568
3  0.107717  0.096648                  0.175567                   -0.008155
4 -0.059704 -0.075183                  0.175567                   -0.008155
