In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE, SMOTENC
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer, FloatTransformer, biningTransformer, weekdayTransformer, TotalTransformer
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

In [21]:
# Load the labelled training transactions and the unlabelled test
# transactions from the local data directory, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/training.csv", "data/test.csv")
)

In [22]:
# Single seed shared by every stochastic step in this cell so that
# Restart Kernel -> Run All reproduces the same training data.
SEED = 42

# Shuffle, then order each customer's transactions chronologically.
# Because the frame is sorted right after, the shuffle can only influence
# the relative order of rows that tie on (CustomerId, TransactionStartTime);
# seeding it keeps even that order stable between runs.
train = (
    train.sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
    .sort_values(by=['CustomerId', "TransactionStartTime"])
)

# Split the target off the features. `train_Y` keeps the sorted index, so it
# stays row-aligned with `train`.
# NOTE: this cell consumes the `FraudResult` column; re-run the data-loading
# cell before running this cell a second time.
train_Y = train["FraudResult"]
train = train.drop(columns=["FraudResult"])

test = test.sort_values(by=['CustomerId', "TransactionStartTime"])

# Identifier-like string columns (not referenced elsewhere in the visible
# cells; presumably consumed by the string-cleaning step -- TODO confirm).
StringToClean = ["TransactionId", "BatchId","AccountId","SubscriptionId","CustomerId", "ProviderId", "ProductId", "ChannelId", "ProductCategory"]

# Columns handed to DropperTransformer / OHTransformer / biningTransformer.
drop_cols = ["CurrencyCode", "BatchId", "CountryCode", "CustomerId", "PricingStrategy", "Amount"]
hot_cols = ["ProductCategory"]
bin_cols = ["TransactionStartTime"]

# Seeded oversampler so the synthetic minority samples are reproducible.
smt  = SMOTE(random_state=SEED)

In [23]:
# Full training pipeline: custom feature-engineering transformers, SMOTE
# oversampling, then the XGBoost classifier.
# NOTE: despite its name, `preprocessor` ends with the model step, so it is
# fitted with labels and used directly for prediction below.
# imblearn's Pipeline applies sampler steps only while fitting, so SMOTE does
# not resample the rows passed to predict().
preprocessor = imbPipeline(steps = [
    ("clean", StringCleanTransformer()),
    ("amout to sign", SignTransformer()),
    ("total", TotalTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("Dropper", DropperTransformer(drop_cols)),
    ("One hot encoding", OHTransformer(hot_cols)),
    # ("binning", biningTransformer(bin_cols)),
    ("weekday", weekdayTransformer()),
    ("float", FloatTransformer()),
    ("smote", smt),
    ("model", XGBClassifier(n_estimators = 500))
])

preprocessor.fit(train, train_Y)

test_res = preprocessor.predict(test)

# Build the submission in one step: the index comes from the TransactionId
# Series and the predictions are attached positionally, row for row.
output = pd.DataFrame({
    "TransactionId": test["TransactionId"],
    "FraudResult": test_res,
})

# Save the result to a csv file.
output.to_csv("submission.csv", index=False)

# Preview the first predictions as the cell's displayed result.
output.head(20)

['AccountId' 'SubscriptionId' 'ProviderId' 'ProductId' 'ChannelId' 'Value'
 'TransactionStartTime' 'Sign' 'total' 'TransactionStartDay'
 'ProductCategory0' 'ProductCategory1' 'ProductCategory2'
 'ProductCategory3' 'ProductCategory4' 'ProductCategory5'
 'ProductCategory6' 'ProductCategory7' 'ProductCategory8']
['AccountId' 'SubscriptionId' 'ProviderId' 'ProductId' 'ChannelId' 'Value'
 'TransactionStartTime' 'Sign' 'total' 'TransactionStartDay'
 'ProductCategory0' 'ProductCategory1' 'ProductCategory2'
 'ProductCategory3' 'ProductCategory4' 'ProductCategory5'
 'ProductCategory6' 'ProductCategory7' 'ProductCategory8']
['AccountId' 'SubscriptionId' 'ProviderId' 'ProductId' 'ChannelId' 'Value'
 'TransactionStartTime' 'Sign' 'total' 'TransactionStartDay'
 'ProductCategory0' 'ProductCategory1' 'ProductCategory2'
 'ProductCategory3' 'ProductCategory4' 'ProductCategory5'
 'ProductCategory6' 'ProductCategory7' 'ProductCategory8']
['AccountId' 'SubscriptionId' 'ProviderId' 'ProductId' 'ChannelId' 

In [24]:
# Toy example: running total of `amount` per customer, in date order.
# (pandas is already imported as `pd` in the notebook's import cell.)

# Create example DataFrame
df = pd.DataFrame({
    'customer_id': [1, 1, 2, 2, 3, 3],
    'amount': [10, 20, 5, 15, 25, 30],
    'date': ['2022-01-01', '2022-01-02', '2022-01-01', '2022-01-03', '2022-01-02', '2022-01-04']
})

# Convert date column to datetime type
df['date'] = pd.to_datetime(df['date'])

# Order each customer's rows by date so the running sum accumulates in
# chronological order.
df = df.sort_values(by=['customer_id', 'date'])

# groupby().cumsum() restarts the running sum for every customer_id and
# returns a Series aligned on df's index -- no per-customer loop needed.
df['total'] = df.groupby('customer_id')['amount'].cumsum()

df.head()


Unnamed: 0,customer_id,amount,date,total
0,1,10,2022-01-01,10
1,1,20,2022-01-02,30
2,2,5,2022-01-01,5
3,2,15,2022-01-03,20
4,3,25,2022-01-02,25
