In [5]:
import boto3
import os

from dotenv import load_dotenv

load_dotenv()

s3_key_id = os.environ.get("S3_ID")
s3_secret = os.environ.get("S3_SECRET")
bucket_name = os.environ.get("S3_BUCKET_NAME")
bucket_name = "otus-task-n3"
source_prefix = "fraud-data/"

s3_resource = boto3.resource("s3",
                            aws_access_key_id=s3_key_id,
                            aws_secret_access_key=s3_secret,
                            endpoint_url="https://storage.yandexcloud.net")

bucket = s3_resource.Bucket(bucket_name)

for obj in bucket.objects.filter():
    if obj.key:
        print(obj.key)


2019-08-22.txt/_SUCCESS
2019-08-22.txt/part-00000-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00001-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00002-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00003-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00004-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00005-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00006-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00007-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00008-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00009-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00010-b79e910b-d0ba-4a59-b479-24cdefc9e206-c000.snappy.parquet
2019-08-22.txt/part-00011-b79e910b-d0ba-4a59-b479-24cdefc9e206-

In [71]:
import pandas as pd
import os


s3_path = 's3://otus-task-n3/2019-09-21.txt'

df = pd.read_parquet(s3_path, storage_options={
    "key": os.environ.get("S3_ID"),
    "secret": os.environ.get("S3_SECRET"),
    "client_kwargs": {"endpoint_url": "https://storage.yandexcloud.net"}
})


   tranaction_id         tx_datetime  customer_id  terminal_id  tx_amount  \
0       46988237 2019-09-21 09:45:59            1          178      83.11   
1       46988238 2019-09-21 19:33:01            2          660      22.15   
2       46988239 2019-09-21 18:06:19            3          732      36.83   
3       46988240 2019-09-21 16:56:01           10          663      19.30   
4       46988241 2019-09-21 05:34:26           10          145     106.51   

   tx_time_seconds  tx_time_days  tx_fraud  tx_fraud_scenario  
0          2627159            30         0                  0  
1          2662381            30         0                  0  
2          2657179            30         0                  0  
3          2652961            30         0                  0  
4          2612066            30         0                  0  


In [None]:
df.columns

Index(['tranaction_id', 'tx_datetime', 'customer_id', 'terminal_id',
       'tx_amount', 'tx_time_seconds', 'tx_time_days', 'tx_fraud',
       'tx_fraud_scenario'],
      dtype='object')

In [72]:
df = df[df["customer_id"] >= 0]
df = df.dropna()

In [73]:
# Assuming df is your DataFrame
df['tx_datetime'] = pd.to_datetime(df['tx_datetime'])

# Time-based features
df['tx_hour'] = df['tx_datetime'].dt.hour
df['is_weekend'] = df['tx_datetime'].dt.dayofweek >= 5

# Sort by customer and transaction datetime
df = df.sort_values(by=['customer_id', 'tx_datetime'])

# Customer behavior features
df['time_since_last_tx'] = df.groupby('customer_id')['tx_datetime'].diff().dt.total_seconds()
df['avg_tx_amount_customer'] = df.groupby('customer_id')['tx_amount'].transform('mean')
df['tx_count_customer'] = df.groupby('customer_id')['tranaction_id'].transform('count')
df['var_tx_amount_customer'] = df.groupby('customer_id')['tx_amount'].transform('std')

# # Terminal-based features
df['avg_tx_amount_terminal'] = df.groupby('terminal_id')['tx_amount'].transform('mean')
df['tx_count_terminal'] = df.groupby('terminal_id')['tranaction_id'].transform('count')
df['var_tx_amount_terminal'] = df.groupby('terminal_id')['tx_amount'].transform('std')

# Aggregated features
rolling_window = '1H'  # Example: 24 hours rolling window
df.set_index('tx_datetime', inplace=True)
df['rolling_tx_amount_customer'] = df.groupby('customer_id')['tx_amount'].rolling(rolling_window).sum().reset_index(0, drop=True)
df['rolling_tx_count_customer'] = df.groupby('customer_id')['tranaction_id'].rolling(rolling_window).count().reset_index(0, drop=True)

# Reset index if needed
# df.reset_index(inplace=True)

In [74]:
from sklearn.preprocessing import StandardScaler

# Assuming X is your feature matrix containing numeric columns
numeric_columns = ['tx_amount', 'time_since_last_tx', 'avg_tx_amount_customer', 'tx_count_customer',
                   'var_tx_amount_customer', 'avg_tx_amount_terminal', 'tx_count_terminal', 'var_tx_amount_terminal']

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numeric columns
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])


# Assuming df is your DataFrame and X is your feature matrix
categorical_columns = ['tx_hour']

# Perform one-hot encoding for categorical columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Convert boolean column to binary (1/0)
df['is_weekend'] = df['is_weekend'].astype(int)



In [75]:
df

Unnamed: 0_level_0,tranaction_id,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,is_weekend,time_since_last_tx,...,tx_hour_14,tx_hour_15,tx_hour_16,tx_hour_17,tx_hour_18,tx_hour_19,tx_hour_20,tx_hour_21,tx_hour_22,tx_hour_23
tx_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-09-21 09:45:59,46988237,1,178,0.698463,2627159,30,0,0,1,,...,False,False,False,False,False,False,False,False,False,False
2019-09-21 19:33:01,46988238,2,660,-0.774481,2662381,30,0,0,1,,...,False,False,False,False,False,True,False,False,False,False
2019-09-21 18:06:19,46988239,3,732,-0.419776,2657179,30,0,0,1,,...,False,False,False,False,True,False,False,False,False,False
2019-09-21 05:34:26,46988241,10,145,1.263865,2612066,30,0,0,1,,...,False,False,False,False,False,False,False,False,False,False
2019-09-21 16:56:01,46988240,10,663,-0.843344,2652961,30,0,0,1,1.963584,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-09-21 23:09:56,48556741,999996,871,0.279727,2675396,30,0,0,1,,...,False,False,False,False,False,False,False,False,False,True
2019-09-21 02:55:48,48556743,999998,817,-0.925013,2602548,30,0,0,1,,...,False,False,False,False,False,False,False,False,False,False
2019-09-21 08:07:38,48556744,999998,639,-1.061531,2621258,30,0,0,1,0.232231,...,False,False,False,False,False,False,False,False,False,False
2019-09-21 12:15:25,48556745,999998,0,-0.954975,2636125,30,0,0,1,-0.067683,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Customer Behavior Features
# Average Transaction Amount (Customer): Calculate the average transaction amount for each customer.
# Transaction Count (Customer): Count the number of transactions for each customer within a specific time frame (e.g., last week, last month).
# Variance of Transaction Amount (Customer): Calculate the variance or standard deviation of transaction amounts for each customer.
# Transaction Frequency (Customer): Calculate the frequency of transactions for each customer.

# Terminal-Based Features
# Average Transaction Amount (Terminal): Calculate the average transaction amount for each terminal.
# Transaction Count (Terminal): Count the number of transactions for each terminal within a specific time frame.
# Variance of Transaction Amount (Terminal): Calculate the variance or standard deviation of transaction amounts for each terminal.

# Fraud-Specific Features
# Previous Fraudulent Transactions (Customer): Count the number of previous fraudulent transactions for the customer.
# Previous Fraudulent Transactions (Terminal): Count the number of previous fraudulent transactions at the terminal.

# Aggregated Features
# Rolling Transaction Amount (Customer): Calculate the sum of transaction amounts for the customer over a rolling window (e.g., last 24 hours).
# Rolling Transaction Count (Customer): Count the number of transactions for the customer over a rolling window.

# Interaction Features
# Customer-Terminal Interaction: The combination of customer_id and terminal_id to capture specific patterns for customer-terminal pairs.

# Binary Encoded Features
# tx_fraud: The binary indicator of fraud (already provided, used as the target variable in logistic regression).
# Example Code for Feature Generation
# Here's a sample code to generate some of these features:

In [78]:
df = df.dropna()

In [98]:
df

Unnamed: 0_level_0,tranaction_id,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario,is_weekend,time_since_last_tx,...,tx_hour_14,tx_hour_15,tx_hour_16,tx_hour_17,tx_hour_18,tx_hour_19,tx_hour_20,tx_hour_21,tx_hour_22,tx_hour_23
tx_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-09-21 16:56:01,46988240,10,663,-0.843344,2652961,30,0,0,1,1.963584,...,False,False,True,False,False,False,False,False,False,False
2019-09-21 12:12:51,46988242,11,337,-0.005632,2635971,30,0,0,1,-0.912718,...,False,False,False,False,False,False,False,False,False,False
2019-09-21 15:13:40,46988244,11,975,-0.618875,2646820,30,0,0,1,-0.381254,...,False,True,False,False,False,False,False,False,False,False
2019-09-21 16:47:20,46988245,12,522,0.817101,2652440,30,0,0,1,1.255122,...,False,False,True,False,False,False,False,False,False,False
2019-09-21 07:04:56,46988248,13,440,-0.263445,2617496,30,0,0,1,-1.156442,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-09-21 17:26:47,48556735,999989,717,-0.929121,2654807,30,0,0,1,-0.274025,...,False,False,False,True,False,False,False,False,False,False
2019-09-21 16:04:30,48556736,999991,717,0.714652,2649870,30,0,0,1,0.609250,...,False,False,True,False,False,False,False,False,False,False
2019-09-21 08:07:38,48556744,999998,639,-1.061531,2621258,30,0,0,1,0.232231,...,False,False,False,False,False,False,False,False,False,False
2019-09-21 12:15:25,48556745,999998,0,-0.954975,2636125,30,0,0,1,-0.067683,...,False,False,False,False,False,False,False,False,False,False


In [100]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier

# Load your dataset (assuming it's already preprocessed and features are generated)
# df = pd.read_csv('your_dataset.csv')  # Uncomment this if loading from a file

# Assuming 'df' is your DataFrame and 'tx_fraud' is the target variable
# Define feature columns (excluding target and any non-numeric columns)

feature_columns = ['tx_amount', 'is_weekend', 'time_since_last_tx', 'avg_tx_amount_customer', 'tx_count_customer',
                   'var_tx_amount_customer', 'avg_tx_amount_terminal', 'tx_count_terminal',
                   'var_tx_amount_terminal']

# Create feature matrix X and target vector y
X = df[feature_columns]
y = df['tx_fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# Initialize the logistic regression model
# model = LogisticRegression(max_iter=1000)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf_classifier.fit(X_train_rus, y_train_rus)

# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# If you want to see the coefficients of the features
# coefficients = pd.DataFrame({'Feature': feature_columns, 'Coefficient': model.coef_[0]})
# print(coefficients)


Accuracy: 0.9979496479275923
Confusion Matrix:
 [[231952    386]
 [   116  12382]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    232338
           1       0.97      0.99      0.98     12498

    accuracy                           1.00    244836
   macro avg       0.98      0.99      0.99    244836
weighted avg       1.00      1.00      1.00    244836



In [91]:
y_train_rus.value_counts()

tx_fraud
0    29142
1    29142
Name: count, dtype: int64

In [103]:
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


s3_path = 's3://otus-task-n3/2019-09-21.txt'

df = pd.read_parquet(s3_path, storage_options={
    "key": os.environ.get("S3_ID"),
    "secret": os.environ.get("S3_SECRET"),
    "client_kwargs": {"endpoint_url": "https://storage.yandexcloud.net"}
})


# Assuming df is your DataFrame
df['tx_datetime'] = pd.to_datetime(df['tx_datetime'])

# Time-based features
df['is_weekend'] = df['tx_datetime'].dt.dayofweek >= 5

# Sort by customer and transaction datetime
df = df.sort_values(by=['customer_id', 'tx_datetime'])

# Customer behavior features
df['time_since_last_tx'] = df.groupby('customer_id')['tx_datetime'].diff().dt.total_seconds()
df['avg_tx_amount_customer'] = df.groupby('customer_id')['tx_amount'].transform('mean')
df['tx_count_customer'] = df.groupby('customer_id')['tranaction_id'].transform('count')
df['var_tx_amount_customer'] = df.groupby('customer_id')['tx_amount'].transform('std')

# # Terminal-based features
df['avg_tx_amount_terminal'] = df.groupby('terminal_id')['tx_amount'].transform('mean')
df['tx_count_terminal'] = df.groupby('terminal_id')['tranaction_id'].transform('count')
df['var_tx_amount_terminal'] = df.groupby('terminal_id')['tx_amount'].transform('std')



# Assuming X is your feature matrix containing numeric columns
numeric_columns = ['tx_amount', 'time_since_last_tx', 'avg_tx_amount_customer', 'tx_count_customer',
                   'var_tx_amount_customer', 'avg_tx_amount_terminal', 'tx_count_terminal', 'var_tx_amount_terminal']

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numeric columns
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Convert boolean column to binary (1/0)
df['is_weekend'] = df['is_weekend'].astype(int)

df = df.dropna()


feature_columns = ['tx_amount', 'is_weekend', 'time_since_last_tx', 'avg_tx_amount_customer', 'tx_count_customer',
                   'var_tx_amount_customer', 'avg_tx_amount_terminal', 'tx_count_terminal',
                   'var_tx_amount_terminal']

# Create feature matrix X and target vector y
X = df[feature_columns]
y = df['tx_fraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the RF classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train it on the training data
rf_classifier.fit(X_train_rus, y_train_rus)

# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.9982355968338765
Confusion Matrix:
 [[232141    381]
 [    51  12269]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    232522
           1       0.97      1.00      0.98     12320

    accuracy                           1.00    244842
   macro avg       0.98      1.00      0.99    244842
weighted avg       1.00      1.00      1.00    244842



In [3]:
import  sys

import argparse

parser = argparse.ArgumentParser(
        description="Model (Inference Pipeline) Training")

parser.add_argument(
    "--val_frac",
    type=float,
    default = 0.2,
    help="Size of the validation split. Fraction of the dataset.",
)

# При запуске используйте оригинальное имя 'Student_Name_flights_LR_only'
parser.add_argument(
    "--output_artifact",
    default="default_run_name",
    type=str,
    help="Name for the output serialized model (Inference Artifact folder)",
    required=True,
)


sys.argv = ['train.ipynb', '--val_frac', '0.2', '--output_artifact', 'run-name']
args = parser.parse_args(sys.argv[1:])


In [4]:
args

Namespace(val_frac=0.2, output_artifact='run-name')