## Tecnologias Utilizadas

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Depois das etapas de modelação e avaliação do modelo, foi criado este ficheiro com o objetivo de correr o código necessário para a submissão no **Kaggle**. Foi agregado o código de preparação dos dados usado no treino e foram criadas as colunas que existiam no treino mas que não são geradas para o teste (**device_os_Linux** e **age_category_Senior**), pois nem o dispositivo **Linux** nem a categoria de idade **Senior** ocorrem nos dados de teste.

In [None]:
# --- Raw tables for the Kaggle test split ----------------------------------
transactions = pd.read_csv('CreditCardTransactions/test_transactions.csv')
customers = pd.read_csv('CreditCardTransactions/customers.csv')
cities = pd.read_csv('CreditCardTransactions/cities.csv')
merchants = pd.read_csv('CreditCardTransactions/merchants.csv')

# Left joins: every row of the left-hand table is kept even when the lookup
# table has no match (the joined columns are then NaN).
customers_cities = customers.merge(cities, on='city', how='left')
transactions_customers = transactions.merge(customers_cities, on='cc_num', how='left')
final_data = transactions_customers.merge(merchants, on='merchant', how='left')

# Persist the merged table, then work from the copy read back from disk.
final_data.to_csv('CreditCardTransactions/merged_teste_dataset.csv', index=False)
data = pd.read_csv('CreditCardTransactions/merged_teste_dataset.csv')

# --- Artefacts produced by the training notebook ---------------------------
# unscaled_data  -> used further down to fit the MinMaxScaler
# imputed_data   -> used further down to derive mean/mode fill values
# training_*     -> used to fit the models
unscaled_data = pd.read_csv('CreditCardTransactions/unscaled_data.csv')
imputed_data = pd.read_csv('CreditCardTransactions/imputed_data.csv')
training_inputs = pd.read_csv('CreditCardTransactions/training_dataset.csv')
training_classes = pd.read_csv('CreditCardTransactions/training_classes.csv')

In [None]:
# Parse the date-of-birth column into datetimes so that categorize_age can
# read .year / .month / .day from each value.
data['dob'] = pd.to_datetime(data['dob'])
def categorize_age(dob, today=None):
    """Map a date of birth to one of four age-bucket labels.

    The age is the number of full years between ``dob`` and ``today``
    (one year is subtracted when the birthday has not yet occurred in the
    reference year).

    Args:
        dob: Object exposing ``year``, ``month`` and ``day`` attributes
            (e.g. ``datetime`` or ``pandas.Timestamp``).
        today: Reference date used to compute the age. Defaults to
            ``datetime.today()``; pass a fixed date to get labels that do
            not depend on when the code is executed.

    Returns:
        ``'Young Adult'`` for ages 17-36, ``'Adult'`` for 37-55,
        ``'Middle Age'`` for 56-74 and ``'Senior'`` for everything else.
    """
    if today is None:
        today = datetime.today()

    # True (== 1) when this year's birthday is still ahead of the reference date.
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    age = today.year - dob.year - birthday_pending

    if 17 <= age <= 36:
        return 'Young Adult'
    elif 37 <= age <= 55:
        return 'Adult'
    elif 56 <= age <= 74:
        return 'Middle Age'
    else:
        # NOTE: this branch also catches ages below 17 (and values for which
        # every comparison above is False), not only ages of 75 and above.
        return 'Senior'
# Bucket every row by age, then discard the raw date of birth.
age_labels = data['dob'].apply(categorize_age)
data['age_category'] = age_labels
data = data.drop(columns=['dob'])

# Order rows by transaction time so the running counter below follows time.
data = data.sort_values(by='unix_time')

# 1-based running count of transactions for each (card, merchant) pair:
# cumcount() numbers the rows of each group from 0, hence the +1.
pair_groups = data.groupby(['cc_num', 'merchant'])['unix_time']
data['transactions_count'] = pair_groups.cumcount() + 1

# Running number of distinct merchants seen so far, walking the rows in their
# current order. The count is over the whole table, not per card.
unique_merchants = set()
unique_merchants_count = []
distinct_so_far = 0
for merchant in data['merchant']:
    if merchant not in unique_merchants:
        unique_merchants.add(merchant)
        distinct_so_far += 1
    unique_merchants_count.append(distinct_so_far)

data['unique_merchants_count'] = unique_merchants_count
# The merchant identifier itself is not kept as a feature.
data = data.drop(columns=['merchant'])

# Convert the epoch seconds to datetimes and order rows per card by time.
timestamps = pd.to_datetime(data['unix_time'], unit='s')
data = data.assign(datetime=timestamps).sort_values(by=['cc_num', 'datetime'])

# Seconds elapsed since the previous transaction of the same card; the first
# transaction of each card has no predecessor and gets 0.
gap_to_previous = data.groupby('cc_num')['datetime'].diff()
data['time_since_last'] = gap_to_previous.dt.total_seconds().fillna(0)

# Hour of day (consumed by categorize_period below).
data['hour'] = data['datetime'].dt.hour
def categorize_period(hour):
    """Return the period-of-day label for an hour value.

    Half-open bands: [0, 8) -> 'Madrugada', [8, 13) -> 'Manhã',
    [13, 18) -> 'Tarde'. Any value outside those bands yields 'Noite'.
    """
    bands = (
        (0, 8, 'Madrugada'),
        (8, 13, 'Manhã'),
        (13, 18, 'Tarde'),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    return 'Noite'
# Period-of-day and weekday labels are only needed to derive the cyclic
# encodings, so they are held in temporary Series instead of columns.
period_labels = data['hour'].apply(categorize_period)
weekday_names = data['datetime'].dt.day_name()

# Ordinal position of each label on its cycle (week starts on Sunday).
day_mapping = {'Sunday': 0, 'Monday': 1, 'Tuesday': 2, 'Wednesday': 3,
               'Thursday': 4, 'Friday': 5, 'Saturday': 6}
period_mapping = {'Madrugada': 0, 'Manhã': 1, 'Tarde': 2, 'Noite': 3}

# Sine/cosine of the position on the circle: 7 weekdays, 4 periods.
weekday_angle = 2 * np.pi * weekday_names.map(day_mapping) / 7
data['day_of_week_sin'] = np.sin(weekday_angle)
data['day_of_week_cos'] = np.cos(weekday_angle)

period_angle = 2 * np.pi * period_labels.map(period_mapping) / 4
data['period_of_day_sin'] = np.sin(period_angle)
data['period_of_day_cos'] = np.cos(period_angle)

# Raw time columns and the card number are not kept as features.
data = data.drop(columns=['hour', 'datetime', 'unix_time', 'cc_num'])

# Set the row identifiers aside (in the current row order) for the submission
# files, then remove them and the other columns not used as features.
indexes = data.loc[:, ['index']].copy()
columns_to_discard = [
    'index', 'first', 'last', 'lat', 'long', 'merch_lat', 'merch_long',
    'merchant_id', 'trans_date_trans_time', 'street', 'zip', 'state',
    'city_pop', 'trans_num',
]
data = data.drop(columns=columns_to_discard)

# Min-max scaling: the scaler is fitted on the training-side table only and
# the resulting ranges are then applied to the test data.
numeric_columns = unscaled_data.select_dtypes(include=['float64', 'int64']).columns
scaler = MinMaxScaler()
scaler.fit(unscaled_data[numeric_columns])
unscaled_data[numeric_columns] = scaler.transform(unscaled_data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

# Fill gaps in the test data with statistics taken from `imputed_data`:
# column mean for numeric columns, most frequent value for categorical ones.
numerical_cols = imputed_data.select_dtypes(include=['number']).columns
categorical_cols = imputed_data.select_dtypes(include=['object', 'category']).columns

mean_values = imputed_data[numerical_cols].mean()
# mode() can return several rows on ties; the first row is used.
mode_values = imputed_data[categorical_cols].mode().iloc[0]

for cols, fill_values in ((numerical_cols, mean_values), (categorical_cols, mode_values)):
    data[cols] = data[cols].fillna(fill_values)

# Flag rows whose city is anything other than the 'Test City' placeholder.
data['city_has_info'] = data['city'].ne('Test City').astype('int64')

# One-hot encode the categorical features. 'gender' drops its first level;
# the remaining columns keep one indicator per level.
categorical_columns1 = ['gender']
categorical_columns2 = ['device_os', 'category', 'age_category', 'job', 'city']
for columns, drop_reference in ((categorical_columns1, True), (categorical_columns2, False)):
    data = pd.get_dummies(data, columns=columns, drop_first=drop_reference, dtype=int)

# Align the test matrix with the exact feature layout the models are fitted on.
#
# Some one-hot columns present in training are never produced for the test
# data (e.g. device_os_Linux, age_category_Senior). Instead of inserting them
# at hard-coded positions, reindex against the training columns: columns
# missing from the test set are created filled with 0, columns that exist only
# in the test set are dropped, and the order matches training. A column that
# the test data does contain keeps its real values rather than being
# overwritten with 0.
expected_columns = training_inputs.columns
missing_in_test = [col for col in expected_columns if col not in data.columns]
unseen_in_training = [col for col in data.columns if col not in expected_columns]

# Report what was changed so an unexpected gap (e.g. a missing numeric
# feature) is visible rather than silently zero-filled.
print('Columns added to the test set as all-zero:', missing_in_test)
print('Test-only columns dropped:', unseen_in_training)

testing_inputs = data.reindex(columns=expected_columns, fill_value=0)
colunas = testing_inputs.columns.tolist()

In [None]:
import os

# Make sure the output folder exists before any training starts, so the run
# cannot fail at the very last step when the CSV is written.
os.makedirs('tentativas', exist_ok=True)

# SMOTE-Tomek resampling followed by a decision tree with default settings.
pipeline = Pipeline([
    ('smote_tomek', SMOTETomek(random_state=42, sampling_strategy=0.4)),
    ('decision_tree', DecisionTreeClassifier(random_state=42))
])

sf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Local 10-fold ROC AUC estimate; printed so it can be compared with the
# score returned by Kaggle instead of being computed and discarded.
cv_scores = cross_val_score(pipeline, training_inputs, training_classes, cv=sf, scoring='roc_auc')
print(f'decision_tree - CV ROC AUC: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})')

# Refit on the full training set before predicting the test set.
pipeline.fit(training_inputs, training_classes)

predictions = pipeline.predict(testing_inputs)
# Probability of the second class in classes_ (presumably the fraud label - confirm).
probas = pipeline.predict_proba(testing_inputs)[:, 1]

submission = pd.DataFrame({
    'index': indexes['index'],
    'is_fraud': probas
})

submission.to_csv('tentativas/teste1.csv', index=False)

In [None]:
import os

# Make sure the output folder exists before any training starts, so the run
# cannot fail at the very last step when the CSV is written.
os.makedirs('tentativas', exist_ok=True)

# SMOTE-Tomek resampling followed by an SVM; probability=True is required
# for predict_proba to be available.
pipeline = Pipeline([
    ('smote_tomek', SMOTETomek(random_state=42, sampling_strategy=0.4)),
    ('svm', SVC(probability=True, random_state=42))
])

sf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Local 10-fold ROC AUC estimate; printed so it can be compared with the
# score returned by Kaggle instead of being computed and discarded.
cv_scores = cross_val_score(pipeline, training_inputs, training_classes, cv=sf, scoring='roc_auc')
print(f'svm - CV ROC AUC: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})')

# Refit on the full training set before predicting the test set.
pipeline.fit(training_inputs, training_classes)

predictions = pipeline.predict(testing_inputs)
# Probability of the second class in classes_ (presumably the fraud label - confirm).
probas = pipeline.predict_proba(testing_inputs)[:, 1]

submission = pd.DataFrame({
    'index': indexes['index'],
    'is_fraud': probas
})

submission.to_csv('tentativas/teste2.csv', index=False)

In [None]:
import os

# Make sure the output folder exists before the grid search starts, so the
# run cannot fail at the very last step when the CSV is written.
os.makedirs('tentativas', exist_ok=True)

# SMOTE-Tomek resampling followed by logistic regression.
pipeline = Pipeline([
    ('smote_tomek', SMOTETomek(random_state=42, sampling_strategy=0.4)),
    ('logistic_regression', LogisticRegression(random_state=42, max_iter=1000))
])

# Keys are '<pipeline step name>__<estimator parameter>'.
param_grid = {
    'logistic_regression__penalty': ['l1', 'l2'],
    'logistic_regression__C': [0.1, 1.0],
    'logistic_regression__solver': ['liblinear', 'saga']
}

sf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=sf,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(training_inputs, training_classes)
# Report the selected configuration and its cross-validated score so the
# local result can be compared with the score returned by Kaggle.
print(f'logistic_regression - best params: {grid_search.best_params_}')
print(f'logistic_regression - best CV ROC AUC: {grid_search.best_score_:.5f}')

# best_estimator_ is the pipeline refitted on the full training set.
best_pipeline = grid_search.best_estimator_

predictions = best_pipeline.predict(testing_inputs)
# Probability of the second class in classes_ (presumably the fraud label - confirm).
probas = best_pipeline.predict_proba(testing_inputs)[:, 1]

submission = pd.DataFrame({
    'index': indexes['index'],
    'is_fraud': probas
})

submission.to_csv('tentativas/teste3.csv', index=False)

In [None]:
import os

# Make sure the output folder exists before any training starts, so the run
# cannot fail at the very last step when the CSV is written.
os.makedirs('tentativas', exist_ok=True)

# SMOTE-Tomek resampling followed by a multi-layer perceptron.
pipeline = Pipeline([
    ('smote_tomek', SMOTETomek(random_state=42, sampling_strategy=0.4)),
    ('neural_network', MLPClassifier(max_iter=1000, random_state=42))
])

sf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Local 10-fold ROC AUC estimate; printed so it can be compared with the
# score returned by Kaggle instead of being computed and discarded.
cv_scores = cross_val_score(pipeline, training_inputs, training_classes, cv=sf, scoring='roc_auc')
print(f'neural_network - CV ROC AUC: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})')

# Refit on the full training set before predicting the test set.
pipeline.fit(training_inputs, training_classes)

predictions = pipeline.predict(testing_inputs)
# Probability of the second class in classes_ (presumably the fraud label - confirm).
probas = pipeline.predict_proba(testing_inputs)[:, 1]

submission = pd.DataFrame({
    'index': indexes['index'],
    'is_fraud': probas
})

submission.to_csv('tentativas/teste4.csv', index=False)

In [None]:
import os

# Make sure the output folder exists before the grid search starts, so the
# run cannot fail at the very last step when the CSV is written.
os.makedirs('tentativas', exist_ok=True)

random_forest = RandomForestClassifier(random_state=42)

# Keys are '<pipeline step name>__<estimator parameter>'; the forest is
# registered under the step name 'model' below.
random_forest_parameter_grid = {
    'model__max_features': ['log2', 'sqrt'],
    'model__criterion': ['gini', 'entropy']
}

# SMOTE-Tomek resampling followed by the random forest.
pipeline = Pipeline([
    ('smote', SMOTETomek(random_state=42, sampling_strategy=0.4)),
    ('model', random_forest)
])

sf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
random_forest_grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=random_forest_parameter_grid,
    scoring='roc_auc',
    cv=sf,
    n_jobs=-1
)

random_forest_grid_search.fit(training_inputs, training_classes)
# Report the selected configuration and its cross-validated score so the
# local result can be compared with the score returned by Kaggle.
print(f'random_forest - best params: {random_forest_grid_search.best_params_}')
print(f'random_forest - best CV ROC AUC: {random_forest_grid_search.best_score_:.5f}')

# best_estimator_ is the pipeline refitted on the full training set.
best_pipeline = random_forest_grid_search.best_estimator_

predictions = best_pipeline.predict(testing_inputs)
# Probability of the second class in classes_ (presumably the fraud label - confirm).
probas = best_pipeline.predict_proba(testing_inputs)[:, 1]

submission = pd.DataFrame({
    'index': indexes['index'],
    'is_fraud': probas
})

submission.to_csv('tentativas/teste5.csv', index=False)

Optei por executar todos os modelos que implementei na fase anterior diretamente no **Kaggle**, descartando os resultados obtidos localmente. Essa decisão mostrou-se acertada, uma vez que as pontuações obtidas localmente nem sempre correspondiam às obtidas no **Kaggle**.

Abaixo, encontra-se uma tabela que expõe o nome do teste, o modelo utilizado e a pontuação obtida no **Kaggle**.
O código para chegar a estes resultados está nos blocos de código acima.

| Nome do Teste | Modelo Utilizado                   | Valor Obtido |
|---------------|------------------------------------|--------------|
| teste1.csv    | Árvore de Decisão Sem GridSearch   | 0.50345      |
| teste2.csv    | SVM Sem GridSearch                 | 0.54154      |
| teste3.csv    | Logistic Regression Com GridSearch | 0.48420      |
| teste4.csv    | Neural Network Sem GridSearch      | 0.63801      |
| teste5.csv    | Random Forest Com GridSearch       | 0.59903      |