In [33]:
# Mount Google Drive (Colab only) so the files under /content/drive are readable.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Import Libraries

In [34]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor

# 2. Data Processing

In [35]:
# Single root for the competition files, so the location only has to be
# changed in one place when the notebook is run from another environment.
DATA_DIR = "/content/drive/MyDrive/Mercor Fraud Detection/mercor-fraud-detection"

# train/test hold one row per user_hash; referral_graph holds the edge list
# that the graph-based features are built from.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
graph = pd.read_csv(f"{DATA_DIR}/referral_graph.csv")

In [36]:
# Report the row and column counts of both splits.
print(f"Train\n - Number of Rows: {df_train.shape[0]}\n - Number of Columns: {df_train.shape[1]}")
print(f"Test\n - Number of Rows: {df_test.shape[0]}\n - Number of Columns: {df_test.shape[1]}")

Train
 - Number of Rows: 272819
 - Number of Columns: 21
Test
 - Number of Rows: 48416
 - Number of Columns: 19


In [37]:
# Count the missing values in every column of the training set.
print(f"Train missing value:\n{df_train.isna().sum()}")

Train missing value:
user_hash            0
feature_001      32115
feature_002      31705
feature_003      31705
feature_004       2399
feature_005       5799
feature_006       2214
feature_007       5835
feature_008       6096
feature_009       6096
feature_010       6096
feature_011       5816
feature_012      11292
feature_013       5816
feature_014       5880
feature_015          0
feature_016          0
feature_017      28769
feature_018      27635
high_conf_nf    112966
is_fraud        159853
dtype: int64


In [38]:
# Every training column except the identifier and the two label columns
# is treated as a raw feature; the original column order is kept.
non_feature_cols = {"user_hash", "is_fraud", "high_conf_nf"}
feature_cols = [name for name in df_train.columns if name not in non_feature_cols]

## 2.1 Feature Engineering

In [39]:
# Stack the raw features of train and test, keep the first row seen for each
# user_hash, and index the result by user_hash for later lookups.
id_and_features = ["user_hash"] + feature_cols
stacked_users = pd.concat([df_train[id_and_features], df_test[id_and_features]])
all_users_feat = stacked_users.drop_duplicates("user_hash").set_index("user_hash")

In [40]:
# Name the two edge-list columns source/target; later cells refer to them by these names.
graph.columns = ["source", "target"]

In [41]:
# Build the reversed copy of every edge (source <-> target swapped, columns
# put back in the original order) and append it to the original edge list,
# so each edge appears once per direction.
swap_endpoints = {"source": "target", "target": "source"}
rev_graph = graph.rename(columns=swap_endpoints).loc[:, graph.columns]
full_edges = pd.concat([graph, rev_graph], ignore_index=True)

In [42]:
# Attach the raw features of each edge's target user to the edge row.
# The left join keeps edges whose target has no feature row (they get NaN).
full_edges = pd.merge(
    full_edges,
    all_users_feat,
    how="left",
    left_on="target",
    right_index=True,
)

In [43]:
# Per source user: mean and standard deviation of every raw feature over
# the users it is connected to (the targets of its edges).
edges_by_source = full_edges.groupby("source")
agg_stats = edges_by_source[feature_cols].agg(["mean", "std"])

In [44]:
# Flatten the (feature, statistic) column pairs into single names such as
# "feature_001_mean".
flat_names = ['_'.join(level_pair).strip() for level_pair in agg_stats.columns.values]
agg_stats.columns = flat_names

In [45]:
# Join the neighbour statistics onto both splits by user_hash. The left join
# keeps every user; users absent from agg_stats get NaN in the new columns.
neighbor_merge_kwargs = dict(left_on="user_hash", right_index=True, how="left")

df_train = df_train.merge(agg_stats, **neighbor_merge_kwargs)
df_test = df_test.merge(agg_stats, **neighbor_merge_kwargs)

In [46]:
# Names of all neighbour-statistic columns that were just merged in.
neighbor_cols = agg_stats.columns.tolist()

In [47]:
# Replace missing neighbour statistics (e.g. users with no row in agg_stats
# after the left merge) with 0 in both splits.
for frame in (df_train, df_test):
  frame[neighbor_cols] = frame[neighbor_cols].fillna(0)

In [48]:
# For every raw feature that has a neighbour-mean column, compare the user's
# own value with that mean in two ways: a ratio (1e-5 is added to the
# denominator so a mean of exactly 0 does not divide by zero) and a difference.
for col in feature_cols:
  mean_col = f"{col}_mean"
  if mean_col not in df_train.columns:
    continue
  for frame in (df_train, df_test):
    frame[f"{col}_ratio"] = frame[col]/(frame[mean_col] + 1e-5)
    frame[f"{col}_diff"] = frame[col] - frame[mean_col]

In [49]:
# Collect the names of the ratio/diff columns created from the neighbour means.
neighbor_cols_new = [name for name in df_train.columns if name.endswith(("_ratio", "_diff"))]

### 2.1.1 Feature Engineering: Graph Topology

In [50]:
# Build an undirected graph from the edge list.
G = nx.from_pandas_edgelist(
    graph,
    source="source",
    target="target",
    create_using=nx.Graph(),
)

In [51]:
# node -> number of edges incident to it in G.
degree_map = {node: deg for node, deg in G.degree()}

In [52]:
# PageRank score per node. max_iter=30 with tol=1e-4 is a small budget, so
# the power iteration may not converge; only that specific failure triggers
# the all-zero fallback. Any other error (bad graph, typo, interrupt) is
# allowed to surface instead of silently zeroing the feature.
try:
  pagerank_map = nx.pagerank(G, alpha = 0.85, max_iter = 30, tol = 1e-4)
except nx.PowerIterationFailedConvergence as err:
  print(f"PageRank did not converge ({err}); using 0.0 for every node")
  pagerank_map = {n: 0.0 for n in G.nodes()}

In [53]:
# node -> size of the connected component that contains it.
component_map = {
  member: len(members)
  for members in nx.connected_components(G)
  for member in members
}

In [54]:
# Add the graph-topology columns to both splits. Users that do not appear in
# the graph get the listed default (degree 0, pagerank 0, component size 1).
# Degree and component size are then log1p-transformed.
topology_lookups = (
  ("degree", degree_map, 0),
  ("pagerank", pagerank_map, 0),
  ("component_size", component_map, 1),
)

for df in (df_train, df_test):
  user_ids = df['user_hash']
  for feat_name, lookup, default in topology_lookups:
    df[feat_name] = user_ids.map(lookup).fillna(default)

  for feat_name in ('degree', 'component_size'):
    df[feat_name] = np.log1p(df[feat_name])

## 2.2 Filling Missing Values

In [55]:
# Narrow view of the training set: identifier, raw features and the two label columns.
full_train = df_train[["user_hash", *feature_cols, "high_conf_nf", "is_fraud"]]

In [56]:
# Copy of full_train with one 0/1 "is_missing_<feature>" indicator column
# appended per raw feature (1 where the feature value is null).
full_train_addIndi = full_train.assign(**{
    f"is_missing_{col}": full_train[col].isnull().astype(int)
    for col in feature_cols
})

In [57]:
# Copy of df_test with one 0/1 "is_missing_<feature>" indicator column
# appended per raw feature (1 where the feature value is null).
df_test_addIndi = df_test.assign(**{
    f"is_missing_{col}": df_test[col].isnull().astype(int)
    for col in feature_cols
})

In [26]:
# Impute the raw feature columns. full_train_addIndi only contains the raw
# features, the labels and the is_missing_* indicators, so the prefix filter
# selects exactly the raw features.
# NOTE: this cell must run before the drop/merge cells that follow; they copy
# full_train_addIndi back into df_train, so skipping it leaves the NaNs in place.
cols_to_impute = [c for c in full_train_addIndi.columns if c.startswith('feature_')]

# Iterative imputation with a shallow decision tree as the per-column
# regressor; seeds are fixed so repeated runs give the same fill values.
imp_model = DecisionTreeRegressor(max_depth = 5, random_state=42)
imputer_train = IterativeImputer(estimator=imp_model, max_iter=10, random_state=42)
full_train_addIndi[cols_to_impute] = imputer_train.fit_transform(full_train_addIndi[cols_to_impute].astype('float32'))

# Apply the train-fitted imputer to the test split as well, so both splits go
# through the same preprocessing. The imputer is fitted on train only, so no
# information from the test rows influences the fill values.
df_test_addIndi[cols_to_impute] = imputer_train.transform(df_test_addIndi[cols_to_impute].astype('float32'))

In [58]:
# Remove from df_train every column (except the user_hash join key) that
# full_train_addIndi also carries, so the merge that follows does not
# produce duplicated columns.
replaced_cols = set(full_train_addIndi.columns) - {"user_hash"}
df_train = df_train.drop(
    columns=[name for name in df_train.columns if name in replaced_cols]
)

In [59]:
# Bring the processed feature, label and indicator columns back into
# df_train, matched on user_hash.
df_train = pd.merge(
    df_train,
    full_train_addIndi,
    how="left",
    on="user_hash",
)

In [60]:
# All df_train columns except the ratio/diff features and the two label columns.
excluded_cols = set(neighbor_cols_new) | {"is_fraud", "high_conf_nf"}
not_missing_col = [name for name in df_train.columns if name not in excluded_cols]

In [61]:
# Total number of NaNs left in the selected columns (presumably expected to
# be 0, judging by the list name). A non-zero value means some of these
# columns still contain missing values.
df_train[not_missing_col].isna().sum().sum()

np.int64(215268)

In [62]:
# Sanity check: (rows, columns) of the final train and test frames.
df_train.shape, df_test_addIndi.shape

((272819, 114), (48416, 112))

In [63]:
# Write the engineered train/test tables as CSV files (relative paths, so they
# land in the current working directory); the DataFrame index is not written.
df_train.to_csv("Graph_train.csv", index = False)
df_test_addIndi.to_csv("Graph_test.csv", index = False)