In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [2]:
# Calculate the average transaction amount for each user
user_avg_amt = train_df.groupby('cc_num')['amt'].mean().reset_index(name='Avg_Amt')

# Merge this average back into the original DataFrame
train_df = train_df.merge(user_avg_amt, on='cc_num')

# Calculate the relative amount feature
train_df['Relative_Amt'] = abs(train_df['amt'] - train_df['Avg_Amt']) / train_df['Avg_Amt']

print(train_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 25 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Id                     555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  float64
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Function to calculate the Haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in kilometers
    
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Difference in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    # Haversine formula
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    # Distance in kilometers
    distance = R * c
    return distance


def process(df):
    # Add new features
    date_time = pd.to_datetime(df['trans_date_trans_time'], format='%d/%m/%Y %H:%M')
    birth_date = pd.to_datetime(df['dob'], format='%d/%m/%Y')

    df['trans_hour'] = date_time.dt.hour
    df['trans_day_of_week'] = date_time.dt.dayofweek
    
    df['age'] = (date_time - birth_date).dt.days // 365
    df['trans_dist'] = haversine_distance(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

    user_avg_amt = df.groupby('cc_num')['amt'].mean().reset_index(name='Avg_Amt')
    df = df.merge(user_avg_amt, on='cc_num')
    df['Relative_Amt'] = abs(df['amt'] - df['Avg_Amt']) / df['Avg_Amt']

    df.drop(columns=['trans_date_trans_time', 'lat', 'long', 'merch_lat', 'merch_long', 'Avg_Amt'], inplace=True)

    # Identifying categorical columns
    categorical_cols = ['merchant', 'category', 'gender', 'city', 'state', 'job']

    mappings = {}

    label_encoder = LabelEncoder()
    for col in categorical_cols:
        df[col] = label_encoder.fit_transform(df[col])
        mappings[col] = {label: index for index, label in enumerate(label_encoder.classes_)}

    return df, mappings

trainingSet = pd.read_csv("./data/train.csv")
submissionSet = pd.read_csv("./data/test.csv")
train_processed, cat_map = process(trainingSet)
train_processed.drop(columns=['cc_num', 'first', 'last', 'street', 'dob', 'zip', 'trans_num', 'unix_time'], inplace=True)

# Merge on Id so that the test set can have feature columns as well
test_df= pd.merge(train_processed, submissionSet, left_on='Id', right_on='Id')
test_df = test_df.drop(columns=['is_fraud_x'])
test_df = test_df.rename(columns={'is_fraud_y': 'is_fraud'})

# The training set is where the score is not null
train_df = train_processed[train_processed['is_fraud'].notnull()]

# Save the datasets with the new features for easy access later
test_df.to_csv("./data/test_processed.csv", index=False)
train_df.to_csv("./data/train_processed.csv", index=False)


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

scaled_cols = ['amt', 'city_pop', 'trans_dist']
# Assuming 'train_df' includes both features and the target ('is_fraud')
X = train_df.drop(['is_fraud', 'Id'], axis=1)  # Dropping the target to isolate features
y = train_df['is_fraud']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [3]:
# Initialize the StandardScaler
scaler = StandardScaler()
# Fit on the training set and transform both training and validation sets
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
X_val[scaled_cols] = scaler.transform(X_val[scaled_cols])

In [5]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

param_grid_knn = {
    'knn__n_neighbors': range(1, 21),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

knn_pipeline = Pipeline([
    ('pca', PCA(n_components=0.95)),
    ('knn', KNeighborsClassifier())
])

f1_score_knn = make_scorer(f1_score)

grid_search_knn = GridSearchCV(knn_pipeline, param_grid_knn, cv=5, scoring=f1_score_knn)

grid_search_knn.fit(X_train_smote, y_train_smote)

best_params_knn = grid_search_knn.best_params_
best_score_knn = grid_search_knn.best_score_

best_knn = grid_search_knn.best_estimator_
y_pred_knn = best_knn.predict(X_val)
f1_score_knn = f1_score(y_val, y_pred_knn)
print(best_params_knn, best_score_knn, f1_score_knn)

{'knn__metric': 'manhattan', 'knn__n_neighbors': 1, 'knn__weights': 'uniform'} 0.9833914058704908 0.04595814526056627


In [31]:
import pickle
with open('knn_model.obj', 'wb') as f:
        pickle.dump(best_knn, f)
# to load pickled model: 
# with open('filename', 'rb') as f:
#    model = pickle.load(f)

In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

# Initialize an SVM model
svc = SVC(probability=True, class_weight='balanced', random_state=42)

# Define the parameter grid: 
param_grid = {
    'C': [0.1, 1, 10, 50],  # Regularization parameter
    'kernel': ['rbf', 'linear'],  # Type of hyperplane
    'gamma': [0.0001, 0.0005, 0.001, 0.005]  # Kernel coefficient
}

# Note: This grid is somewhat small for speed considerations. Expand it as needed.
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=3)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearchCV
best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# Best model
best_model = grid_search.best_estimator_

with open('svc_model.obj', 'wb') as f:
        pickle.dump(best_model, f)

# Predictions on the validation set
y_pred = best_model.predict(X_val)

# Evaluation
print(f1_score(y_val, y_pred))
print(classification_report(y_val, y_pred))


Fitting 5 folds for each of 32 candidates, totalling 160 fits


KeyboardInterrupt: 

In [6]:
print(X_train_smote.head())
print(X_train_smote.info()) 


   merchant  category       amt  gender  city  state  city_pop  job  \
0       398         7 -0.132040       1   233     11 -0.289177  463   
1       476         0 -0.142689       0    98     33  8.010544  279   
2       371        10 -0.026113       0   437     16  2.147820  138   
3       671         7  0.087644       1   723     21 -0.041744  148   
4        61         7 -0.097086       0   454     49 -0.292340  139   

   trans_hour  trans_day_of_week  age  trans_dist  Relative_Amt  
0          14                  1   46   -0.022289      0.507149  
1          22                  5   21    0.890884      0.016317  
2          22                  6   59   -0.283184      0.012243  
3          21                  6   45    1.159779      0.293297  
4          12                  1   39   -1.071097      0.086756  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775002 entries, 0 to 775001
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype  
---  ------      

In [18]:
# Assuming the prediction for 'category' was 2
predicted = 380

# Find the original category from the mapping
original_category = {v: k for k, v in cat_map['city'].items()}[predicted]
print(f"The predicted category is: {original_category}")


The predicted category is: Kenner
