In [None]:
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix

# Load the three input tables used throughout the notebook:
# the labelled training cards plus the two (pre-cleaned) transaction logs.
train = pd.read_csv("train.csv")
old_transactions = pd.read_csv("cleaned_historical_transactions.csv")
new_transactions = pd.read_csv("cleaned_new_merchant_transactions.csv")


In [None]:
# Inspect the training frame as loaded from train.csv.
train

In [None]:
# True if any cell of new_transactions is null; only this one table is checked here.
new_transactions.isna().to_numpy().any()

In [None]:
# Rows of old_transactions that exactly repeat an earlier row (an empty result means no duplicates).
old_transactions.loc[old_transactions.duplicated()]

In [None]:
# DataFrame.info() writes its report to stdout and returns None, so it must not be
# passed to print(): doing so prints the report first and then "<label>\n None".
# Print the label, then let info() emit its own output.
for label, frame in (
    ("Train", train),
    ("Old_transactions", old_transactions),
    ("New_transactions", new_transactions),
):
    print(label)
    frame.info()

In [None]:
# Tag every historical transaction with its source table.
old_transactions.loc[:, 'dataset'] = 'old'

In [None]:
# Inspect the historical transactions, now carrying the 'dataset' tag column.
old_transactions

In [None]:
# Tag every new-merchant transaction with its source table.
new_transactions.loc[:, 'dataset'] = 'new'

In [None]:
# Inspect the new-merchant transactions, now carrying the 'dataset' tag column.
new_transactions

In [None]:
# Stack both transaction logs row-wise into a single frame with a fresh 0..n-1 index.
transactions = pd.concat(
    (old_transactions, new_transactions),
    ignore_index=True,
)


In [None]:
# Inspect the combined (old + new) transaction frame.
transactions

In [None]:
# Parse purchase_date into datetimes, then derive one calendar column per component.
transactions['purchase_date'] = pd.to_datetime(transactions['purchase_date'])

purchase_dt = transactions['purchase_date'].dt
for part in ('year', 'month', 'day', 'weekday'):
    # Produces purchase_year, purchase_month, purchase_day, purchase_weekday.
    transactions[f'purchase_{part}'] = getattr(purchase_dt, part)


In [None]:
# Inspect the transactions after adding the purchase_year/month/day/weekday columns.
transactions

In [None]:
# merchant_id is not used by the aggregation below; remove it.
transactions = transactions.drop(columns=['merchant_id'])

In [None]:
# city_id is not used by the aggregation below; remove it.
transactions = transactions.drop(columns=['city_id'])

In [None]:
# state_id is not used by the aggregation below; remove it.
transactions = transactions.drop(columns=['state_id'])

In [None]:
# Inspect the transactions after dropping merchant_id, city_id and state_id.
transactions

In [None]:
# Cast card_id to str so it can serve as the group/merge key, then show the
# resulting dtype and a few sample values as a sanity check.
card_ids = transactions['card_id'].astype(str)
transactions['card_id'] = card_ids
print(card_ids.dtype)  #Should now be 'string' or 'object'
print(card_ids.head())


In [None]:
# The source-table tag is not used as a feature; remove it before aggregating.
transactions = transactions.drop(columns='dataset')

In [None]:
# Inspect the transactions after removing the 'dataset' tag column.
transactions

In [None]:
# Build one row of behavioural features per card from the combined transaction log.
# NOTE(review): summing 'authorized_flag' and 'category_1' assumes the cleaned CSVs
# already encode them numerically (e.g. 0/1) — confirm against the cleaning step.
card_groups = transactions.groupby('card_id')
agg_features = card_groups.agg({
    'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],  # Spending behavior
    'installments': ['sum', 'mean'],  # Installment usage
    'authorized_flag': ['sum'],  # Total authorized transactions
    'category_1': ['sum'],  # Transactions in category_1
    'category_2': ['nunique'],  # Unique category_2 values
    'category_3': ['nunique'],  # Unique category_3 values
    'merchant_category_id': ['nunique'],  # Unique merchant categories
    'subsector_id': ['nunique'],  # Unique economic subsectors
    'month_lag': ['min', 'max'],  # Recency of transactions
    'purchase_date': ['min', 'max'],
})

# Attach the per-card transaction count while card_id is still the index, so the
# values align by card. Assigning this card_id-indexed Series after reset_index()
# would align it against the new 0..n-1 index and produce an all-NaN column
# (and therefore NaN in every feature derived from it).
agg_features['spending_frequency'] = card_groups.size()
agg_features = agg_features.reset_index()

# NOTE: sum / count equals the 'mean' aggregate of purchase_amount computed above.
agg_features['avg_spend_per_transaction'] = (
    agg_features['purchase_amount', 'sum'] / agg_features['spending_frequency']
)
agg_features['days_between_first_last'] = (
    agg_features['purchase_date', 'max'] - agg_features['purchase_date', 'min']
).dt.days

# Cards whose transactions all fall within one day have a 0-day span; treat that as
# a 1-day window so the velocity stays finite instead of becoming inf.
agg_features['transaction_velocity'] = (
    agg_features['spending_frequency']
    / agg_features['days_between_first_last'].clip(lower=1)
)


In [None]:
# Inspect the per-card aggregate features (columns are still two-level at this point).
agg_features

In [None]:
# Relabel two aggregate groups. The mapping is applied to the column labels, so every
# statistic under 'authorized_flag' / 'installments' picks up the new prefix.
agg_features = agg_features.rename(columns={
    'authorized_flag': 'num_authorized_transactions',
    'installments': 'num_installment_transactions',
})

In [None]:
# Inspect the aggregate features after the rename.
agg_features

In [None]:
# Column dtypes of the aggregate features before the column names are flattened.
agg_features.dtypes

In [None]:
# Flatten the two-level column labels produced by the aggregation into single strings,
# e.g. ('purchase_amount', 'sum') -> 'purchase_amount_sum'. Labels whose second level
# is empty (such as ('card_id', '')) would otherwise end in a dangling underscore, so
# strip it; this yields 'card_id' directly without a follow-up rename.
agg_features.columns = [
    '_'.join(col).rstrip('_') if isinstance(col, tuple) else col
    for col in agg_features.columns
]

# card_id is already a regular column here, so no reset_index() is needed. Calling it
# again would only add a meaningless 'index' column that ends up among the model
# features after the merge, and would make this cell change its result on every re-run.
agg_features

In [None]:
# The first/last purchase timestamps were only needed to derive the day span; drop them.
agg_features = agg_features.drop(columns=['purchase_date_min', 'purchase_date_max'])


In [None]:
# Column dtypes of the aggregate features after dropping the purchase-date columns.
agg_features.dtypes

In [None]:
import pandas as pd
from datetime import datetime

# Convert first_active_month to datetime
train['first_active_month'] = pd.to_datetime(train['first_active_month'])

# Compute account age in months relative to a fixed reference point: the most recent
# purchase in the transaction data. Using datetime.today() would make this feature
# (and anything trained on it) depend on the day the notebook happens to be run.
reference_date = transactions['purchase_date'].max()
train['account_age_months'] = (
    (reference_date.year - train['first_active_month'].dt.year) * 12
    + (reference_date.month - train['first_active_month'].dt.month)
)




In [None]:
# Inspect the training frame with the new account_age_months column.
train

In [None]:
# Left-join the per-card aggregates onto the training cards, then replace every
# missing value in the result (e.g. cards with no matching aggregate row) with 0.
train = train.merge(agg_features, how='left', on='card_id').fillna(0)
train

In [None]:
# Column dtypes of the merged training frame.
train.dtypes

In [None]:
# The raw activation date is replaced by account_age_months; drop the datetime column.
train = train.drop(columns='first_active_month')

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# # 1. Select Numerical Features
# numerical_features = train.select_dtypes(include=['number']).columns.tolist()

# # 3. Create a Min-Max Scaler
# scaler = MinMaxScaler()

# # 4. Fit and Transform the Numerical Features
# train[numerical_features] = scaler.fit_transform(train[numerical_features])


In [None]:
from sklearn.model_selection import train_test_split
# Features are every column except the card identifier and the label.
X = train.drop(['card_id', 'target'], axis=1)
Y = train['target']

# Hold out 20% of the cards for final evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print(f"Training Set Size: {X_train.shape}")
print(f"Testing Set Size: {X_test.shape}")


In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

# 5-fold cross-validation of a LightGBM regressor on the training split.
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'subsample': 0.8,
    # Row subsampling is only applied when a bagging frequency is set; without it
    # the 'subsample' value above is ignored.
    'subsample_freq': 1,
}

scores = []
for train_idx, val_idx in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    lgb_train = lgb.Dataset(X_train_fold, y_train_fold)
    # reference= makes the validation set reuse the training set's feature binning.
    lgb_val = lgb.Dataset(X_val_fold, y_val_fold, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments are no longer accepted
    # by lgb.train in LightGBM 4.x.
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    # Score the held-out fold using the iteration selected by early stopping.
    y_pred = model.predict(X_val_fold, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    scores.append(rmse)

print(f"Average RMSE: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Wrap the train / hold-out splits in XGBoost's native matrix format.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.01,   # Step size per iteration
    "max_depth": 6,          # Depth of trees
}

# 5-fold CV with early stopping. Per the xgb.cv documentation, when several metrics
# are given the LAST one drives early stopping, so RMSE is listed last to keep the
# stopping criterion consistent with the squared-error objective.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=50,
    metrics=["mae", "rmse"],
    seed=42,
    verbose_eval=100
)

# Report each metric's best mean together with the std from that SAME round.
# Taking .min() of the std column independently would pair the best mean with the
# smallest spread seen in any round, which understates the uncertainty.
best_rmse_row = cv_results.loc[cv_results['test-rmse-mean'].idxmin()]
best_mae_row = cv_results.loc[cv_results['test-mae-mean'].idxmin()]

# With early stopping, cv_results is truncated at the best iteration, so its length
# is the selected number of boosting rounds.
print(f"Best number of rounds: {len(cv_results)}")
print(f"Best RMSE: {best_rmse_row['test-rmse-mean']:.4f} (+/- {best_rmse_row['test-rmse-std']:.4f})")
print(f"Best MAE: {best_mae_row['test-mae-mean']:.4f} (+/- {best_mae_row['test-mae-std']:.4f})")

# Train the final model using the best number of rounds
best_model = xgb.train(params, dtrain, num_boost_round=len(cv_results))

# Make predictions on the held-out test split
y_pred = best_model.predict(dtest)

# Evaluate model performance. RMSE is computed as sqrt(MSE), matching the LightGBM
# cell: the squared=False argument of mean_squared_error is deprecated and rejected
# by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f'Test RMSE: {rmse:.4f}')
print(f'Test MAE: {mae:.4f}')

# Rank features by XGBoost's 'weight' importance score, highest first.
feature_importance = best_model.get_score(importance_type='weight')
sorted_importance = sorted(
    feature_importance.items(),
    key=lambda item: item[1],
    reverse=True,
)
feature_names = [name for name, _ in sorted_importance]
feature_scores = [score for _, score in sorted_importance]
bar_positions = list(range(len(sorted_importance)))

# Bar chart of the ranked scores, one bar per feature with vertical tick labels.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, feature_scores)
ax.set_xticks(bar_positions)
ax.set_xticklabels(feature_names, rotation=90)
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

In [None]:
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# from sklearn.model_selection import cross_val_score

# dtrain = xgb.DMatrix(X_train, label=y_train)
# dtest = xgb.DMatrix(X_test, label=y_test)

# params= {
#     "objective":"reg:squarederror",
#     "n_estimators":1000,     # Number of trees
#     "learning_rate":0.01,   # Step size per iteration
#     "max_depth":6,          # Depth of trees
# }

# cv_results = xgb.cv(
#     params,
#     dtrain,
#     num_boost_round=1000,
#     nfold=5,
#     early_stopping_rounds=50,
#     metrics=["rmse", "mae"],
#     seed=42,
#     verbose_eval=100
# )

# # Print the best number of rounds and the corresponding scores
# print(f"Best number of rounds: {len(cv_results)}")
# print(f"Best RMSE: {cv_results['test-rmse-mean'].min():.4f} (+/- {cv_results['test-rmse-std'].min():.4f})")
# print(f"Best MAE: {cv_results['test-mae-mean'].min():.4f} (+/- {cv_results['test-mae-std'].min():.4f})")

# # Train the final model using the best number of rounds
# best_model = xgb.train(params, dtrain, num_boost_round=len(cv_results))

# # Make predictions on test data
# y_pred = best_model.predict(dtest)

# # Evaluate model performance
# rmse = mean_squared_error(y_test, y_pred, squared=False)
# mae = mean_absolute_error(y_test, y_pred)

# print(f'Test RMSE: {rmse:.4f}')
# print(f'Test MAE: {mae:.4f}')

In [None]:
# Standard deviation of the target, printed as a yardstick for the RMSE values above.
target_std = train['target'].std()
print(target_std)
