In [57]:
import numpy as np
import pandas as pd

In [58]:
import numpy as np
from collections import Counter

class KNN:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.
    distance_metric : str, default 'euclidean'
        Name of the distance function. 'euclidean' and 'manhattan' are
        supported; any other value makes ``compute_distance`` (and therefore
        ``predict``) raise ``ValueError``.
    """

    # Number of query rows processed per distance computation in ``predict``.
    # The broadcasted difference has shape (batch, n_train, n_features), so
    # batching bounds peak memory without changing any result.
    _BATCH_SIZE = 256

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Store the training samples and labels as numpy arrays."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def compute_distance(self, X1, X2):
        """Return the pairwise distance matrix between rows of X1 and X2.

        Parameters
        ----------
        X1 : ndarray of shape (n1, d)
        X2 : ndarray of shape (n2, d)

        Returns
        -------
        ndarray of shape (n1, n2)

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric name.
        """
        if self.distance_metric not in ('euclidean', 'manhattan'):
            # Fail loudly instead of returning None, which would only surface
            # later as an unrelated-looking error inside np.argsort.
            raise ValueError(
                f"Unsupported distance_metric {self.distance_metric!r}; "
                "expected 'euclidean' or 'manhattan'")

        # Broadcasting (n1, 1, d) against (n2, d) yields every pairwise difference.
        diff = X1[:, np.newaxis, :] - X2
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=2))
        return np.sum(np.abs(diff), axis=2)

    def predict(self, X):
        """Predict a label for every row of X by majority vote of the k nearest
        training samples. Ties are resolved in favour of the label met first
        when walking the neighbours from nearest to farthest.

        Returns
        -------
        ndarray of shape (len(X),)
        """
        X = np.array(X)
        predictions = []
        for start in range(0, len(X), self._BATCH_SIZE):
            batch = X[start:start + self._BATCH_SIZE]
            distances = self.compute_distance(batch, self.X_train)
            nearest = np.argsort(distances, axis=1)[:, :self.k]
            neighbour_labels = self.y_train[nearest]
            predictions.extend(
                Counter(row).most_common(1)[0][0] for row in neighbour_labels)
        return np.array(predictions)


In [59]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# Input CSV locations, relative to the notebook's working directory
train_path, test_path = 'train.csv', 'test.csv'

def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and convert them into model-ready matrices.

    The 'Exited' column of the training file is the target. The 'id' and
    'Surname' columns are dropped from both files. Numeric columns are
    median-imputed and standardised; object (string) columns are imputed with
    the constant 'missing' and one-hot encoded, with categories unseen during
    fitting encoded as all zeros. All transformers are fitted on the training
    data only and then applied to the test data.

    Parameters
    ----------
    train_path : str or path-like
        CSV containing the features plus the 'Exited', 'id' and 'Surname' columns.
    test_path : str or path-like
        CSV containing the same feature columns plus 'id' and 'Surname'.

    Returns
    -------
    X_train : ndarray
        Dense transformed training features.
    y_train : pandas.Series
        The 'Exited' column of the training file.
    X_test : ndarray
        Dense transformed test features.
    """
    # Load the data
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate features and labels
    # NOTE(review): other identifier-like numeric columns (e.g. a customer id)
    # are kept as features here -- confirm that is intended.
    y_train = train_data['Exited']
    X_train = train_data.drop(columns=['Exited', 'id', 'Surname'])
    X_test = test_data.drop(columns=['id', 'Surname'])

    # Numeric and categorical columns. 'number' covers every numeric dtype
    # (int32, float32, ...), not only the 64-bit ones read_csv usually yields.
    numeric_cols = X_train.select_dtypes(include='number').columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Preprocessing for numerical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Bundle preprocessing for numerical and categorical data.
    # sparse_threshold=0 forces a dense ndarray: the one-hot block is sparse,
    # and a sparse result cannot be consumed by code that calls np.array(X).
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)],
        sparse_threshold=0)

    # Fit on the training data only, then apply the same transform to the test data
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, y_train, X_test


In [63]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Locations of the raw CSV files, relative to the notebook's working directory
train_path = 'train.csv'
test_path = 'test.csv'

# Load the raw frames at module level so the feature-engineering calls below
# have real inputs. Do NOT wrap these reads in a function called
# `preprocess_data`: that name belongs to the full preprocessing pipeline
# defined earlier in the notebook, and redefining it here would shadow it with
# something that returns None.
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)


def feature_engineering(data, scaler=None, fit=True):
    """Add interaction features and min-max scale the numeric columns.

    Two columns are added:
    'Balance_per_Product' = Balance / (NumOfProducts + 1) and
    'Age*CreditScore' = Age * CreditScore.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'CreditScore', 'Age', 'Tenure', 'Balance',
        'NumOfProducts' and 'EstimatedSalary'. It is not modified.
    scaler : MinMaxScaler or None, default None
        Scaler to use. When None a fresh scaler is created and fitted on
        `data`. Pass the same instance for train and test so both are scaled
        with the training set's min/max.
    fit : bool, default True
        When True the scaler is fitted on `data` before transforming; when
        False an already-fitted `scaler` is only applied. Ignored (treated as
        True) when `scaler` is None.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` with the new columns added and the numeric columns
        scaled. With a scaler fitted elsewhere, values may fall outside [0, 1].
    """
    # Work on a copy so the caller's frame is untouched and re-running is idempotent
    data = data.copy()

    # New features based on interactions between existing columns
    data['Balance_per_Product'] = data['Balance'] / (data['NumOfProducts'] + 1)
    data['Age*CreditScore'] = data['Age'] * data['CreditScore']

    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary', 'Balance_per_Product', 'Age*CreditScore']

    if scaler is None:
        # No scaler supplied: fit a fresh one on this frame
        scaler = MinMaxScaler()
        fit = True

    if fit:
        data[numerical_features] = scaler.fit_transform(data[numerical_features])
    else:
        data[numerical_features] = scaler.transform(data[numerical_features])

    return data


# Fit the scaler on the training data only and reuse it for the test data, so
# both sets share one scaling and no test-set statistics leak into the features.
feature_scaler = MinMaxScaler()
train_data = feature_engineering(train_data, scaler=feature_scaler, fit=True)
test_data = feature_engineering(test_data, scaler=feature_scaler, fit=False)


In [61]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def _take_rows(data, indices):
    """Select rows by position from a pandas object, sequence, array or sparse matrix."""
    if hasattr(data, 'iloc'):
        # pandas: plain [] on a Series is label-based, .iloc is always positional
        return data.iloc[indices]
    if isinstance(data, (list, tuple)):
        return np.asarray(data)[indices]
    return data[indices]


def cross_validate(X, y, model, n_splits=5):
    """Return the mean ROC AUC of `model` over a shuffled K-fold split.

    For every fold the model is refitted on the training part and scored on
    the held-out part. The score is computed from the output of
    ``model.predict`` (hard labels for a classifier that returns labels), not
    from probabilities.

    Parameters
    ----------
    X : array-like, DataFrame or sparse matrix of shape (n_samples, n_features)
    y : array-like or Series of shape (n_samples,)
    model : object
        Anything exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so it is left fitted on the last fold's training part.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores.
    """
    # Fixed seed so the folds, and therefore the score, are reproducible
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    scores = []

    for train_index, test_index in kf.split(X):
        # KFold yields positional indices, so rows must be taken by position
        X_train, X_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        scores.append(roc_auc_score(y_test, predictions))

    return np.mean(scores)


In [62]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create the model (k and the metric are fixed here; no hyperparameter search is run)
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation; cross_validate returns a single number,
# the mean ROC AUC over the folds
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# Refit on the full training set and predict the test set
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# preprocess_data drops the 'id' column, so re-read the test file to recover
# the row identifiers. Only 'id' is needed for the submission.
test_data = pd.read_csv('test.csv')
test_ids = test_data[['id']]

# Create the submission DataFrame: one predicted 'Exited' label per test 'id'
submission = pd.DataFrame({
    'id': test_ids['id'],
    'Exited': test_predictions
})
submission.to_csv('submissions.csv', index=False)


Cross-validation scores: 0.7727432473051843
