In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from tensorflow.keras.layers import BatchNormalization, LeakyReLU

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training dataset
train_df = pd.read_csv('train.csv')

# Load the test dataset
test_df = pd.read_csv('test.csv')

In [33]:
train_df.drop('SEQN', axis=1, inplace=True)
X = train_df.drop('y', axis=1)
y = train_df['y']

# Define preprocessing for numeric and categorical columns
numeric_features = X.columns.drop('district').tolist()
categorical_features = ['district']

# Define the preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(), categorical_features)
])

# Define the model architecture
def build_model(input_shape):
    dropout_rate=0.4
    model = Sequential([
        Dense(256, input_shape=(input_shape,)),
        BatchNormalization(),
        LeakyReLU(),
        Dropout(dropout_rate),  # Adjusted dropout
        Dense(128),
        BatchNormalization(),
        LeakyReLU(),
        Dropout(dropout_rate),  # Adjusted dropout
        Dense(64),
        BatchNormalization(),
        LeakyReLU(),
        Dropout(dropout_rate),  # Adjusted dropout
        Dense(32),
        BatchNormalization(),
        LeakyReLU(),
        Dropout(dropout_rate / 2),  # Adjusted dropout
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
    return model

# Prepare for k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_var = 1
results = []

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Apply preprocessing
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)

    # Build and fit the model
    model = build_model(X_train_preprocessed.shape[1])
    model.fit(
        X_train_preprocessed, y_train,
        epochs=200,
        validation_data=(X_val_preprocessed, y_val),
        callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
        verbose=0  # You can set verbose=1 if you want to see progress
    )

    # Evaluate the model
    predictions = model.predict(X_val_preprocessed)
    r2 = r2_score(y_val, predictions)
    results.append(r2)
    print(f'R2 score for fold {fold_var}: {r2}')
    fold_var += 1

# Calculate average performance across all folds
average_r2_score = np.mean(results)
print(f'Average R2 score across all folds: {average_r2_score}')

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 678us/step
R2 score for fold 1: 0.875362333614408
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 637us/step
R2 score for fold 2: 0.8767474514117136
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 550us/step
R2 score for fold 3: 0.8832353894696784
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 512us/step
R2 score for fold 4: 0.8760545733365399
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 597us/step
R2 score for fold 5: 0.8816353421383687
Average R2 score across all folds: 0.8786070179941416


# Preparing text output file for Kaggle submission

In [16]:
train_df = pd.read_csv('train.csv') 
test_df = pd.read_csv('test.csv')

train_df.drop('SEQN', axis=1, inplace=True)
test_df.drop('SEQN', axis=1, inplace=True)

X_train = train_df.drop('y', axis=1)
y_train = train_df['y']
X_test = test_df 


numeric_features = X_train.columns.drop('district').tolist()
categorical_features = ['district']

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(), categorical_features)
])


X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [17]:
# Make predictions on the test set
predictions = model.predict(X_test_preprocessed)
test_df = pd.read_csv('test.csv')  # Update the path
# Create a submission DataFrame
submission = pd.DataFrame({
    'SEQN': test_df['SEQN'],  # Assuming you need to include an identifier in your submission
    'y': predictions.flatten()
})

# Save the submission file
submission.to_csv('submit_10.csv', index=False)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 569us/step


## Testing simple(ignore)

In [29]:
df_nn = pd.read_csv("submit_6.csv")
df_lgbm = pd.read_csv("submit_2.csv")
df_nn2 = pd.read_csv("submit_5.csv")

In [30]:
nn_predictions = df_nn['y']
nn2_predictions = df_nn2['y']
gb_predictions = df_lgbm['y']
combined_predictions = 0.6 * nn_predictions + 0.1 * gb_predictions +  0.3 * nn2_predictions
test_df = pd.read_csv('test.csv')  # Update the path
# Create a submission DataFrame
submission = pd.DataFrame({
    'SEQN': test_df['SEQN'],  # Assuming you need to include an identifier in your submission
    'y': combined_predictions
})

In [31]:
submission.to_csv('submit_9.csv', index=False)