In [1]:
# Importing Required Libraries
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the data
# NOTE: the /content/ paths point at the local disk of a Colab session.
train_data = pd.read_csv("/content/RevisedHomesiteTrain1.csv")
test_data = pd.read_csv("/content/RevisedHomesiteTest1.csv")


# Explore data
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (doing so emitted a stray "None" line).
train_data.info()
print(train_data.head())



Train data shape: (65000, 596)
Test data shape: (173836, 596)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65000 entries, 0 to 64999
Columns: 596 entries, CoverageField11A to QuoteConversion_Flag
dtypes: float64(5), int64(591)
memory usage: 295.6 MB
None
   CoverageField11A  CoverageField11B  CoverageField1A  CoverageField1B  \
0                 2                 1               17               23   
1                 5                 9                6                8   
2                 4                 6                7               12   
3                15                23                3                2   
4                 4                 6                8               13   

   CoverageField2A  CoverageField2B  CoverageField3A  CoverageField3B  \
0               17               23               15               22   
1                6                8                5                7   
2                7               12                6               10 

In [None]:
# Second, independent read of the test CSV under its own name; the bare
# last expression displays the resulting (rows, columns) tuple.
sample_data = pd.read_csv(filepath_or_buffer='/content/RevisedHomesiteTest1.csv')
sample_data.shape

(173836, 596)

In [3]:
# Split the training frame into the label series (y) and the remaining
# columns (X). train_data itself is left unmodified.
y = train_data['QuoteConversion_Flag']
X = train_data.drop('QuoteConversion_Flag', axis=1)

In [None]:
# Count how many rows fall into each target class to gauge the imbalance.
class_counts = y.value_counts()
print("Class distribution in target:\n", class_counts)


Class distribution in target:
 QuoteConversion_Flag
0    52738
1    12262
Name: count, dtype: int64


In [4]:
# Splitting the data
# Rows whose target is missing cannot be used for supervised training, and
# filling them with the most frequent class would invent labels. Exclude those
# rows from X and y through one shared mask so features and labels stay aligned
# (dropping NaNs from y alone would leave X with extra rows).
has_label = y.notna()

# Stratified 80/20 split with a fixed seed so the class ratio is preserved in
# both partitions and the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X[has_label],
    y[has_label],
    test_size=0.2,
    random_state=42,
    stratify=y[has_label],
)

In [6]:
# Create the feature scaler (StandardScaler is imported in the first cell).
# It is deliberately left unfitted here: a later cell calls
# scaler.fit_transform() on the training feature columns, which would discard
# any fit performed at this point.
scaler = StandardScaler()

In [7]:
# Feature columns: those present in both the training features and the test file.
common_cols = X_train.columns.intersection(test_data.columns)

# 'QuoteNumber' is the row identifier written to the submission file, not a
# predictor. Keep it out of the feature set so it is neither fed to the models
# nor overwritten when feature columns are imputed. errors='ignore' makes this
# a no-op if the column is not among the shared columns.
common_cols = common_cols.drop('QuoteNumber', errors='ignore')

# Scaling is intentionally not done here: it has to happen after missing values
# are imputed, which the following cell takes care of.

In [8]:
# Impute missing feature values, then standardise.
# Both transformers are fitted on the training split only and then applied
# unchanged to the validation and test data.
# The imputed values go into new arrays instead of being written back into
# X_train / X_valid / test_data, so this cell can be re-run safely and the
# original frames (including the columns of test_data used later) stay intact.
imputer = SimpleImputer(strategy='mean')  # or 'median', 'most_frequent'
X_train_imputed = imputer.fit_transform(X_train[common_cols])
X_valid_imputed = imputer.transform(X_valid[common_cols])
X_test_imputed = imputer.transform(test_data[common_cols])


# Now scale the imputed data
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_valid_scaled = scaler.transform(X_valid_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [10]:
# Oversample the scaled training split with SMOTE (fixed seed), then report
# the per-class row counts of the resampled labels.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train_scaled, y=y_train)
resampled_counts = pd.Series(y_train_resampled).value_counts()
print("Class distribution after SMOTE:\n", resampled_counts)

Class distribution after SMOTE:
 QuoteConversion_Flag
0    42190
1    42190
Name: count, dtype: int64


In [11]:
# Safety net: mean-impute any NaN that might remain in the scaled arrays.
# (No `!pip install` here: scikit-learn is already imported by the first cell,
# so installing it at this point cannot affect the running kernel.)
# A separate name is used so the imputer fitted on the raw features is not
# replaced by this one.
scaled_imputer = SimpleImputer(strategy='mean')

# Fit on the scaled training data only
scaled_imputer.fit(X_train_scaled)

# Apply the same fitted imputer to the training, validation and test arrays
X_train_scaled = scaled_imputer.transform(X_train_scaled)
X_valid_scaled = scaled_imputer.transform(X_valid_scaled)
X_test_scaled = scaled_imputer.transform(X_test_scaled)



In [None]:
# Base learners, seeded wherever the estimator accepts a random_state.
mlp = MLPClassifier(random_state=42)
svm = SVC(probability=True, random_state=42)
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()

# Fit every model on the SMOTE-resampled training data and score it on the
# untouched validation split: classification report first, then AUC-ROC when
# the model can produce class probabilities.
models = dict(MLP=mlp, SVM=svm, DecisionTree=dt, RandomForest=rf, KNN=knn)
for name in models:
    model = models[name]
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_valid_scaled)
    if hasattr(model, "predict_proba"):
        # Column 1 holds the probability of the positive class.
        y_proba = model.predict_proba(X_valid_scaled)[:, 1]
    else:
        y_proba = None
    print(f"\n{name} Classification Report:\n", classification_report(y_valid, y_pred))
    if y_proba is None:
        continue
    print(f"{name} AUC-ROC Score:", roc_auc_score(y_valid, y_proba))


MLP Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.93     10548
           1       0.71      0.67      0.69      2452

    accuracy                           0.89     13000
   macro avg       0.82      0.80      0.81     13000
weighted avg       0.88      0.89      0.88     13000

MLP AUC-ROC Score: 0.9134201662438346

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93     10548
           1       0.72      0.69      0.70      2452

    accuracy                           0.89     13000
   macro avg       0.82      0.81      0.82     13000
weighted avg       0.89      0.89      0.89     13000

SVM AUC-ROC Score: 0.9309366882444026

DecisionTree Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93     10548
           1       0.68      0.69      0.69      2452

    accura

In [None]:
import joblib

# Persist the fitted SVM, decision tree, random forest and KNN models, each to
# '<prefix>_model.pkl'. joblib.dump returns the list of files it wrote; ending
# the cell with the last of those lists displays it as the cell output.
written_files = [
    joblib.dump(estimator, f'{prefix}_model.pkl')
    for prefix, estimator in (('svm', svm), ('dt', dt), ('rf', rf), ('knn', knn))
]
written_files[-1]

['knn_model.pkl']

In [None]:
# Ensemble predictions using stacking
# The five base learners feed a random-forest meta-learner; StackingClassifier
# builds the meta-learner's training inputs from 5-fold cross-validated
# predictions of the base learners.
estimators = [
    ('MLP', mlp),
    ('SVM', svm),
    ('DecisionTree', dt),
    ('RandomForest', rf),
    ('KNN', knn)
]
# The meta-learner is seeded like every other estimator in this notebook so
# that the stacked results are reproducible from run to run.
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(random_state=42),
    cv=5,
)
# NOTE(review): the training data was oversampled with SMOTE before this
# point, so the internal CV folds contain synthetic rows; only the scores on
# the validation split below are computed on purely real data.
stacking.fit(X_train_resampled, y_train_resampled)

# Evaluate the stacked model
y_pred_stacked = stacking.predict(X_valid_scaled)
y_proba_stacked = stacking.predict_proba(X_valid_scaled)[:, 1]  # positive-class probability
print("\nStacked Model Classification Report:\n", classification_report(y_valid, y_pred_stacked))
print("Stacked Model AUC-ROC Score:", roc_auc_score(y_valid, y_proba_stacked))

In [None]:
# Tune the stacked model's meta-learner: grid over the number of trees and the
# maximum tree depth of the final estimator, scored by ROC AUC with 3-fold CV.
param_grid = dict(
    final_estimator__n_estimators=[50, 100, 200],
    final_estimator__max_depth=[5, 10, 15],
)
grid_search = GridSearchCV(
    estimator=stacking,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
)
grid_search.fit(X_train_resampled, y_train_resampled)
best_stacking_params = grid_search.best_params_
print("Best parameters for stacking model:", best_stacking_params)


In [13]:
import joblib

# Restore the SVM that an earlier cell saved to 'svm_model.pkl'.
# joblib files are pickles: only load files that this notebook produced.
with open('svm_model.pkl', 'rb') as model_file:
    svm = joblib.load(model_file)

In [17]:
# Display the identifier column of the test set (bare expression -> cell output).
test_data.loc[:, 'QuoteNumber']

Unnamed: 0,QuoteNumber
0,3.0
1,5.0
2,7.0
3,9.0
4,10.0
...,...
173831,434570.0
173832,434573.0
173833,434574.0
173834,434575.0


In [19]:
# Score the test set with the reloaded SVM. `test_predictions` was not defined
# by any cell in this notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): positive-class probabilities are assumed to be the intended
# submission values; use svm.predict(X_test_scaled) instead if hard 0/1 labels
# are required.
test_predictions = svm.predict_proba(X_test_scaled)[:, 1]

# One row per test quote: integer identifier plus the predicted value.
submission = pd.DataFrame({
    'QuoteNumber': test_data['QuoteNumber'].astype('int32'),
    'QuoteConversion_Flag': test_predictions,
})
submission.to_csv('Homesite_submission_svm.csv', index=False)
print("Submission saved!")


Submission saved!
