In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import joblib

In [2]:
# Read the credit-score CSV from the Kaggle input directory into a DataFrame.
file_path = '/kaggle/input/machinelearningdataexam/credit_scores.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check of the load.
print(df.head(n=5))

       ID Customer_ID     Month             Name   Age          SSN  \
0  0x1608   CUS_0xd40      July    Aaron Maashoh  23.0  821-00-0265   
1  0x160f  CUS_0x21b1  February  Rick Rothackerj  28.0  004-07-5839   
2  0x1612  CUS_0x21b1       May  Rick Rothackerj  28.0  004-07-5839   
3  0x1613  CUS_0x21b1      June  Rick Rothackerj  28.0  004-07-5839   
4  0x1615  CUS_0x21b1    August  Rick Rothackerj  28.0  004-07-5839   

  Occupation  Annual_Income  Monthly_Inhand_Salary  Num_Bank_Accounts  ...  \
0  Scientist       19114.12            1824.843333                3.0  ...   
1    Teacher       34847.84            3037.986667                2.0  ...   
2    Teacher       34847.84            3037.986667                2.0  ...   
3    Teacher       34847.84            3037.986667                2.0  ...   
4    Teacher       34847.84            3037.986667                2.0  ...   

   Credit_Score  Count_Auto Loan  Count_Credit-Builder Loan  \
0          Good              1.0         

In [3]:
# Drop the Name, SSN, ID and Customer_ID columns so they are not used as features.
# Reassigning instead of using inplace=True, together with errors="ignore", keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=["Name", "SSN", "ID", "Customer_ID"], errors="ignore")


In [4]:
# Use a subset of the training data for initial parameter tuning.
#
# The features/target and the train/test split are derived in this cell so that
# it runs on a fresh kernel. Previously it raised
# "NameError: name 'X' is not defined" because X and y were only created in a
# later cell.
target = 'Credit_Score'
X = df.drop(columns=[target])
y = df[target]

# Hold out the test rows first (same test_size / random_state as the final split
# used for evaluation) and subsample only from the training portion, so that no
# test-set row takes part in hyper-parameter tuning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# test_size=0.8 discards 80% of the training rows; the remaining 20% form the
# tuning sample.
X_sample, _, y_sample, _ = train_test_split(X_train, y_train, test_size=0.8, random_state=42)

In [None]:
# Separate the label column from the feature columns.
target = 'Credit_Score'
y = df[target]                  # labels: the Credit_Score column
X = df.drop(columns=[target])   # features: every remaining column

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preprocessing for numerical data: median-impute, standardise, then PCA.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# PCA cannot produce more components than it has input columns, so cap the
# requested 10 at the number of numerical features instead of failing at fit
# time when fewer than 10 numerical columns are selected.
n_pca_components = min(10, len(numerical_features))

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # fill missing values with the column median
    ('scaler', StandardScaler()),                   # standardise features before PCA
    ('pca', PCA(n_components=n_pca_components))     # reduce dimensionality
])

In [None]:
# Categorical branch: every object-dtype column is imputed with its most
# frequent value and then one-hot encoded.
categorical_features = X.select_dtypes(include=['object']).columns

categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': categories unseen during fit do not raise at transform time
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [None]:
# Route each column group through its own transformer pipeline:
# 'num' handles numerical_features, 'cat' handles categorical_features.
column_branches = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
# Full estimator: preprocessing followed by a support-vector classifier.
# The step names ('preprocessor', 'classifier') are the prefixes used for
# hyper-parameter keys such as 'classifier__C'.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC()),
    ]
)


In [None]:
# Define a smaller parameter space for RandomizedSearchCV.
#
# The space is split per kernel because SVC only uses gamma for the 'rbf'
# kernel here; crossing gamma with 'linear' would just repeat the same linear
# fit once per gamma value. RandomizedSearchCV accepts a list of dicts, each
# describing one sub-space.
param_dist = [
    {
        'classifier__kernel': ['rbf'],
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': [0.1, 0.01, 0.001],
    },
    {
        'classifier__kernel': ['linear'],
        'classifier__C': [0.1, 1, 10],
    },
]

In [None]:
# Randomized hyper-parameter search over param_dist on the tuning sample,
# scored by 3-fold cross-validated accuracy; n_jobs=-1 uses all available cores.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_sample, y_sample)


In [None]:
# Pull the winning configuration and its mean cross-validation score out of the search.
best_params, best_score = random_search.best_params_, random_search.best_score_

# Report both; the score is shown with two decimals.
print("Best parameters found: ", best_params)
print("Best cross-validation accuracy: {:.2f}".format(best_score))


In [None]:
# Train/test split for the final model: 80% train, 20% test, fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Apply the best hyper-parameters from the search to the pipeline and fit it
# on the training split (X_train / y_train). set_params returns the pipeline
# itself, so the fit can be chained.
model.set_params(**random_search.best_params_).fit(X_train, y_train)

In [None]:
# Make predictions on the test set
# `model` is the preprocessing + SVC pipeline, so X_test passes through the
# pipeline's preprocessing steps before being classified.
y_pred = model.predict(X_test)


In [None]:
# Score the held-out predictions: fraction of test rows whose predicted label
# matches the true label, reported with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test set accuracy: {:.2f}".format(accuracy))


In [None]:
# Save the model to disk
# Persists the whole pipeline object (preprocessing + classifier) to
# 'best_model.pkl', a path relative to the current working directory.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(model, 'best_model.pkl')