<a href="https://colab.research.google.com/github/lvirany/Work-Projects/blob/main/Statlog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
%%capture
!pip install cirq
!pip install ucimlrepo

In [17]:
import numpy as np
import pandas as pd
import cirq
import gc # Import the gc module

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from ucimlrepo import fetch_ucirepo
import pickle

# Fetch the dataset
statlog_german_credit_data = fetch_ucirepo(id=144)

# Load dataset features and targets
X = statlog_german_credit_data.data.features  # Features as DataFrame
y = statlog_german_credit_data.data.targets   # Targets as Series

# Inspect dataset metadata
#print(statlog_german_credit_data.metadata)
print(statlog_german_credit_data.variables)

# Preprocess the data
# Normalize the features to [0, 1]
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Encode categorical features
categorical_columns = ['Attribute1', 'Attribute3', 'Attribute4', 'Attribute6', 'Attribute7',
                       'Attribute9', 'Attribute10', 'Attribute12', 'Attribute14', 'Attribute15',
                       'Attribute17', 'Attribute19', 'Attribute20']

label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    # Apply the label encoding and update the DataFrame
    X.loc[:, col] = label_encoders[col].fit_transform(X[col])

# Clear label encoders and categorical column list
del label_encoders, categorical_columns
gc.collect()

# Normalize the features to [0, 1]
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Apply PCA, if necessary, to reduce the number of features
pca = PCA(n_components=18)  # Adjust components based on available memory
X_reduced = pca.fit_transform(X_normalized)

del X, X_normalized
import gc
gc.collect()

# Split into training and testing sets
# Update train-test split with reduced features
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Clear reduced dataset after splitting
del X_reduced
gc.collect()

# Convert the target (y) to a 1D array (addresses warnings)
y_train = y_train['class'].to_numpy()
y_test = y_test['class'].to_numpy()

# Update the number of qubits
num_qubits = X_train.shape[1]  # Updated to reduced feature count

# Define the Quantum Reservoir
def create_quantum_reservoir(num_qubits, depth):
    qubits = [cirq.GridQubit(0, i) for i in range(num_qubits)]
    circuit = cirq.Circuit()
    for _ in range(depth):
        for qubit in qubits:
            circuit.append(cirq.rx(np.random.rand() * 2 * np.pi)(qubit))
        for i in range(len(qubits) - 1):
            circuit.append(cirq.CNOT(qubits[i], qubits[i + 1]))
    return circuit, qubits

# Initialize the quantum reservoir
num_qubits = X_train.shape[1]  # Number of features as qubits
reservoir_depth = 3
quantum_circuit, quantum_qubits = create_quantum_reservoir(num_qubits, reservoir_depth)

# Encode classical data into quantum states
def encode_classical_data_optimized(data, qubits):
    circuit = cirq.Circuit()
    for i, value in enumerate(data[:len(qubits)]):  # Limit encoding to available qubits
        circuit.append(cirq.rx(value * np.pi)(qubits[i]))
    return circuit

# Extract reservoir states
def extract_reservoir_states_sparse(circuit, qubits):
    simulator = cirq.Simulator()
    result = simulator.simulate(circuit)
    state_vector = csr_matrix(result.final_state_vector)  # Store as sparse matrix
    return np.abs(state_vector.toarray()) # Take the absolute value of the state vector

# Process training and testing data through the quantum reservoir
def process_with_quantum_reservoir_in_batches(data, quantum_circuit, quantum_qubits, batch_size=100):
    reservoir_features = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        for sample in batch:
            encoded_circuit = encode_classical_data_optimized(sample.flatten(), quantum_qubits)
            full_circuit = quantum_circuit + encoded_circuit
            state_vector = extract_reservoir_states_sparse(full_circuit, quantum_qubits)
            reservoir_features.append(state_vector.reshape(-1))
    return csr_matrix(reservoir_features)

reservoir_features_train = process_with_quantum_reservoir_in_batches(X_train, quantum_circuit, quantum_qubits, batch_size=50)
del X_train  # Clear training data after processing
gc.collect()

reservoir_features_test = process_with_quantum_reservoir_in_batches(X_test, quantum_circuit, quantum_qubits, batch_size=50)
del X_test  # Clear testing data after processing
gc.collect()

# Reshape reservoir_features_train and reservoir_features_test to 2D
reservoir_features_train = reservoir_features_train.reshape(reservoir_features_train.shape[0], -1)
reservoir_features_test = reservoir_features_test.reshape(reservoir_features_test.shape[0], -1)

# Train a Logistic Regression Model

# Calculate class weights before training
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))

clf = SGDClassifier(loss="log_loss", class_weight=class_weights_dict) # Change 'loss' to 'log_loss'
for i in range(0, reservoir_features_train.shape[0], 100):  # Incremental training
    batch_features = reservoir_features_train[i:i+100]
    batch_labels = y_train[i:i+100]
    clf.partial_fit(batch_features, batch_labels, classes=np.unique(y_train))

# Evaluate the Model
y_pred_train = clf.predict(reservoir_features_train)
y_pred_test = clf.predict(reservoir_features_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Perform cross-validation
scores = cross_val_score(clf, reservoir_features_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores: {scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%")

print(f"Train Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Clear reservoir features after model evaluation
del reservoir_features_train, reservoir_features_test
gc.collect()

conf_matrix = confusion_matrix(y_test, y_pred_test)

# Example Predictions
print(f"Predicted Defaults (Test): {y_pred_test[:10]}")
print(f"Actual Defaults (Test): {y_test[:10]}") # Remove .values

           name     role         type     demographic  \
0    Attribute1  Feature  Categorical            None   
1    Attribute2  Feature      Integer            None   
2    Attribute3  Feature  Categorical            None   
3    Attribute4  Feature  Categorical            None   
4    Attribute5  Feature      Integer            None   
5    Attribute6  Feature  Categorical            None   
6    Attribute7  Feature  Categorical           Other   
7    Attribute8  Feature      Integer            None   
8    Attribute9  Feature  Categorical  Marital Status   
9   Attribute10  Feature  Categorical            None   
10  Attribute11  Feature      Integer            None   
11  Attribute12  Feature  Categorical            None   
12  Attribute13  Feature      Integer             Age   
13  Attribute14  Feature  Categorical            None   
14  Attribute15  Feature  Categorical           Other   
15  Attribute16  Feature      Integer            None   
16  Attribute17  Feature  Categ

In [18]:
#print(X_reduced.explained_variance_)
#print(X_reduced.explained_variance_ratio_)
#print(X_reduced.explained_variance_ratio_.cumsum())
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())

[0.28685946 0.20729333 0.17171381 0.15225146 0.13713935 0.13020936
 0.12331787 0.11255059 0.09129953 0.07951245 0.06870252 0.0574237
 0.05389707 0.04970232 0.03894801 0.03252589 0.03046041 0.02768998]
[0.15234935 0.11009225 0.09119618 0.08085984 0.07283389 0.06915342
 0.06549338 0.05977495 0.04848863 0.04222859 0.0364875  0.03049738
 0.02862441 0.02639661 0.02068505 0.01727431 0.01617734 0.01470598]
[0.15234935 0.2624416  0.35363778 0.43449763 0.50733152 0.57648493
 0.64197832 0.70175327 0.7502419  0.79247049 0.82895799 0.85945537
 0.88807979 0.91447639 0.93516145 0.95243575 0.9686131  0.98331908]


In [19]:
results = {'y_pred_train' : y_pred_train,
           'y_pred_test' : y_pred_test,
           'train_accuracy' : train_accuracy,
           'test_accuracy' : test_accuracy}

with open('data.pkl', 'wb') as file:
    pickle.dump(results, file)

In [20]:
recall = recall_score(y_test, y_pred_test, average="binary", pos_label=1)  # Adjust `pos_label` as per your dataset
print(f"Recall Score: {recall:.2f}")


Recall Score: 0.46


In [21]:
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[65 76]
 [13 46]]
