# Importing and Processing Data

In [1]:
import logging
from abc import ABC, abstractmethod
import numpy as np 
import pandas as pd 
from typing import Union, Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Kinda useless ngl ._.
# logging.basicConfig(filename='preprocessing.log',
#                     filemode='a',
#                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
#                     datefmt='%H:%M:%S',
#                     level=logging.DEBUG)

logger = logging.getLogger(__name__)


# Abstract class is a strategy for handling data
class DataStrategy(ABC):
    """Interface shared by every data-handling strategy in this module.

    Concrete strategies subclass this and implement ``handle_data``; the class
    itself cannot be instantiated because that method is abstract.
    """

    @abstractmethod
    def handle_data(self, data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:
        """Process ``data`` and return the result.

        Args:
            data (pd.DataFrame): The dataframe to process.

        Returns:
            Union[pd.DataFrame, pd.Series]: Whatever the concrete strategy
                produces.
        """
    
    
class DataPreProcessStrategy(DataStrategy):
    """Strategy that cleans, scales and one-hot encodes the raw dataframe."""

    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe.

        The steps are, in order: drop the "Account length", "State" and
        "Area code" columns; cast every object column and the "Churn" column
        to ``category``; mean-fill nulls in int64/float64 columns and drop the
        rows that still contain nulls; standard-scale the int64/float64
        columns and one-hot encode the category columns.

        Args:
            data (pd.DataFrame): DataFrame that needs to be preprocessed. The
                caller's frame is not modified: ``drop`` returns a new frame
                and every later step works on that one.

        Returns:
            pd.DataFrame: The encoded data with a fresh integer index. The
                scaled numeric columns keep their names and come first,
                followed by the one-hot columns of each category column
                ("Churn" is one-hot encoded as well).

        Raises:
            Exception: Any error raised while dropping columns, converting
                dtypes or handling nulls is logged and re-raised unchanged.
        """
        logger.info("Begin to preprocessing the dataframe ...")

        # Drop useless columns
        try:
            logger.info("1. Start dropping useless columns")
            data = data.drop(columns=[
                "Account length",
                "State",
                "Area code"
            ])
            logger.info("Delete useless columns complete")
        except Exception:
            logger.exception("Encounting an exception when dropping columns")
            raise

        # Convert data type.
        try:
            logger.info("2. Converting data to it correct data types")
            # Converting object columns to category
            for column in data.select_dtypes(include='object').columns.to_list():
                data[column] = data[column].astype('category')
            # Converting target column to category
            data['Churn'] = data['Churn'].astype('category')
            logger.info("Converting data type complete")
        except Exception:
            logger.exception("Encounting an exception when convert data type")
            raise

        # Handling null value.
        try:
            if data.isnull().values.any():
                logger.info("3. Handling null values")
                for column in data.select_dtypes(include=['int64', 'float64']).columns.to_list():
                    # Assign the filled column back: an in-place fill on a
                    # column selection is not guaranteed to reach the frame.
                    data[column] = data[column].fillna(data[column].mean())
                # Whatever is still null sits in a non-numeric column.
                data = data.dropna()
            else:
                logger.info("3. The data had no missing values")
        except Exception:
            logger.exception("Encounting an exception when handling null value")
            # Re-raise like the other steps; continuing would encode
            # half-cleaned data without any visible failure.
            raise

        logger.info('4. Encoding the values')
        # Identify numerical columns
        num_col = data.select_dtypes(include=['int64', 'float64']).columns.values.tolist()

        # Identify categorical columns
        cat_col = data.select_dtypes(include='category').columns.values.tolist()

        # Encoding the data
        numeric_transformer = Pipeline(
            steps=[("Scaler", StandardScaler())]
        )

        categorical_transformer = Pipeline(
            steps=[('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, num_col),
                ('cat', categorical_transformer, cat_col)
            ],
            # Always produce a dense array so it can be wrapped in a DataFrame
            # with named columns, whatever the share of one-hot columns is.
            sparse_threshold=0,
        )

        encoded_values = preprocessor.fit_transform(data)

        # Numeric columns keep their original names; the one-hot columns take
        # the names generated by the fitted encoder.
        one_hot_encoder = preprocessor.named_transformers_['cat'].named_steps['OneHotEncoder']
        new_cat_col = one_hot_encoder.get_feature_names_out(cat_col)

        # Same order as the transformers above: numeric first, then one-hot.
        columns = list(num_col) + list(new_cat_col)

        return pd.DataFrame(encoded_values, columns=columns)
    

class DataDivideStrategy(DataStrategy):
    """Split an already encoded dataframe into training and validation parts.

    The last two columns are taken as the target and every other column as a
    feature.
    """

    def handle_data(self, data) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split ``data`` into features/target and into train/validation rows.

        Args:
            data: The encoded data; anything ``pd.DataFrame`` accepts.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
                ``(X_train, y_train, X_valid, y_valid)`` in that order. The
                targets are two-column frames, not series. The first 80% of
                the rows form the training part and the last 20% the
                validation part, because the split is not shuffled.
        """
        data = pd.DataFrame(data)
        X = data.iloc[:, :-2]  # All columns except the last two
        y = data.iloc[:, -2:]  # The last two columns

        # random_state has no effect while shuffle=False; the split is purely
        # positional.
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

        return X_train, y_train, X_valid, y_valid
    
class DataCleaning(DataStrategy):
    """Pair a dataframe with the strategy that should process it.

    The instance stores both, and ``handle_data`` forwards the stored
    dataframe to the stored strategy.
    """

    def __init__(self, data: pd.DataFrame, strategy: DataStrategy) -> None:
        """Remember the dataframe and the strategy to apply to it."""
        self.strategy = strategy
        self.df = data

    def handle_data(self):
        """Run the stored strategy on the stored dataframe and return its result."""
        processed = self.strategy.handle_data(self.df)
        return processed
    


In [2]:
import pandas as pd 
import numpy as np  
import logging 
from zenml import step 
from typing import Tuple, Annotated

def clean_data(df: pd.DataFrame) -> Tuple[
    Annotated[pd.DataFrame, 'X_train'],
    Annotated[pd.DataFrame, 'y_train'],
    Annotated[pd.DataFrame, 'X_valid'],
    Annotated[pd.DataFrame, 'y_valid']
]:
    """Preprocess ``df`` and split it into training and validation frames.

    Args:
        df (pd.DataFrame): The raw dataframe.

    Returns:
        The tuple produced by ``DataDivideStrategy``:
        ``(X_train, y_train, X_valid, y_valid)``. The annotation names follow
        that order.

    Raises:
        Exception: Any error from preprocessing or splitting is logged and
            re-raised.
    """
    try:
        # Step 1: clean and encode the raw frame.
        process_strategy = DataPreProcessStrategy()
        data_cleaning = DataCleaning(df, process_strategy)
        cleaned_data = data_cleaning.handle_data()

        # Step 2: split the encoded frame.
        divide_strategy = DataDivideStrategy()
        data_dividing = DataCleaning(cleaned_data, divide_strategy)
        return data_dividing.handle_data()
    except Exception as e:
        logging.error(f"Error cleaning data: {e}")
        raise


In [3]:
# Read and transform the data
# clean_data returns (X_train, y_train, X_valid, y_valid); the validation
# split is bound to the *_test names in this notebook.
dataframe = pd.read_csv('../data/telecom_churn.csv')
X_train, y_train, X_test, y_test = clean_data(dataframe)

[1;35mBegin to preprocessing the dataframe ...[0m
[1;35m1. Start dropping useless columns[0m


[1;35mDelete useless columns complete[0m
[1;35m2. Converting data to it correct data types[0m
[1;35mConverting data type complete[0m
[1;35m3. The data had no missing values[0m
[1;35m4. Encoding the values[0m


In [4]:
# Display the training features.
X_train

Unnamed: 0,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,International plan_No,International plan_Yes,Voice mail plan_No,Voice mail plan_Yes
0,1.234883,1.566767,0.476643,1.567036,-0.070610,-0.055940,-0.070427,0.866743,-0.465494,0.866029,-0.085008,-0.601195,-0.085690,-0.427932,1.0,0.0,0.0,1.0
1,1.307948,-0.333738,1.124503,-0.334013,-0.108080,0.144867,-0.107549,1.058571,0.147825,1.059390,1.240482,-0.601195,1.241169,-0.427932,1.0,0.0,0.0,1.0
2,-0.591760,1.168304,0.675985,1.168464,-1.573383,0.496279,-1.573900,-0.756869,0.198935,-0.755571,0.703121,0.211534,0.697156,-1.188218,1.0,0.0,1.0,0.0
3,-0.591760,2.196596,-1.466936,2.196759,-2.742865,-0.608159,-2.743268,-0.078551,-0.567714,-0.078806,-1.303026,1.024263,-1.306401,0.332354,0.0,1.0,1.0,0.0
4,-0.591760,-0.240090,0.626149,-0.240041,-1.038932,1.098699,-1.037939,-0.276311,1.067803,-0.276562,-0.049184,-0.601195,-0.045885,1.092641,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,-0.591760,0.993861,-0.221052,0.993481,0.353401,0.546480,0.354165,-0.355416,0.454484,-0.355664,-0.264128,-0.601195,-0.258182,-1.188218,1.0,0.0,1.0,0.0
2662,-0.591760,-1.472204,0.376972,-1.472482,2.124387,0.094665,2.124458,-0.784556,-1.947682,-0.786333,1.240482,0.617898,1.241169,0.332354,0.0,1.0,1.0,0.0
2663,-0.591760,-0.183166,1.124503,-0.182793,-0.352626,-0.306949,-0.353488,3.839081,-1.436583,3.836763,0.882241,1.024263,0.882917,0.332354,1.0,0.0,1.0,0.0
2664,1.307948,0.349342,0.725820,0.349717,-0.678030,-1.662395,-0.678312,-0.046909,0.914473,-0.048044,1.491250,-0.601195,1.493272,-0.427932,0.0,1.0,0.0,1.0


In [5]:
# Display the held-out features.
X_test

Unnamed: 0,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,International plan_No,International plan_Yes,Voice mail plan_No,Voice mail plan_Yes
2666,-0.591760,1.397833,1.373680,1.397454,-1.184871,-0.407352,-1.184110,2.136365,1.527792,2.136062,0.667297,-1.007560,0.670619,-0.427932,0.0,1.0,1.0,0.0
2667,1.381014,0.145520,-0.819076,0.145570,0.394816,2.353742,0.395928,-1.091084,0.505594,-1.089559,-0.085008,-0.194831,-0.085690,1.092641,1.0,0.0,0.0,1.0
2668,0.942620,0.296091,-0.470229,0.295710,0.254794,-0.808966,0.254398,-2.228206,0.556704,-2.227754,-0.586545,-0.194831,-0.589897,-1.188218,1.0,0.0,0.0,1.0
2669,-0.591760,-0.925006,-0.171217,-0.924850,-0.307267,0.044463,-0.307084,0.075702,0.301154,0.075004,-3.273349,-1.007560,-3.270152,0.332354,1.0,0.0,1.0,0.0
2670,0.285029,0.756987,0.376972,0.756930,-0.985684,0.897892,-0.984575,-0.432542,-1.027703,-0.430372,-0.443249,0.617898,-0.443942,0.332354,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,2.038605,-0.432895,-1.167924,-0.433386,0.286348,1.299506,0.286880,1.547039,-0.874374,1.547188,-0.120832,0.617898,-0.125496,0.332354,1.0,0.0,0.0,1.0
3329,-0.591760,0.942447,-2.164631,0.942714,-0.938353,-2.264816,-0.938172,-0.189297,1.170023,-0.188670,-0.228304,-0.194831,-0.231645,1.092641,1.0,0.0,1.0,0.0
3330,-0.591760,0.018820,0.426808,0.019193,1.731930,-2.114211,1.732349,-0.177431,-0.465494,-0.175486,1.383778,0.617898,1.387123,0.332354,1.0,0.0,1.0,0.0
3331,-0.591760,0.624778,0.227466,0.625153,-0.816080,-0.808966,-0.815203,-1.219628,1.885562,-1.221396,-1.876211,2.243356,-1.876950,0.332354,0.0,1.0,1.0,0.0


In [6]:
# Display the training targets.
y_train

Unnamed: 0,Churn_0.0,Churn_1.0
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
2661,1.0,0.0
2662,0.0,1.0
2663,1.0,0.0
2664,0.0,1.0


In [7]:
# Display the held-out targets.
y_test

Unnamed: 0,Churn_0.0,Churn_1.0
2666,0.0,1.0
2667,1.0,0.0
2668,1.0,0.0
2669,1.0,0.0
2670,1.0,0.0
...,...,...
3328,1.0,0.0
3329,1.0,0.0
3330,1.0,0.0
3331,1.0,0.0


In [12]:
# Relative frequency of each target row in both splits, header first and the
# frequency table on the following line.
print("Distribution of y_train:", y_train.value_counts(normalize=True), sep="\n")
print("\nDistribution of y_test:", y_test.value_counts(normalize=True), sep="\n")


Distribution of y_train:
Churn_0.0  Churn_1.0
1.0        0.0          0.859715
0.0        1.0          0.140285
Name: proportion, dtype: float64

Distribution of y_test:
Churn_0.0  Churn_1.0
1.0        0.0          0.836582
0.0        1.0          0.163418
Name: proportion, dtype: float64


In [8]:
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.neighbors import NearestNeighbors

def compute_knn_edge_index(X, k=2):
    """Build a directed k-nearest-neighbour edge list over the rows of ``X``.

    Args:
        X: Array-like of shape ``(n_samples, n_features)`` accepted by
            ``NearestNeighbors.fit``.
        k (int): Number of neighbours requested per row. The rows are queried
            against themselves, so a row that appears among its own
            neighbours is dropped as a self-loop (with the default ``k=2`` a
            row therefore usually keeps a single neighbour).

    Returns:
        torch.Tensor: ``torch.long`` tensor of shape ``[2, num_edges]``; the
            first row holds the source row positions and the second row the
            neighbour positions. The shape is ``[2, 0]`` when no edge is left.
    """
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(X)
    # Only the neighbour positions are needed, not the distances.
    indices = nbrs.kneighbors(X, return_distance=False)

    # Row i of `indices` lists the neighbours of sample i, so repeating each
    # row position once per neighbour lines sources up with the flattened
    # neighbour array (same edge order as a nested loop over rows/neighbours).
    sources = np.repeat(np.arange(indices.shape[0]), indices.shape[1])
    targets = indices.reshape(-1)
    keep = sources != targets  # Avoid self-loops

    # Stacking keeps the [2, num_edges] layout even when nothing is kept.
    edge_index = np.stack([sources[keep], targets[keep]])
    return torch.as_tensor(edge_index, dtype=torch.long).contiguous()

In [71]:
import torch
from torch_geometric.data import Data, Dataset

class CustomGraphDataset(Dataset):
    """Dataset whose item ``idx`` is built from row ``idx`` of ``X`` and ``y``
    plus ``edge_indices[idx]``.

    ``X`` and ``y`` may be pandas objects or anything that supports
    positional ``[idx]`` indexing (lists, NumPy arrays, tensors).

    NOTE(review): each item carries a single feature row as ``x`` together
    with a full edge index; confirm that the edge indices passed in only refer
    to nodes present in that item.
    """

    def __init__(self, X, y, edge_indices):
        super().__init__()
        self.X = X
        self.y = y
        self.edge_indices = edge_indices

    def len(self):
        """Return the number of samples (rows of ``X``)."""
        return len(self.X)

    @staticmethod
    def _row(container, idx):
        """Return the ``idx``-th row of ``container`` by position."""
        # On a pandas DataFrame, container[idx] is a column lookup by label,
        # so positional access has to go through .iloc.
        if hasattr(container, "iloc"):
            row = container.iloc[idx]
            return row.to_numpy() if hasattr(row, "to_numpy") else row
        return container[idx]

    @staticmethod
    def _to_tensor(value, dtype):
        """Copy ``value`` into a new tensor of ``dtype``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) warns; clone is the supported way
            # to copy an existing tensor.
            return value.detach().clone().to(dtype)
        return torch.tensor(value, dtype=dtype)

    def get(self, idx):
        """Return sample ``idx`` as a ``Data`` object with ``x``, ``edge_index`` and ``y``."""
        # Get node features for this sample
        x = self._to_tensor(self._row(self.X, idx), torch.float)

        # Get edge_index for this sample
        edge_index = self._to_tensor(self.edge_indices[idx], torch.long)

        # Get the label for this sample
        y = self._to_tensor(self._row(self.y, idx), torch.float)

        # Return as a Data object
        return Data(x=x, edge_index=edge_index, y=y)


In [41]:
# kNN edge index over the training rows, using the default k=2.
edge_indices = compute_knn_edge_index(X_train)

In [67]:
# The kNN graph depends only on X_train, so build it once and let every
# sample reference the same tensor instead of recomputing the identical graph
# len(X_train) times.
edge_indices = [compute_knn_edge_index(X_train)] * len(X_train)


(667, 18)

In [68]:
from torch_geometric.data import DataLoader

# Create a DataLoader
# The dataset pairs row i of X_train / y_train with edge_indices[i]; the
# loader then serves shuffled batches of 32 samples.
train_dataset = CustomGraphDataset(X_train, y_train, edge_indices)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)



In [72]:
# Create a Dataloader for test
# The kNN graph depends only on X_test, so it is built once and shared by
# every sample instead of being recomputed len(X_test) times.
edge_indices_test = [compute_knn_edge_index(X_test)] * len(X_test)
test_dataset = CustomGraphDataset(X_test, y_test, edge_indices_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GraphClassification(nn.Module):
    """Stack of GCN layers followed by a two-layer MLP head.

    Args:
        input_dim: Size of the input node features.
        hidden_dim: Width of every GCN layer and of the MLP hidden layer.
        output_dim: Size of the output produced for each node.
        num_gcn_layers: Number of GCN layers; at least one is always created.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_gcn_layers=2):
        super().__init__()

        # The first convolution maps input_dim -> hidden_dim, every further
        # one maps hidden_dim -> hidden_dim.
        conv_inputs = [input_dim] + [hidden_dim] * (num_gcn_layers - 1)
        self.gcn_layers = nn.ModuleList(
            [GCNConv(in_features, hidden_dim) for in_features in conv_inputs]
        )

        # Multi-Layer Perceptron head
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x, edge_index):
        """Apply each GCN layer followed by ReLU, then the MLP head."""
        hidden = x
        for conv in self.gcn_layers:
            hidden = F.relu(conv(hidden, edge_index))
        return self.mlp(hidden)


In [None]:
import torch.optim as optim
from torch_geometric.data import DataLoader
import torch_geometric

# Instantiate the GNN: 18 input features per node, 128 hidden units, 2 output
# classes and 3 GCN layers.
model = GraphClassification(input_dim=18, hidden_dim=128, output_dim=2, num_gcn_layers=3)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Receives class indices below (labels are argmax-ed first)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Create the DataLoader (assuming you have already created train_dataset)
# NOTE(review): newer PyTorch Geometric releases expose DataLoader under
# torch_geometric.loader - confirm against the installed version.
train_loader = torch_geometric.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Forward pass
        output = model(batch.x, batch.edge_index)  # Use batch.x and batch.edge_index

        # Turn the labels into class indices for CrossEntropyLoss.
        # NOTE(review): dim=1 requires batch.y to be 2-D (one row per sample) - confirm.
        y_batch_labels = torch.argmax(batch.y, dim=1)  # one class index per row of batch.y

        # Compute loss
        loss = criterion(output, y_batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Save the model after training
torch.save(model.state_dict(), 'model.pth')




In [None]:
# Shape check of the model on a small random graph.
model.eval()
# Sample node features (assuming 10 nodes and 18 features per node)
X_sample = torch.rand(10, 18)  # 10 nodes with 18 features each

# Sample edge index (assuming 20 edges, the shape should be [2, num_edges])
edge_index_sample = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                                  [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 2, 3, 0, 1, 4, 5, 8, 9, 6, 7]],
                                  dtype=torch.long)  # 20 edges
# Bare expression whose value is not used; the shape is printed further down.
edge_index_sample.shape
# Forward pass (not wrapped in torch.no_grad(), so the output still tracks gradients)
output = model(X_sample, edge_index_sample)

# Print the output
print("Edge shape:", edge_index_sample.shape)
print("Input shape:", X_sample.shape)
print("Output shape:", output.shape)  # Should be [num_nodes, output_dim] -> [10, 2]
print("Output:", output)


Edge shape: torch.Size([2, 20])
Input shape: torch.Size([10, 18])
Output shape: torch.Size([10, 2])
Output: tensor([[ 0.1542, -0.0896],
        [ 0.1540, -0.0891],
        [ 0.1545, -0.0896],
        [ 0.1527, -0.0882],
        [ 0.1794, -0.1033],
        [ 0.1794, -0.1033],
        [ 0.1653, -0.0987],
        [ 0.1642, -0.0968],
        [ 0.1661, -0.0991],
        [ 0.1644, -0.0975]], grad_fn=<AddmmBackward0>)


In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import DataLoader

# GraphClassification is the GNN model class defined in an earlier cell.
model = GraphClassification(input_dim=18, hidden_dim=128, output_dim=2, num_gcn_layers=3)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Use PyTorch Geometric's DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Forward pass. The node features are stored under the lowercase
        # attribute `x` (the dataset builds Data(x=..., ...)); `batch.X`
        # does not exist.
        output = model(batch.x, batch.edge_index)

        # If the labels are one-hot encoded, convert them to class indices.
        # NOTE(review): dim=1 requires batch.y to be 2-D (one row per sample) - confirm.
        y_batch_labels = torch.argmax(batch.y, dim=1)

        # Compute loss
        loss = criterion(output, y_batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Save the model after training
torch.save(model.state_dict(), 'model.pth')




In [32]:
import torch
import torch.nn as nn
import torch.optim as optim

# GraphClassification is the GNN model class defined in an earlier cell.
model = GraphClassification(input_dim=18, hidden_dim=128, output_dim=2, num_gcn_layers=3)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # The graph DataLoader built in this notebook yields one batch object per
    # step, not an (X, y, edge_index) tuple, so the fields are read from the
    # batch instead of being unpacked.
    for batch in train_loader:
        optimizer.zero_grad()

        # Forward pass
        output = model(batch.x, batch.edge_index)

        # Convert the labels from one-hot to class indices.
        # NOTE(review): dim=1 requires batch.y to be 2-D (one row per sample) - confirm.
        y_batch_labels = torch.argmax(batch.y, dim=1)

        # Compute loss
        loss = criterion(output, y_batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# After training, you might want to save the model
torch.save(model.state_dict(), 'model.pth')


In [3]:
import torch
import torch.nn as nn
from abc import ABC, abstractmethod

class ClassificationModel(ABC, nn.Module):
    """Abstract base for classifiers built on ``torch.nn.Module``.

    Subclasses must implement ``forward``, ``train_model``, ``evaluate_model``
    and ``predict``; ``save_model`` and ``load_model`` are inherited.
    """

    def __init__(self):
        super().__init__()

    @abstractmethod
    def forward(self, x):
        """Compute the model output for ``x``."""

    @abstractmethod
    def train_model(self, dataloader, criterion, optimizer, num_epochs):
        """Train the model on ``dataloader`` for ``num_epochs`` epochs."""

    @abstractmethod
    def evaluate_model(self, dataloader, criterion):
        """Evaluate the model on ``dataloader`` using ``criterion``."""

    @abstractmethod
    def predict(self, x):
        """Return predictions for ``x``."""

    def save_model(self, path):
        """Write this model's ``state_dict`` to ``path``."""
        weights = self.state_dict()
        torch.save(weights, path)

    def load_model(self, path):
        """Load a ``state_dict`` from ``path`` into this model."""
        # NOTE(review): torch.load unpickles the file, so only load
        # checkpoints that come from a trusted source.
        weights = torch.load(path)
        self.load_state_dict(weights)


In [4]:
class MyCustomModel(ClassificationModel):
    """Fully connected classifier: Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores for ``x``."""
        return self.fc2(self.relu(self.fc1(x)))

    def train_model(self, dataloader, criterion, optimizer, num_epochs):
        """Train for ``num_epochs`` epochs and print one line per epoch.

        Args:
            dataloader: Iterable of ``(inputs, labels)`` batches.
            criterion: Callable ``criterion(outputs, labels)`` returning the loss.
            optimizer: Optimizer over this model's parameters.
            num_epochs: Number of passes over ``dataloader``.

        The printed loss is that of the last batch of the epoch; it is
        ``nan`` when the dataloader yields no batch.
        """
        self.train()
        for epoch in range(num_epochs):
            last_loss = float("nan")
            for inputs, labels in dataloader:
                optimizer.zero_grad()
                outputs = self(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                last_loss = loss.item()
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {last_loss:.4f}')

    def evaluate_model(self, dataloader, criterion):
        """Print and return the summed loss and the accuracy over ``dataloader``.

        Args:
            dataloader: Iterable of ``(inputs, labels)`` batches.
            criterion: Callable ``criterion(outputs, labels)`` returning the loss.

        Returns:
            tuple: ``(total_loss, accuracy)`` where ``total_loss`` is the sum
                of the per-batch losses and ``accuracy`` is a percentage
                (``0.0`` when the dataloader yields no sample).
        """
        total_loss = 0.0
        total_correct = 0
        total_samples = 0
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for inputs, labels in dataloader:
                    outputs = self(inputs)
                    loss = criterion(outputs, labels)
                    total_loss += loss.item()
                    _, predicted = torch.max(outputs, 1)
                    total_samples += labels.size(0)
                    total_correct += (predicted == labels).sum().item()
        finally:
            # Leave the module in the mode the caller had it in.
            self.train(was_training)
        # Guard against an empty dataloader instead of dividing by zero.
        accuracy = 100.0 * total_correct / total_samples if total_samples else 0.0
        print(f'Loss: {total_loss:.4f}, Accuracy: {accuracy:.2f}%')
        return total_loss, accuracy

    def predict(self, x):
        """Return the index of the highest-scoring class for each row of ``x``."""
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                outputs = self(x)
                _, predicted = torch.max(outputs, 1)
        finally:
            # Leave the module in the mode the caller had it in.
            self.train(was_training)
        return predicted

# Instantiate the custom model class with 784 inputs, 128 hidden units and
# 10 output classes. This binds the notebook-level name `model`.
model = MyCustomModel(input_size=784, hidden_size=128, num_classes=10)


In [5]:
# Display the model's layer summary.
model

MyCustomModel(
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)