In [None]:
'''
Logs
26/9 12:00am - Split Animal ID instead of deleting it
25/9 11:43pm - Finished basic implementation
'''

# Preprocessing Function for dataset before applying ML

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Important note: preprocessing does not drop the Animal ID column. Drop it manually afterwards;
# it is kept so that it can be used as the key when scoring.
def preprocessing(df, name_mapping_var=None):
    """Encode the raw shelter records into a numeric feature frame.

    The caller's frame is not modified; a transformed copy is returned.
    Columns that are not listed below (for example ``Animal.ID`` and
    ``State``) pass through unchanged.

    Transformations applied, in order:
      * ``Name``: leading ``*`` characters are stripped, then every distinct
        name is label-encoded to an int. Missing names are stringified
        first, so they receive a code of their own.
      * ``Animal.Type``, ``Sex``, ``Intake.Type``, ``Outcome.Type``: mapped to
        fixed integer codes. Values absent from the mapping become NaN.
      * ``Breed`` / ``Color``: split into single-word tokens and one-hot
        encoded (one 0/1 column per token); the source column is dropped.
      * ``Age``: missing values become 0, stored as float.
      * ``Intake.Date`` / ``Outcome.Date``: parsed as ``%Y-%m-%d %H:%M:%S`` and
        expanded into Day, Month, Year, Hour and Hour.Radians columns; values
        that fail to parse contribute 0 to each part. The source columns are
        dropped.

    Parameters:
    df: DataFrame holding the raw columns named above.
    name_mapping_var: Optional dict that is updated in place with the
        ``{name: code}`` mapping used for the ``Name`` column.

    Returns:
    A new DataFrame with the encoded columns.
    """
    # Work on a copy so the column assignments below never leak into the
    # frame owned by the caller.
    df = df.copy()

    # Name: strip leading asterisks, then give each distinct name an int code
    df['Name'] = df['Name'].str.lstrip('*')
    label_encoder = LabelEncoder()
    df['Name'] = label_encoder.fit_transform(df['Name'].astype(str))

    # Hand the name -> code mapping back through the caller-supplied dict
    if name_mapping_var is not None:
        name_mapping_var.update(
            {name: code for code, name in enumerate(label_encoder.classes_)}
        )

    # Animal.Type: fixed integer code per animal type
    animal_mapping = {
        "Dog": 1, "Cat": 2, "Other": 3, "Bird": 4, "Livestock": 5,
        "House Rabbit": 6, "Rat": 7, "Ferret": 8, "Pig": 9, "Hamster": 10,
        "Guinea Pig": 11, "Gerbil": 12, "Hedgehog": 13, "Chinchilla": 14,
        "Goat": 15, "Mouse": 16, "Sugar Glider": 17, "Snake": 18,
        "Wildlife": 19, "Lizard": 20,
    }
    df['Animal.Type'] = df['Animal.Type'].map(animal_mapping)

    # Breed: drop parenthesised text, treat '/' and ',' as separators, then
    # one-hot encode the individual words. Missing breeds become an empty
    # string so they yield an all-zero row instead of raising on NaN.
    breed_text = (
        df['Breed'].fillna('').astype(str)
        .str.replace(r'\(.*?\)', '', regex=True)
        .str.replace('/', ' ', regex=False)
        .str.replace(',', ' ', regex=False)
    )
    df_breeds = _one_hot_tokens(breed_text, strip_trailing_hyphen=True)
    # NOTE(review): token columns share one namespace with the existing
    # columns and with the colour tokens below; a word that occurs in both
    # (or matches an existing column name) yields duplicate column labels.
    df = pd.concat([df, df_breeds], axis=1)
    df = df.drop(columns=['Breed'])

    # Sex: fixed integer code per category
    sex_mapping = {
        'Neutered Male': 1, 'Spayed Female': 2, 'Intact Female': 3,
        'Intact Male': 4, 'Unknown': 5, 'Female': 6, 'Male': 7,
    }
    df['Sex'] = df['Sex'].map(sex_mapping)

    # Color: '/', ',' and the standalone word "and" are separators. The word
    # boundary keeps colours that merely contain "and" (e.g. "Sandy") intact.
    # A hyphen directly in front of a word character is removed.
    color_text = (
        df['Color'].fillna('').astype(str)
        .str.replace('/', ' ', regex=False)
        .str.replace(r'\band\b', ' ', case=False, regex=True)
        .str.replace(',', ' ', regex=False)
        .str.replace(r'-\b', '', regex=True)
    )
    df_colors = _one_hot_tokens(color_text)
    df = pd.concat([df, df_colors], axis=1)
    df = df.drop(columns=['Color'])

    # Age: impute missing values with 0 and store as float
    df['Age'] = df['Age'].fillna(0).astype(float)

    # Intake.Type: fixed integer code per intake reason
    intake_type_mapping = {
        'Public Assist': 1, 'Owner Surrender': 2, 'Stray': 3, 'Euthanasia Request': 4,
        'Abandoned': 5, 'Wildlife': 6, 'Moving': 7, 'Incompatible with owner lifestyle': 8,
        'Rabies Monitoring': 9, 'Marriage/Relationship split': 10, 'Owner Deceased': 11, 'Police Assist': 12,
        'Biting': 13, 'Owner Died': 14, 'TNR - Trap/Neuter/Release': 15, 'Unable to Afford': 16,
        'Unsuitable Accommodation': 17, 'Allergies': 18, 'Transfer from Other Shelter': 19,
        'Born in Shelter': 20, 'Landlord issues': 21, 'Litter relinquishment': 22, 'Sick/Injured': 23,
        'Owner requested Euthanasia': 24, 'Abuse/ neglect': 25, 'Incompatible with other pets': 26,
        'Behavioral Issues': 27, 'DOA': 28,
    }
    df['Intake.Type'] = df['Intake.Type'].map(intake_type_mapping)

    # Outcome.Type: fixed integer code per outcome
    outcome_type_mapping = {
        'Return to Owner': 1, 'Transfer': 2, 'Adoption': 3, 'Euthanasia': 4,
        'Died': 5, 'Rto-Adopt': 6, 'Disposal': 7, 'Missing': 8,
        'Stolen': 9, 'Relocate': 10, 'Lost': 11, 'Foster': 12,
        'Reclaimed': 13, 'Escaped': 14, 'Released To Wild': 15,
    }
    df['Outcome.Type'] = df['Outcome.Type'].map(outcome_type_mapping)

    # Dates: parse once per event; unparseable values become NaT and are
    # written out as 0 in every derived column.
    date_format = '%Y-%m-%d %H:%M:%S'
    events = ('Intake', 'Outcome')
    stamps = {
        event: pd.to_datetime(df[f'{event}.Date'], format=date_format, errors='coerce')
        for event in events
    }

    # The three passes below keep the derived columns in the established
    # order: day/month/year per event, then both hours, then both angles.
    for event in events:
        parts = stamps[event].dt
        df[f'{event}.Day'] = parts.day.fillna(0).astype(int)
        df[f'{event}.Month'] = parts.month.fillna(0).astype(int)
        df[f'{event}.Year'] = parts.year.fillna(0).astype(int)

    for event in events:
        df[f'{event}.Hour'] = stamps[event].dt.hour.fillna(0).astype(int)

    # Express the hour as an angle on a 24-hour circle (0 .. 2*pi)
    for event in events:
        df[f'{event}.Hour.Radians'] = (df[f'{event}.Hour'] / 24) * 2 * np.pi

    # The raw date columns are no longer needed
    df = df.drop(columns=['Intake.Date', 'Outcome.Date'])

    return df


def _one_hot_tokens(text, strip_trailing_hyphen=False):
    """One-hot encode the whitespace-separated words of a string Series.

    Every word is capitalised (first letter upper-case, the rest lower-case)
    and, when ``strip_trailing_hyphen`` is set, stripped of trailing ``-``
    first. Returns a 0/1 DataFrame indexed like ``text`` with one column per
    distinct word; empty strings produce an all-zero row.
    """
    def normalise(cell):
        words = cell.split()
        if strip_trailing_hyphen:
            words = [word.rstrip('-') for word in words]
        return ' '.join(word.capitalize() for word in words)

    return text.apply(normalise).str.get_dummies(sep=' ')


# Load the raw export and push it straight through the preprocessing step
optipaw_data = preprocessing(pd.read_csv('optipaw_FINAL.csv'))

# # Example usage to also collect the name -> code dictionary
# optipaw_name = {}  # dictionary that receives the mapping
# optipaw_data = preprocessing(optipaw_data, optipaw_name)
# print(optipaw_name)

# Lift the pandas display limits so every row and column is printed
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# # Dtypes, shape, unique-value and missing-value checks before splitting
# print(optipaw_data.dtypes)
# print(optipaw_data.shape)
# print(optipaw_data.nunique())
# print(optipaw_data.isnull().sum())

# Austin rows form the training set; every other row goes to the test set.
# 'State' is only needed for this split, so it is dropped right away and the
# row index of each part is renumbered from 0.
is_austin = optipaw_data['State'] == 'Austin'
train_data = optipaw_data[is_austin].drop(columns=['State']).reset_index(drop=True)
test_data = optipaw_data[~is_austin].drop(columns=['State']).reset_index(drop=True)

# print(train_data.head(5))
# print(test_data.head(5))

# train_data.head(1000).to_csv('a.csv', index=False)
# test_data.head(1000).to_csv('b.csv', index=False)

# preprocessing keeps Animal.ID so it can be used for scoring: set the IDs
# aside here, then remove the column from the feature frames.
train_ids, test_ids = train_data['Animal.ID'], test_data['Animal.ID']
train_data = train_data.drop(columns='Animal.ID')
test_data = test_data.drop(columns='Animal.ID')

# print(train_data.head(5))
# print(test_data.head(5))
# print(train_ids.head(5))
# print(test_ids.head(5))


# # Additional dtypes, shape, unique-value and missing-value checks per split
# print(train_data.dtypes)
# print(train_data.shape)
# print(train_data.nunique())
# print(train_data.isnull().sum())

# print(test_data.dtypes)
# print(test_data.shape)
# print(test_data.nunique())
# print(test_data.isnull().sum())

# # Sample output for viewing
# train_data.head(1000).to_csv('train.csv', index=False)
# test_data.head(1000).to_csv('test.csv', index=False)


# Example split: target column vs. everything else, with the IDs alongside
y_train = train_data['Outcome.Type']
x_train = train_data.drop(columns='Outcome.Type')
train_id = train_ids

y_test = test_data['Outcome.Type']
x_test = test_data.drop(columns='Outcome.Type')
test_id = test_ids

# print("train")
# print(train_id.head(10))
# print(y_train.head(10))
# print("test")
# print(test_id.head(10))
# print(y_test.head(10))


# # Build the Random Forest model - to check whether preprocess works and can fit into a ML
# from sklearn.ensemble import RandomForestClassifier

# rftrain_model = RandomForestClassifier()
# rftrain_model.fit(x_train, y_train)

# rftest_model = RandomForestClassifier()
# rftest_model.fit(x_test, y_test)

'''
Final suggestion:
When training, use x_train and y_train without the Animal ID (this ensures the Animal ID is not used as a feature).
When predicting, feed in x_test to obtain y_pred (the rows keep the same order).
Lastly, combine y_pred with test_id (the "exam"); likewise combine y_test with test_id (the "mark scheme").
'''


"\nFinal suggestion:\nWhen training data, use x train and y train without the Animal ID (this ensures Animal ID won't be processed)\nWhen predicting data, feed in x test and OBTAIN y pred (it will follow the order)\nLastly, combine y pred with the test id (exam); same goes to y test and test id (mark scheme)\n"

In [None]:
'''
Logging
5/10 20:11 - Updated to deal with empty predictions (when some classes can't be predicted)
5/10 10:35 - Updated formatting to be numeric
36/9 13:02 - Finished basic implementation
'''

# Scoring function Involving Accuracy, Precision and Recall, F1, Log loss

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

def scoring_function(y_pred_df, y_test_df, all_classes=None):
    '''
    Calculates various performance metrics for the predictions.

    Expected layout (class labels are the numeric outcome codes):

        y_pred_df:
        Animal.ID   1    2   ....  15
        101010      0.1  0.2 ....  0.0

        y_test_df:
        Animal.ID   Outcome.Type
        10129109    2

    The two frames are joined on 'Animal.ID'; only IDs present in both are
    scored. The predicted class of a row is the label of its highest
    probability column. Precision, recall and F1 are weighted averages
    restricted to the classes that were actually predicted, with undefined
    values counted as 0.

    NOTE(review): the join keeps every matching pair, so an Animal.ID that
    occurs more than once on either side contributes several rows - confirm
    that IDs are unique before scoring.

    Parameters:
    y_pred_df: DataFrame with 'Animal.ID' plus one probability column per
        class. Column labels must be convertible to int.
    y_test_df: DataFrame with 'Animal.ID' and the true 'Outcome.Type'.
    all_classes: List of all possible classes used for the log loss.
        Defaults to the classes found in the true labels. Classes without a
        probability column are given probability 0.

    Returns:
    metrics: Dictionary containing accuracy, precision, recall, f1 score,
        and log loss, or None when the two frames share no Animal.ID.
    '''
    id_column = 'Animal.ID'
    target_column = 'Outcome.Type'

    # Merge the prediction and actual outcome dataframes using Animal ID
    df_combined = pd.merge(y_pred_df, y_test_df, on=id_column)

    if df_combined.empty:
        print("No data in the combined DataFrame.")
        return None

    # Pick the probability columns by label rather than by position, so the
    # result does not depend on where Animal.ID sits or on extra columns
    # carried by y_test_df.
    prob_columns = [
        column for column in y_pred_df.columns
        if column != id_column and column in df_combined.columns
    ]
    probabilities = df_combined[prob_columns]

    # Convert the predicted probabilities into predicted classes (argmax)
    y_pred_class = probabilities.idxmax(axis=1).astype(int)
    y_true = df_combined[target_column].astype(int)

    # If all_classes is None, use unique classes from y_true
    if all_classes is None:
        all_classes = np.unique(y_true)

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred_class)

    # Calculate metrics only for predicted classes
    unique_pred_classes = np.unique(y_pred_class)
    shared_options = dict(average='weighted', zero_division=0, labels=unique_pred_classes)
    precision = precision_score(y_true, y_pred_class, **shared_options)
    recall = recall_score(y_true, y_pred_class, **shared_options)
    f1 = f1_score(y_true, y_pred_class, **shared_options)

    # Build one probability column per entry of all_classes. Each class is
    # matched to its column by label (not by "label - 1" position), so gaps,
    # reordered columns and string-typed numeric labels are all handled.
    column_for_class = {int(label): label for label in prob_columns}
    complete_probs = np.zeros((len(df_combined), len(all_classes)))
    for position, class_label in enumerate(all_classes):
        source_column = column_for_class.get(int(class_label))
        if source_column is not None:
            complete_probs[:, position] = probabilities[source_column].to_numpy()

    # Calculate log loss with all classes present in the labels
    logloss = log_loss(y_true, complete_probs, labels=all_classes)

    # Print the metrics
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Log Loss: {logloss:.4f}")

    # Return the metrics as a dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'log_loss': logloss
    }



# # Testing

# # Example mappings
# outcome_type_mapping = {
#     'Return to Owner': 1, 'Transfer': 2, 'Adoption': 3, 'Euthanasia': 4,
#     'Died': 5, 'Rto-Adopt': 6, 'Disposal': 7, 'Missing': 8,
#     'Stolen': 9, 'Relocate': 10, 'Lost': 11, 'Foster': 12,
#     'Reclaimed': 13, 'Escaped': 14, 'Released To Wild': 15
# }

# # Inverse mapping for easier reference later
# inv_outcome_type_mapping = {v: k for k, v in outcome_type_mapping.items()}

# # Testing variables
# train_id = train_id.head(100000)
# x_train = x_train.head(100000)
# y_train = y_train.head(100000)

# test_id = test_id.head(10)
# x_test = x_test.head(10)
# y_test = y_test.head(10)

# # Test Scoring Function

# from sklearn.ensemble import RandomForestClassifier
# import pandas as pd

# # Initialize Random Forest model
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# # Fit the model
# rf_model.fit(x_train, y_train)

# # Get predicted probabilities
# y_pred_probs = rf_model.predict_proba(x_test)

# # Print the shape of the predicted probabilities
# print("Predicted probabilities shape:", y_pred_probs.shape)

# # Create a DataFrame initialized to zero for all possible classes (1 to 15)
# all_classes = list(range(1, 16))
# predicted_df = pd.DataFrame(0.0, index=range(len(y_pred_probs)), columns=[inv_outcome_type_mapping[i] for i in all_classes])

# # Fill in the predicted probabilities where applicable
# for idx, class_probs in enumerate(y_pred_probs):
#     for class_idx in range(len(class_probs)):
#         class_label = class_idx + 1  # Adjust for 1-based class labels
#         predicted_df.at[idx, inv_outcome_type_mapping[class_label]] = class_probs[class_idx]

# # Add Animal IDs as the first column in the predicted DataFrame
# predicted_df.insert(0, 'Animal.ID', test_id.reset_index(drop=True))

# # Combine actual outcomes with Animal IDs
# actual_df = pd.DataFrame({
#     'Animal.ID': test_id.reset_index(drop=True),
#     'Outcome.Type': y_test.reset_index(drop=True).map(inv_outcome_type_mapping)
# })

# # Display both DataFrames for comparison
# print("Predicted Probabilities with Animal IDs:")
# print(predicted_df.head())

# print("\nActual Outcomes with Animal IDs:")
# print(actual_df.head())

# # Prediction conversion from probability to predicted class labels (for debugging)
# # Calculate predicted class labels based on the highest probability
# predicted_classes = y_pred_probs.argmax(axis=1) + 1  # Get the index of the max probability, adjusting for 1-based class labels

# # Map the predicted classes to their corresponding outcome types
# predicted_labels = [inv_outcome_type_mapping[cls] for cls in predicted_classes]

# # Create a new DataFrame for predicted outcomes
# predicted_outcomes_df = pd.DataFrame({
#     'Animal.ID': test_id.reset_index(drop=True),
#     'Predicted.Outcome.Type': predicted_labels
# })

# # Display the predicted outcomes DataFrame
# print("Predicted Outcomes with Animal IDs:")
# print(predicted_outcomes_df.head())

# # Calculate and display the metrics
# metrics = scoring_function(predicted_df, actual_df)


In [None]:
'''
Logging
29/7/24 12:30 - Done basic implementation for PCA
'''

# PCA function

# Remember to strip off Animal ID before doing so

from sklearn.decomposition import PCA
import pandas as pd

def pca_combined(x_train, x_test, n_components=50):
    """
    Apply PCA on the combined x_train and x_test datasets to ensure consistent components.

    The two frames are stacked row-wise, a single PCA is fitted on the stack,
    and the transformed rows are split back in the original order.

    NOTE(review): because the PCA is fitted on train and test together, the
    test rows influence the components - confirm this is acceptable for the
    evaluation being run.

    Parameters:
    - x_train: The training dataset (must be numerical).
    - x_test: The testing dataset (must be numerical, same columns as x_train).
    - n_components: Passed straight to PCA. An int gives exactly that many
      components; other values PCA accepts (a variance fraction, None, ...)
      also work because the output columns are named from the actual number
      of components produced.

    Returns:
    - x_train_pca: PCA-transformed training data, columns 'pca_0', 'pca_1', ...
    - x_test_pca: PCA-transformed testing data with the same columns.
      Both frames carry a fresh 0..n-1 row index.
    """
    train_rows = x_train.shape[0]

    # Combine the training and test datasets
    combined_data = pd.concat([x_train, x_test], axis=0)

    # Fit PCA on the combined data
    pca = PCA(n_components=n_components, random_state=42)
    combined_pca = pca.fit_transform(combined_data)

    # Name the columns after what PCA actually returned; n_components itself
    # is not always an int, so it cannot be used as the column count.
    column_names = [f'pca_{i}' for i in range(combined_pca.shape[1])]

    # Split the transformed data back into train and test sets
    x_train_pca_df = pd.DataFrame(combined_pca[:train_rows, :], columns=column_names)
    x_test_pca_df = pd.DataFrame(combined_pca[train_rows:, :], columns=column_names)

    return x_train_pca_df, x_test_pca_df


# # Testing

# x_train_pca = x_train.head(100)
# x_test_pca = x_test.head(100)

# # Apply PCA on the combined x_train and x_test to ensure consistency
# x_train_pca, x_test_pca = pca_combined(x_train_pca, x_test_pca, n_components=50)

# # Sample Output for viewing
# x_train_pca.to_csv('x_train_pca.csv', index=False)
# x_test_pca.to_csv('x_test_pca.csv', index=False)

# Just use it as your new x train and x test for prediction