# Data cleaning


In [None]:
pip install -r requirements.txt

In [None]:
import pandas as pd
from Library.data_preprocessing_ML import clean_extract_domain, extract_domain_features, load_data, get_core_domain, preprocess_domains, compare_domains, create_abbreviation, get_domain_without_tld, check_words_in_url, check_abbreviation_in_url

# Load the query dataset together with the search-result file(s) into one DataFrame.
# How the files are combined is decided inside load_data (project helper).
dataset_query_path = 'Used_data_thesis/dataset_incl_query.csv'
search_results_paths = ['Used_data_thesis/search_results_DDG_2.csv']
merged_dataset = load_data(dataset_query_path, search_results_paths)

# Lowercase all URL columns before processing so later string comparisons are
# case-insensitive. 'URL' is the official URL; 'URL1'..'URL5' are the scraped ones.
url_columns = ['URL'] + [f'URL{i}' for i in range(1, 6)]
for col in url_columns:
    merged_dataset[col] = merged_dataset[col].str.lower()


# Initialize 'OfficialName_cleaned' as a lowercased copy of 'OfficialName' and clean it step by step
merged_dataset['OfficialName_cleaned'] = merged_dataset['OfficialName'].str.lower()
merged_dataset['OfficialName'] = merged_dataset['OfficialName'].str.lower() # Lowercase the 'OfficialName' column itself as well
merged_dataset['OfficialName_cleaned'] = merged_dataset['OfficialName_cleaned'].str.replace(r"\[.*?\]", "", regex=True) # Remove text within square brackets (including the brackets)
merged_dataset['OfficialName_cleaned'] = merged_dataset['OfficialName_cleaned'].str.replace(r"\(.*?\)", "", regex=True) # Remove text within parentheses (including the parentheses)
merged_dataset['OfficialName_cleaned'] = merged_dataset['OfficialName_cleaned'].str.replace('-', '', regex=True) # Remove hyphens from 'OfficialName_cleaned' (joins the hyphenated parts)
# Fill in an abbreviation only for rows where none is given; existing values are kept as-is.
merged_dataset['Abbreviation'] = merged_dataset.apply(
    lambda row: create_abbreviation(row['OfficialName_cleaned']) if pd.isna(row['Abbreviation']) else row['Abbreviation'],
    axis=1
    ) # Abbreviation is derived from the cleaned name via the project helper


# Define the URL columns you are working with, e.g., 'URL', 'URL1', 'URL2', etc.
url_columns = ['URL'] + [f'URL{i}' for i in range(1, 6)]
# Scraped URL columns only (the official 'URL' column is excluded)
url_columns_without_official =[f'URL{i}' for i in range(1, 6)]
# Preprocess the domains to extract the core domain
# NOTE(review): presumably this adds the '<col>_domain' columns read further below — confirm in preprocess_domains
preprocess_domains(merged_dataset, url_columns)

# Add a '<col>_clean_domain' column for every URL column (official and scraped)
for col in url_columns:
    # Extract clean domain first
    merged_dataset[f'{col}_clean_domain'] = merged_dataset[col].apply(clean_extract_domain)

# Compare the official clean domain against each scraped clean domain and store
# the result per row in 'domain_matches' (the result format is defined by compare_domains)
comparison_domain_cols = [f'{col}_clean_domain' for col in url_columns if col != 'URL']
merged_dataset['domain_matches'] = merged_dataset.apply(
    lambda row: compare_domains(row, 'URL_clean_domain', comparison_domain_cols), axis=1
)

# Calculate the length of each 'URL(i)_domain'
for col in url_columns_without_official:
    domain_col = f'{col}_domain'
    # Columns that do not exist are skipped silently, so a missing '<col>_domain'
    # column produces no '<col>_core_domain' / '<col>_domain_length' features.
    if domain_col in merged_dataset.columns:
        # Apply the extraction of domain without TLD
        merged_dataset[f'{col}_core_domain'] = merged_dataset[domain_col].apply(get_domain_without_tld)
        # Calculate the length of the domain without TLD
        merged_dataset[f'{col}_domain_length'] = merged_dataset[f'{col}_core_domain'].str.len()

merged_dataset['Abbreviation'] = merged_dataset['Abbreviation'].str.lower() # Lowercase the 'Abbreviation' column

# Binary flags per URL column: does the URL contain a word of the name / the abbreviation?
for col in url_columns:
    if f'{col}_clean_domain' in merged_dataset.columns:
        # Check if any word from 'OfficialName_cleaned' is in the URL
        merged_dataset[f'{col}_has_official_word'] = merged_dataset.apply(
            lambda row: check_words_in_url(row['OfficialName_cleaned'], row[col]), axis=1).astype(int)

        # Check if 'Abbreviation' is in the URL
        merged_dataset[f'{col}_has_abbreviation'] = merged_dataset.apply(
            lambda row: check_abbreviation_in_url(row['Abbreviation'], row[col]), axis=1).astype(int)

# Spaces are removed only now, after the word check above, which reads the name while it still contains spaces
merged_dataset['OfficialName_cleaned'] = merged_dataset['OfficialName_cleaned'].str.replace(' ', '', regex=True) # Remove spaces
# Calculate the length of 'OfficialName_cleaned'
merged_dataset['OfficialName_cleaned_length'] = merged_dataset['OfficialName_cleaned'].str.len()

# Calculate the length of 'Abbreviation'
merged_dataset['Abbreviation_length'] = merged_dataset['Abbreviation'].str.len()
print(merged_dataset.columns)

## Print statements for checking


In [None]:
# Display the first 42 rows of selected columns for a visual check (nothing is saved here)
print(merged_dataset[['OfficialName','Abbreviation','URL_domain','URL1_domain', 'URL1_clean_domain','OfficialName_cleaned_length','Abbreviation_length','URL', 'URL1','URL2','URL3','URL4','URL5', 'URL1_domain_length', 'URL1_has_official_word','URL1_has_abbreviation', 'domain_matches']].head(42))
# DataFrame.info() writes its summary itself and returns None, so this also prints a trailing "None"
print(merged_dataset.info())

## Extract tld for deep cleaning domains

In [None]:
from Library.data_preprocessing_ML import extract_tlds

# Set collecting every distinct TLD found in the scraped clean-domain columns
all_tlds = set()

# Columns to check
# NOTE: this rebinds 'url_columns' to the '<URLi>_clean_domain' column names
url_columns = [f'URL{i}_clean_domain' for i in range(1, 6)]

# Loop through each column, apply the function, and update the set of TLDs
for col in url_columns:
    # Apply the extract_tlds function to the column
    current_tlds = merged_dataset[col].apply(extract_tlds)
    # Drop None values and update all_tlds set
    # NOTE(review): only None is filtered; a float NaN result would be added to the set — confirm what extract_tlds returns for missing domains
    all_tlds.update(tld for tld in current_tlds if tld is not None)

# Print the unique TLDs found
print("Unique TLDs found in URL columns:", all_tlds)


### Count missing values and replace with NaN

In [None]:
# Number of rows with a missing cleaned official name / missing abbreviation
print(merged_dataset['OfficialName_cleaned'].isna().sum())
print(merged_dataset['Abbreviation'].isna().sum())

In [None]:
# Calculating the number of missing values in each column
missing_values_count = merged_dataset.isna().sum()

# Filtering and printing only the columns that have missing values
columns_with_na = missing_values_count[missing_values_count > 0]
print(columns_with_na)

# Calculating the percentage of missing values in each column
total_rows = len(merged_dataset)
missing_percentage = (merged_dataset.isna().sum() / total_rows) * 100

# Displaying the percentage of missing values for each column
# (all columns are printed here, including those without any missing value)
print(missing_percentage)

In [7]:
# Replace every missing value in every column with the literal string 'NaN'.
# After this, isna() no longer detects these cells, and equality comparisons
# between two filled cells evaluate to True ('NaN' == 'NaN').
# NOTE(review): numeric columns containing missing values become object dtype — confirm this is intended.
merged_dataset = merged_dataset.fillna('NaN')

In [None]:
import pandas as pd

# Define the column for the official URL clean domain
true_url_domain = 'URL_clean_domain'

# List of scraped URL clean domain columns to compare against the official URL clean domain
scraped_url_domains = [f'URL{i}_clean_domain' for i in range(1, 6)]

# Initialize a dictionary to store the match counts for each scraped URL domain
matches_count = {}
no_matches_count = 0  # Placeholder only; overwritten by the vectorised computation below

# Loop through each scraped URL domain and compare it with the true URL domain
# NOTE(review): if missing values were replaced by a string sentinel beforehand,
# two missing domains compare equal and are counted as a match — confirm.
for col in scraped_url_domains:
    # Calculate the number of matches where the scraped URL domain equals the official URL domain
    matches_count[col] = (merged_dataset[col] == merged_dataset[true_url_domain]).sum()

# Calculate the number of entries where none of the URLs match the true URL
# The official column is reshaped to (n_rows, 1) so it broadcasts against all scraped columns at once
no_matches_count = ((merged_dataset[scraped_url_domains] != merged_dataset[true_url_domain].values[:, None]).all(axis=1)).sum()

# Output the match counts
print("Match counts for each scraped URL domain:")
for url, count in matches_count.items():
    print(f"{url}: {count}")

# Output the count of no matches
print(f"Number of entries where none of the URLs match the true URL: {no_matches_count}")


## Remove TLD for better comparison between scraped and true URL

In [None]:
import re
import pandas as pd

# TLDs to strip: the set collected from the scraped clean-domain columns
tlds = all_tlds

# Build the pattern string once, outside the function, so it is not rebuilt per call.
# It is kept as a plain string (not a compiled pattern) and matches ".<tld>" only at the end.
# Leading/trailing dots are stripped from each TLD and the rest is regex-escaped.
# With an empty TLD set the pattern degenerates to r'\.()$', which only strips a bare trailing dot.
tld_pattern = r'\.(' + '|'.join([re.escape(tld.strip('.')) for tld in tlds]) + ')$'

def extract_until_tld(url):
    """Return `url` with a trailing known TLD removed.

    The suffix is matched with the module-level `tld_pattern`, which is
    anchored at the end of the string; input without such a suffix is
    returned unchanged. Missing values (as detected by `pd.isna`) yield "".
    """
    # Missing value -> empty string; otherwise drop the matching suffix, if any
    return "" if pd.isna(url) else re.sub(tld_pattern, "", url)


# Scraped clean-domain columns to strip the TLD from (rebinds 'url_columns')
url_columns = [f'URL{i}_clean_domain' for i in range(1, 6)]


# Add a '<col>_before_tld' column per scraped domain and preview the first 10 rows
for col in url_columns:
    merged_dataset[f'{col}_before_tld'] = merged_dataset[col].apply(extract_until_tld)
print(merged_dataset[['URL1_clean_domain_before_tld','URL2_clean_domain_before_tld',
                      'URL3_clean_domain_before_tld','URL4_clean_domain_before_tld',
                      'URL5_clean_domain_before_tld']].head(10))


# Feature engineering

## Calculate jaccard similarity, check subsequence, sequence match score and levenshtein distance

In [None]:
%pip install pandarallel
%pip install python-Levenshtein

import pandas as pd
from difflib import SequenceMatcher
import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def jaccard_similarity(set1, set2):
    """Calculate the Jaccard similarity score of two sets.

    The score is the size of the intersection divided by the size of the
    union.

    Args:
        set1: First set of hashable items.
        set2: Second set (any iterable accepted by ``set.union``).

    Returns:
        A float between 0.0 and 1.0. When both inputs are empty the union is
        empty as well; 0.0 is returned in that case instead of raising
        ZeroDivisionError, matching the other similarity helpers in this
        file, which also return 0.0 for empty input.
    """
    union = set1.union(set2)
    if not union:
        # Nothing to compare: avoid dividing by zero
        return 0.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

def check_subsequence(company, url):
    """Tell whether `company` is a (not necessarily contiguous) subsequence of `url`.

    Every character of `company` must occur in `url` in the same order.
    An empty `company` is trivially a subsequence of anything.
    """
    remaining = iter(url)  # shared cursor: consumed characters are never revisited
    for wanted in company:
        for candidate in remaining:
            if candidate == wanted:
                break
        else:
            # Ran out of URL characters before finding `wanted`
            return False
    return True

def sequence_match_score(a, b):
    """Return the difflib similarity ratio of `a` and `b` (1.0 means identical)."""
    matcher = SequenceMatcher(None, a, b)  # None: no junk heuristic function
    return matcher.ratio()

def levenshtein_distance_score(a, b):
    """Return the Levenshtein distance between `a` and `b`, as computed by `Levenshtein.distance`."""
    edit_distance = Levenshtein.distance(a, b)
    return edit_distance

def safe_set_conversion(text):
    """Return the set of characters in `text`; any falsy input (None, "") gives an empty set."""
    return set(text) if text else set()

def cosine_similarity_score(a, b):
    """Cosine similarity of two strings based on character 2- and 3-gram counts.

    Returns 0.0 when either string is empty/None or shorter than three
    characters, and when the vectorizer produced no features.
    """
    # Guard clauses: missing or too-short input cannot be compared meaningfully
    if not a or not b or len(a) < 3 or len(b) < 3:
        return 0.0

    # Count character bigrams and trigrams over both strings with a shared vocabulary
    ngram_counter = CountVectorizer(analyzer='char', ngram_range=(2, 3))
    vectors = ngram_counter.fit_transform([a, b])

    if vectors.shape[1] == 0:
        return 0.0  # No n-gram features were extracted

    # Pairwise similarity matrix; entry [0, 1] is a-versus-b
    return cosine_similarity(vectors)[0, 1]


def hamming_distance_score(a, b):
    """Count the positions at which `a` and `b` differ.

    Only the overlapping prefix is compared: characters of the longer string
    beyond the length of the shorter one are ignored, so a string and any of
    its prefixes have distance 0.
    """
    mismatches = 0
    for left, right in zip(a, b):  # zip stops at the shorter input
        if left != right:
            mismatches += 1
    return mismatches

def ngram_overlap_score(a, b):
    """Share of character trigrams common to `a` and `b`.

    The number of shared distinct trigrams is divided by the size of the
    smaller trigram set. Returns 0.0 if either string is shorter than three
    characters and therefore has no trigram.
    """
    n = 3  # n-gram size

    def _ngrams(text):
        # Distinct substrings of length n
        return {text[start:start + n] for start in range(len(text) - n + 1)}

    a_ngrams = _ngrams(a)
    b_ngrams = _ngrams(b)

    # Guard against a zero denominator
    if not a_ngrams or not b_ngrams:
        return 0.0

    shared = a_ngrams & b_ngrams
    return len(shared) / min(len(a_ngrams), len(b_ngrams))


# TLD-stripped scraped domains that every similarity feature is computed against
url_columns_without_tld = [f'URL{i}_clean_domain_before_tld' for i in range(1, 6)]

# One entry per feature: (column suffix, name column, metric, cast result to int).
# The order fixes the order in which the columns are appended to the DataFrame.
_similarity_specs = [
    ('official_jaccard', 'OfficialName',
     lambda name, domain: jaccard_similarity(safe_set_conversion(name), safe_set_conversion(domain)), False),
    ('abbrev_jaccard', 'Abbreviation',
     lambda name, domain: jaccard_similarity(safe_set_conversion(name), safe_set_conversion(domain)), False),
    ('official_is_subsequence', 'OfficialName', check_subsequence, True),
    ('abbrev_is_subsequence', 'Abbreviation', check_subsequence, True),
    ('official_seq_match', 'OfficialName', sequence_match_score, False),
    ('abbrev_seq_match', 'Abbreviation', sequence_match_score, False),
    ('official_levenshtein', 'OfficialName', levenshtein_distance_score, False),
    ('abbrev_levenshtein', 'Abbreviation', levenshtein_distance_score, False),
    ('official_cosine_similarity', 'OfficialName', cosine_similarity_score, False),
    ('abbrev_cosine_similarity', 'Abbreviation', cosine_similarity_score, False),
    ('hamming_distance', 'OfficialName', hamming_distance_score, False),
    ('ngram_overlap', 'OfficialName', ngram_overlap_score, False),
]

# Row-wise (sequential) application of every metric to every scraped domain column
for col in url_columns_without_tld:
    for _suffix, _name_col, _metric, _as_int in _similarity_specs:
        # Defaults bind the current spec so the lambda does not depend on later loop state
        _scores = merged_dataset.apply(
            lambda row, name_col=_name_col, metric=_metric: metric(row[name_col], row[col]), axis=1)
        # Boolean subsequence flags are stored as 0/1 integers
        merged_dataset[f'{col}_{_suffix}'] = _scores.astype(int) if _as_int else _scores

# Columns shown for a manual sanity check: identifiers, the URL1 domain and all of its similarity features
_url1_prefix = 'URL1_clean_domain_before_tld'
_url1_feature_suffixes = (
    'official_jaccard', 'abbrev_jaccard',
    'official_is_subsequence', 'abbrev_is_subsequence',
    'official_seq_match', 'abbrev_seq_match',
    'official_levenshtein', 'abbrev_levenshtein',
    'official_cosine_similarity', 'abbrev_cosine_similarity',
    'hamming_distance', 'ngram_overlap',
)
columns_to_display = ['EntityNumber', 'OfficialName', 'Abbreviation', _url1_prefix]
columns_to_display += [f'{_url1_prefix}_{suffix}' for suffix in _url1_feature_suffixes]

# Print the first 30 rows of these columns to inspect
print("Validation of DataFrame calculations:")
print(merged_dataset[columns_to_display].head(30))


# How often the abbreviation / official name is a subsequence of the first scraped domain
contains_abbrev_subseq_count = merged_dataset[f'{_url1_prefix}_abbrev_is_subsequence'].sum()
print(f"\nNumber of URLs where 'Abbreviation' is a subsequence: {contains_abbrev_subseq_count}")
contains_Official_subseq_count = merged_dataset[f'{_url1_prefix}_official_is_subsequence'].sum()
print(f"Number of URLs where 'OfficialName' is a subsequence: {contains_Official_subseq_count}")

print("\nData Types and NaN Presence Check:")
print(merged_dataset[columns_to_display].info())


# Descriptive statistics for every Jaccard and sequence-match feature column
print("\nStatistical Summary of Computational Fields:")
_summary_columns = [name for name in merged_dataset.columns if 'jaccard' in name or 'seq_match' in name]
print(merged_dataset[_summary_columns].describe())

In [None]:
# List every column currently present in the DataFrame
print(merged_dataset.columns)

In [None]:
# Number of rows in the dataset
print(len(merged_dataset))

# Data preparation, stratified splitting, scaling, training, predicting and evaluation of model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler

# Prepare the data: assemble the feature column names in a fixed order
_scraped_ids = range(1, 6)

# 1) presence flags, URL1..URL5
_flag_columns = [
    f'URL{i}_{flag}'
    for i in _scraped_ids
    for flag in ('has_official_word', 'has_abbreviation')
]

# 2) name length features
_length_columns = ['OfficialName_cleaned_length', 'Abbreviation_length']

# 3) similarity features, grouped per scraped URL
_similarity_suffixes = (
    'official_jaccard', 'abbrev_jaccard',
    'official_is_subsequence', 'abbrev_is_subsequence',
    'official_seq_match', 'abbrev_seq_match',
    'official_levenshtein', 'abbrev_levenshtein',
    'official_cosine_similarity', 'abbrev_cosine_similarity',
    'hamming_distance', 'ngram_overlap',
)
_similarity_columns = [
    f'URL{i}_clean_domain_before_tld_{suffix}'
    for i in _scraped_ids
    for suffix in _similarity_suffixes
]

X = merged_dataset[_flag_columns + _length_columns + _similarity_columns]

# Encode multilabel target: one binary indicator column per distinct label in 'domain_matches'
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(merged_dataset['domain_matches'])

# Split the data 80/20, stratified on the encoded label rows
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Scale the features: statistics are learned on the training split only and reused for the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Verify Encoding, Verify Stratified Splitting, Check Data Scaling, Evaluate Model Outputs and Debug Total Support

In [None]:
import numpy as np

# Check the shape of the encoded target variable (rows x labels)
print("Shape of encoded targets:", y_encoded.shape)

# Check unique label counts
print("Number of unique labels:", len(mlb.classes_))

# Check a few rows to ensure encoding looks correct
print("Sample encoded labels:", y_encoded[:5])

# Check distribution of labels in the full dataset and in the split datasets
# (column sums = number of rows carrying each label)
print("Label distribution in full dataset:", np.sum(y_encoded, axis=0))
print("Label distribution in training set:", np.sum(y_train, axis=0))
print("Label distribution in test set:", np.sum(y_test, axis=0))

# Check first few rows of scaled features
print("Sample scaled training features:", X_train_scaled[:5])
print("Sample scaled test features:", X_test_scaled[:5])

# Calculate total support manually: per-label count of positive rows in the test set
total_support = np.sum(y_test, axis=0)
print("Manual calculation of total support:", total_support)
print("Sum of total supports:", np.sum(total_support))

# NOTE(review): this compares total_support with the very expression it was computed from,
# so it always passes; it does not check against the classification report's support.
assert np.array_equal(total_support, np.sum(y_test, axis=0))


### Logistic Regression


In [None]:
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score
from sklearn.linear_model import LogisticRegression

# Train logistic regression model: MultiOutputClassifier fits one classifier per label column
logistic_model = MultiOutputClassifier(LogisticRegression(max_iter=3000))
logistic_model.fit(X_train_scaled, y_train)

# Convert mlb.classes_ to a list of strings (later cells reuse target_names)
target_names = [str(label) for label in mlb.classes_]

# Evaluate the models
print("Logistic Regression Model Evaluation:")
logistic_pred = logistic_model.predict(X_test_scaled)
print(classification_report(y_test, logistic_pred, target_names=target_names))

# Accuracy: for multi-label targets this is subset accuracy, i.e. a row only counts
# as correct when every label is predicted correctly
print("Accuracy - Logistic Regression:", accuracy_score(y_test, logistic_pred))


# Hamming Loss: fraction of individual label predictions that are wrong
print("Hamming Loss - Logistic Regression:", hamming_loss(y_test, logistic_pred))


# Jaccard Score
# average='samples': the score is computed per test row and then averaged over rows
print("Jaccard Score - Logistic Regression:", jaccard_score(y_test, logistic_pred, average='samples'))


##### Most important feature

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Retrieve the coefficients from each classifier in the MultiOutputClassifier
coefs = [estimator.coef_ for estimator in logistic_model.estimators_]

# Average the absolute coefficients across all classifiers (you could also choose max, min, etc.)
# coef[0] takes the first (for a binary classifier: the only) coefficient row of each estimator
average_coefs = np.mean([np.abs(coef[0]) for coef in coefs], axis=0)

# Map feature names to average absolute coefficients
feature_names = X.columns
feature_coefficients_map = dict(zip(feature_names, average_coefs))

# Sort the features based on their average absolute coefficients
sorted_features = sorted(feature_coefficients_map.items(), key=lambda x: x[1], reverse=True)

# Limit to top 10 features
top_features = sorted_features[:10]

# Print the sorted list of top 10 features and their coefficients
for feature, coefficient in top_features:
    print(f"Feature: {feature}, Coefficient: {coefficient}")

# Extract features and coefficients for plotting the top 10
features = [x[0] for x in top_features]
coefficients = [x[1] for x in top_features]

# Creating a bar plot for the top 10 features
# (the plotted values are mean absolute coefficients, so the sign of each effect is not shown)
plt.figure(figsize=(10, 6))
plt.barh(features, coefficients)
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importances')
plt.gca().invert_yaxis()  # To display the highest values at the top
plt.savefig('Most_imp_feat_LG')  # no extension given: matplotlib's default output format is used
plt.show()


### Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score

# Train decision tree model using MultiOutputClassifier (one tree per label column)
# No random_state is set, so results may differ slightly between runs.
# Relies on 'target_names' defined in the logistic regression cell.
tree_model = MultiOutputClassifier(DecisionTreeClassifier())
tree_model.fit(X_train_scaled, y_train)

print("\nDecision Tree Model Evaluation:")
tree_pred = tree_model.predict(X_test_scaled)

print(classification_report(y_test, tree_pred, target_names=target_names))

# Subset accuracy: a row counts only if all of its labels are predicted correctly
print("Accuracy - Decision Tree:", accuracy_score(y_test, tree_pred))

# Fraction of individual label predictions that are wrong
print("Hamming Loss - Decision Tree:", hamming_loss(y_test, tree_pred))

# average='samples': Jaccard score per test row, averaged over rows
print("Jaccard Score - Decision Tree:", jaccard_score(y_test, tree_pred, average='samples'))

##### Feature importance

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Retrieve the feature importances from each classifier in the MultiOutputClassifier
importances = [estimator.feature_importances_ for estimator in tree_model.estimators_]

# Average the feature importances across all classifiers (one per label)
average_importances = np.mean(importances, axis=0)

# Map feature names to average importances
feature_names = X.columns
feature_importance_map = dict(zip(feature_names, average_importances))

# Sort the features based on their average importances
sorted_features = sorted(feature_importance_map.items(), key=lambda x: x[1], reverse=True)

# Limit to top 10 features
top_features = sorted_features[:10]

# Print the sorted list of top 10 features and their importances
for feature, importance in top_features:
    print(f"Feature: {feature}, Importance: {importance}")

# Extract features and importances for plotting the top 10
# NOTE: 'importances' is rebound here from the per-estimator arrays to the top-10 values
features = [x[0] for x in top_features]
importances = [x[1] for x in top_features]

# Creating a bar plot for the top 10 features
plt.figure(figsize=(10, 6))
plt.barh(features, importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importances - Decision Tree')
plt.gca().invert_yaxis()  # To display the highest values at the top
plt.savefig('Most_imp_feat_DT')  # no extension given: matplotlib's default output format is used
plt.show()


### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score

# Convert mlb.classes_ to a list of strings
target_names = [str(label) for label in mlb.classes_]

# Train Random Forest model using MultiOutputClassifier (one forest per label column)
# No random_state is set, so results may differ between runs.
rf_model = MultiOutputClassifier(RandomForestClassifier())
rf_model.fit(X_train_scaled, y_train)

# Evaluate Random Forest model
rf_pred = rf_model.predict(X_test_scaled)
print("Random Forest Model Evaluation:")
print(classification_report(y_test, rf_pred, target_names=target_names))

# Accuracy: for multi-label targets this is subset accuracy (all labels of a row must be correct)
print("Accuracy - Random Forest:", accuracy_score(y_test, rf_pred))

# Hamming Loss: fraction of individual label predictions that are wrong
print("Hamming Loss - Random Forest:", hamming_loss(y_test, rf_pred))

# Jaccard Score
# average='samples': the score is computed per test row and then averaged over rows
print("Jaccard Score - Random Forest:", jaccard_score(y_test, rf_pred, average='samples'))

##### Feature importance

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Retrieve the feature importances from each classifier in the MultiOutputClassifier
importances = [estimator.feature_importances_ for estimator in rf_model.estimators_]

# Average the feature importances across all classifiers (one per label)
average_importances = np.mean(importances, axis=0)

# Map feature names to average importances
feature_names = X.columns
feature_importance_map = dict(zip(feature_names, average_importances))

# Sort the features based on their average importances
sorted_features = sorted(feature_importance_map.items(), key=lambda x: x[1], reverse=True)

# Limit to top 10 features
top_features = sorted_features[:10]

# Print the sorted list of top 10 features and their importances
for feature, importance in top_features:
    print(f"Feature: {feature}, Importance: {importance}")

# Extract features and importances for plotting the top 10
# NOTE: 'importances' is rebound here from the per-estimator arrays to the top-10 values
features = [x[0] for x in top_features]
importances = [x[1] for x in top_features]

# Creating a bar plot for the top 10 features
plt.figure(figsize=(10, 6))
plt.barh(features, importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importances - Random Forest')
plt.gca().invert_yaxis()  # To display the highest values at the top
plt.savefig('Most_imp_feat_RF')  # no extension given: matplotlib's default output format is used
plt.show()


### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score

# Train Gradient Boosting model using MultiOutputClassifier (one booster per label column)
# Relies on 'target_names' defined in an earlier cell.
gb_model = MultiOutputClassifier(GradientBoostingClassifier())
gb_model.fit(X_train_scaled, y_train)

# Evaluate Gradient Boosting model
gb_pred = gb_model.predict(X_test_scaled)
print("\nGradient Boosting Model Evaluation:")
print(classification_report(y_test, gb_pred, target_names=target_names))

# Accuracy: for multi-label targets this is subset accuracy (all labels of a row must be correct)
print("Accuracy - Gradient Boosting Classifier:", accuracy_score(y_test, gb_pred))

# Hamming Loss: fraction of individual label predictions that are wrong
print("Hamming Loss - Gradient Boosting Classifier:", hamming_loss(y_test, gb_pred))

# Jaccard Score
# average='samples': the score is computed per test row and then averaged over rows
print("Jaccard Score - Gradient Boosting Classifier:", jaccard_score(y_test, gb_pred, average='samples'))

#### Feature importance

In [None]:
# This cell has no imports of its own: 'np' and 'plt' must already be imported by an earlier cell.
# Retrieve the feature importances from each classifier in the MultiOutputClassifier
importances = [estimator.feature_importances_ for estimator in gb_model.estimators_]

# Average the feature importances across all classifiers (one per label)
average_importances = np.mean(importances, axis=0)

# Map feature names to average importances
feature_names = X.columns
feature_importance_map = dict(zip(feature_names, average_importances))

# Sort the features based on their average importances
sorted_features = sorted(feature_importance_map.items(), key=lambda x: x[1], reverse=True)

# Limit to top 10 features
top_features = sorted_features[:10]

# Print the sorted list of top 10 features and their importances
for feature, importance in top_features:
    print(f"Feature: {feature}, Importance: {importance}")

# Extract features and importances for plotting the top 10
# NOTE: 'importances' is rebound here from the per-estimator arrays to the top-10 values
features = [x[0] for x in top_features]
importances = [x[1] for x in top_features]

# Creating a bar plot for the top 10 features
plt.figure(figsize=(10, 6))
plt.barh(features, importances)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importances - Gradient Boosting')
plt.gca().invert_yaxis()  # To display the highest values at the top
plt.savefig('Most_imp_feat_GB')  # no extension given: matplotlib's default output format is used
plt.show()


### Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score

# Train SVM model using MultiOutputClassifier (one SVC with default settings per label column)
# Relies on 'target_names' defined in an earlier cell.
svm_model = MultiOutputClassifier(SVC())
svm_model.fit(X_train_scaled, y_train)

# Evaluate SVM model
svm_pred = svm_model.predict(X_test_scaled)
print("\nSVM Model Evaluation:")
print(classification_report(y_test, svm_pred, target_names=target_names))

# Accuracy: for multi-label targets this is subset accuracy (all labels of a row must be correct)
print("Accuracy - Support Vector Machine (SVM):", accuracy_score(y_test, svm_pred))

# Hamming Loss: fraction of individual label predictions that are wrong
print("Hamming Loss - Support Vector Machine (SVM):", hamming_loss(y_test, svm_pred))

# Jaccard Score
# average='samples': the score is computed per test row and then averaged over rows
print("Jaccard Score - Support Vector Machine (SVM):", jaccard_score(y_test, svm_pred, average='samples'))

### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score

# Train Neural Network model using MultiOutputClassifier (one MLP with default settings per label column)
# No random_state is set, so results may differ between runs.
# Relies on 'target_names' defined in an earlier cell.
nn_model = MultiOutputClassifier(MLPClassifier())
nn_model.fit(X_train_scaled, y_train)

# Evaluate Neural Network model
nn_pred = nn_model.predict(X_test_scaled)
print("\nNeural Network Model Evaluation:")
print(classification_report(y_test, nn_pred, target_names=target_names))

# Accuracy: for multi-label targets this is subset accuracy (all labels of a row must be correct)
print("Accuracy - Neural Network:", accuracy_score(y_test, nn_pred))

# Hamming Loss: fraction of individual label predictions that are wrong
print("Hamming Loss - Neural Network:", hamming_loss(y_test, nn_pred))

# Jaccard Score
# average='samples': the score is computed per test row and then averaged over rows
print("Jaccard Score - Neural Network:", jaccard_score(y_test, nn_pred, average='samples'))

#### Shap for explaining model predictions

In [None]:
%pip install shap
import shap

# Explain the neural network with the model-agnostic Kernel SHAP explainer.
# A 100-row background sample of the training data keeps the approximation tractable.
shap_background = shap.sample(X_train_scaled, 100)
explainer = shap.KernelExplainer(nn_model.predict, shap_background)

# Draw the rows to explain once and reuse them for both the SHAP values and
# the plot, so the plotted feature values always belong to the explained rows.
shap_test_sample = shap.sample(X_test_scaled, 100)
shap_values = explainer.shap_values(shap_test_sample)

# show=False keeps the figure open: with the default show=True the plot is
# displayed inside summary_plot and the savefig call below would write an
# empty canvas.
shap.summary_plot(shap_values, shap_test_sample, feature_names=X.columns, show=False)
plt.savefig('shap_summary_plot.png')
plt.show()  # still display the figure in the notebook, now that it is saved
plt.close()

### K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score

# Fit one independent KNeighborsClassifier per label column
knn_model = MultiOutputClassifier(KNeighborsClassifier())
knn_model.fit(X_train_scaled, y_train)

# Predict the held-out split and print the per-label precision/recall/F1 table
knn_pred = knn_model.predict(X_test_scaled)
print("\nK-Nearest Neighbors Model Evaluation:")
knn_report = classification_report(y_test, knn_pred, target_names=target_names)
print(knn_report)

# On multi-label targets accuracy_score is exact-match accuracy: a sample only
# counts as correct when every one of its labels is predicted correctly.
knn_accuracy = accuracy_score(y_test, knn_pred)
print("Accuracy - K-Nearest Neighbors:", knn_accuracy)

# Fraction of individual label assignments that are wrong
knn_hamming = hamming_loss(y_test, knn_pred)
print("Hamming Loss - K-Nearest Neighbors:", knn_hamming)

# average='samples': the Jaccard index is computed per sample and then averaged
knn_jaccard = jaccard_score(y_test, knn_pred, average='samples')
print("Jaccard Score - K-Nearest Neighbors:", knn_jaccard)

### Manual Visualization

In [None]:
import pandas as pd

# Put the rf_pred prediction matrix into a DataFrame that shares the test-set
# index, with one 'Label_<k>' column (1-based) per predicted label column.
label_columns = [f'Label_{position}' for position in range(1, rf_pred.shape[1] + 1)]
pred_df = pd.DataFrame(rf_pred, columns=label_columns, index=X_test.index)

# Turn one row of 0/1 predictions into a readable, comma-separated label string
def extract_predicted_labels(row):
    """Return the names of all labels flagged with 1 in ``row``.

    Position ``i`` in ``row`` is looked up in ``mlb.classes_[i]``; each class
    is converted with ``str`` so non-string label types can be joined.
    Returns 'No Label Predicted' when no position equals 1.
    """
    chosen = [str(mlb.classes_[position]) for position, flag in enumerate(row) if flag == 1]
    if not chosen:
        return 'No Label Predicted'
    return ', '.join(chosen)

# Collapse each row of 0/1 flags into a single readable 'PredictedLabel' string
pred_df['PredictedLabel'] = pred_df.apply(extract_predicted_labels, axis=1)

# Look up the EntityNumber of every predicted row in the original dataset and
# attach the readable prediction next to it (join on the shared index).
entity_numbers = merged_dataset.loc[pred_df.index, ['EntityNumber']]
readable_predictions = pred_df[['PredictedLabel']]
final_df = entity_numbers.join(readable_predictions)

# Preview the first 50 rows
print(final_df.head(50))

# Optional: persist the table
# final_df.to_csv('predictions_with_entity_number.csv')


# Use predictions to return correct URL

## Incorrectly assigned labels

In [None]:
import pandas as pd
import numpy as np

def _labels_as_text(encoded_rows):
    """Decode a binarized label matrix into one '; '-separated string per row."""
    decoded = []
    for labels in mlb.inverse_transform(encoded_rows):
        if labels:
            decoded.append('; '.join(str(label) for label in labels))
        else:
            decoded.append('No Labels Predicted')
    return decoded


# Readable label strings for the predictions and for the ground truth
predicted_labels = _labels_as_text(rf_pred)
actual_labels = _labels_as_text(y_test)

# Element-wise comparison: True wherever a single label cell was predicted wrongly
incorrect_predictions_mask = rf_pred != y_test

# Number of wrong label cells per sample, and across the whole test set
incorrect_counts = incorrect_predictions_mask.sum(axis=1)
total_incorrect_labels = incorrect_counts.sum()
print("Total incorrect label predictions:", total_incorrect_labels)

# Boolean selector for the samples that have at least one wrong label
samples_with_incorrect_predictions = incorrect_counts > 0

# Index the label strings like X_test so they can be filtered with the selector
predicted_labels_series = pd.Series(predicted_labels, index=X_test.index)
actual_labels_series = pd.Series(actual_labels, index=X_test.index)

# One row per misclassified sample: entity id, error count, predicted vs. actual
misclassified_index = X_test.index[samples_with_incorrect_predictions]
incorrect_predictions_df = pd.DataFrame({
    'EntityNumber': merged_dataset.loc[misclassified_index, 'EntityNumber'],
    'IncorrectCount': incorrect_counts[samples_with_incorrect_predictions],
    'PredictedLabels': predicted_labels_series[samples_with_incorrect_predictions],
    'ActualLabels': actual_labels_series[samples_with_incorrect_predictions]
})

# Preview the first 20 misclassified samples
print(incorrect_predictions_df.head(20))

# Optional: persist the table
# incorrect_predictions_df.to_csv('incorrect_predictions.csv')



# Hyperparameter tuning

### Logistic Regression

In [None]:
%pip install iterative-stratification

import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Logistic regression, fitted once per label column by MultiOutputClassifier
logistic_model = MultiOutputClassifier(LogisticRegression(max_iter=3000))

# Search space; the 'estimator__' prefix routes each key to the wrapped
# LogisticRegression. Only the L2 penalty with the liblinear solver is searched.
param_grid = {
    'estimator__C': [0.1, 1, 10],
    'estimator__penalty': ['l2'],
    'estimator__solver': ['liblinear']
}

# 5-fold cross-validation stratified on the multi-label target
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

grid_search = GridSearchCV(logistic_model, param_grid, scoring=f1_weighted_scorer, cv=mskf, verbose=1)

# Run the search on the standardized training features
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# One-row table: the best parameters plus the name of the tuned model
df_new_results = pd.DataFrame([grid_search.best_params_], index=[0]).assign(model='Logistic Regression')

# Append to the shared results file, or start it if it does not exist yet
if not os.path.exists('hyperparameter_tuning_results.csv'):
    combined_results = df_new_results
else:
    existing_results = pd.read_csv('hyperparameter_tuning_results.csv')
    combined_results = pd.concat([existing_results, df_new_results], ignore_index=True)

# Write the accumulated results back
combined_results.to_csv('hyperparameter_tuning_results.csv', index=False)



### Decision Tree

In [None]:
import os

import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import make_scorer, f1_score  # used below; previously only in scope if another tuning cell had run first
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier

# Search space for the decision tree
param_grid_dt = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

# 5-fold cross-validation stratified on the multi-label target
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The tree is passed to the search directly (no MultiOutputClassifier wrapper),
# which is why the grid keys carry no 'estimator__' prefix.
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Candidates are ranked by weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Run the grid search on the unscaled training features
grid_search_dt = GridSearchCV(decision_tree_model, param_grid_dt, cv=mskf, scoring=f1_weighted_scorer, verbose=1)
grid_search_dt.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters:", grid_search_dt.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search_dt.best_score_))

# One-row table: the best parameters plus the name of the tuned model
df_new_results = pd.DataFrame([grid_search_dt.best_params_], index=[0])
df_new_results['model'] = 'Decision Tree'

# Append to the shared results file, or start it if it does not exist yet
if os.path.exists('hyperparameter_tuning_results.csv'):
    existing_results = pd.read_csv('hyperparameter_tuning_results.csv')
    combined_results = pd.concat([existing_results, df_new_results], ignore_index=True)
else:
    combined_results = df_new_results

# Write the accumulated results back
combined_results.to_csv('hyperparameter_tuning_results.csv', index=False)

### Gradient Boosting Classifier

In [None]:
import os

import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, f1_score  # used below; previously only in scope if another tuning cell had run first
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

# Search space; the 'estimator__' prefix routes each key to the
# GradientBoostingClassifier wrapped by MultiOutputClassifier.
param_grid_gb = {
    'estimator__n_estimators': [100, 200],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [3, 5, 8]
}

# 5-fold cross-validation stratified on the multi-label target
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One gradient boosting model is fitted per label column
gradient_boosting_model = MultiOutputClassifier(GradientBoostingClassifier(random_state=42))

# Candidates are ranked by weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Run the grid search on the unscaled training features
grid_search_gb = GridSearchCV(gradient_boosting_model, param_grid_gb, cv=mskf, scoring=f1_weighted_scorer, verbose=1)
grid_search_gb.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters:", grid_search_gb.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search_gb.best_score_))

# One-row table: the best parameters plus the name of the tuned model
df_new_results = pd.DataFrame([grid_search_gb.best_params_], index=[0])
df_new_results['model'] = 'Gradient Boosting'

# Append to the shared results file, or start it if it does not exist yet
if os.path.exists('hyperparameter_tuning_results.csv'):
    existing_results = pd.read_csv('hyperparameter_tuning_results.csv')
    combined_results = pd.concat([existing_results, df_new_results], ignore_index=True)
else:
    combined_results = df_new_results

# Write the accumulated results back
combined_results.to_csv('hyperparameter_tuning_results.csv', index=False)


### Random Forest

In [None]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# The forest is passed to the search directly (no MultiOutputClassifier
# wrapper), which is why the grid keys carry no 'estimator__' prefix.
random_forest_model = RandomForestClassifier(random_state=42)

# Search space for the random forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validation stratified on the multi-label target
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are ranked by weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Run the grid search on the unscaled training features
grid_search_rf = GridSearchCV(random_forest_model, param_grid_rf, cv=mskf, scoring=f1_weighted_scorer, verbose=1)
grid_search_rf.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters:", grid_search_rf.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search_rf.best_score_))

# One-row table: the best parameters plus the name of the tuned model
df_new_results = pd.DataFrame([grid_search_rf.best_params_], index=[0]).assign(model='Random Forest')

# Append to the shared results file, or start it if it does not exist yet
if not os.path.exists('hyperparameter_tuning_results.csv'):
    combined_results = df_new_results
else:
    existing_results = pd.read_csv('hyperparameter_tuning_results.csv')
    combined_results = pd.concat([existing_results, df_new_results], ignore_index=True)

# Write the accumulated results back
combined_results.to_csv('hyperparameter_tuning_results.csv', index=False)


### SVM

In [25]:
import os  # used for the results-file check below; previously relied on an earlier cell's import

import pandas as pd  # used to build/append the results table; previously relied on an earlier cell's import
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# 5-fold cross-validation stratified on the multi-label target
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space. Key path: OneVsRestClassifier ('estimator') -> pipeline step
# 'svc' -> SVC parameter.
# NOTE: gamma is a kernel coefficient that the linear kernel does not use, so
# the two gamma values give equivalent candidates whenever kernel='linear'.
param_grid_svm = {
    'estimator__svc__C': [0.1, 1, 10],
    'estimator__svc__gamma': ['scale', 'auto'],
    'estimator__svc__kernel': ['rbf', 'linear']
}

# Scaling lives inside the pipeline, so the scaler is re-fitted on the
# training part of every fold; this is why the search is given the unscaled X_train.
pipeline_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(random_state=42))
])

# One scaler+SVC pipeline is fitted per label
ovr_pipeline = OneVsRestClassifier(pipeline_svm)

# Candidates are ranked by weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# Run the grid search
grid_search_svm = GridSearchCV(ovr_pipeline, param_grid_svm, cv=mskf, scoring=f1_weighted_scorer, verbose=1)
grid_search_svm.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters:", grid_search_svm.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search_svm.best_score_))

# One-row table: the best parameters plus the name of the tuned model
df_new_results = pd.DataFrame([grid_search_svm.best_params_], index=[0])
df_new_results['model'] = 'SVM'

# Append to the shared results file, or start it if it does not exist yet
if os.path.exists('hyperparameter_tuning_results.csv'):
    existing_results = pd.read_csv('hyperparameter_tuning_results.csv')
    combined_results = pd.concat([existing_results, df_new_results], ignore_index=True)
else:
    combined_results = df_new_results

# Write the accumulated results back
combined_results.to_csv('hyperparameter_tuning_results.csv', index=False)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'estimator__svc__C': 1, 'estimator__svc__gamma': 'scale', 'estimator__svc__kernel': 'rbf'}
Best cross-validation score: 0.80


### Neural Networks

In [30]:
import tensorflow as tf

def weighted_f1_score(y_true, y_pred):
    """Return the mean per-column F1 score of thresholded predictions.

    ``y_pred`` is binarized at 0.5. True positives, predicted positives and
    actual positives are summed along axis 0, giving one precision/recall/F1
    value per remaining column. Despite the function name, the per-column F1
    values are combined with a plain (unweighted) mean.

    Args:
        y_true: tensor of ground-truth labels, compared against 1.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor with the averaged F1 score.
    """
    # Small constant added to every denominator to avoid division by zero
    eps = tf.keras.backend.epsilon()

    y_pred_binary = tf.cast(tf.greater_equal(y_pred, 0.5), tf.int32)
    is_actual_positive = tf.equal(y_true, 1)
    is_predicted_positive = tf.equal(y_pred_binary, 1)

    # Column-wise counts (axis 0 is reduced)
    hits = tf.cast(tf.logical_and(is_actual_positive, is_predicted_positive), tf.float32)
    true_positives = tf.reduce_sum(hits, axis=0)
    predicted_positives = tf.reduce_sum(tf.cast(y_pred_binary, tf.float32), axis=0)
    possible_positives = tf.reduce_sum(tf.cast(y_true, tf.float32), axis=0)

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (possible_positives + eps)
    per_column_f1 = 2 * (precision * recall) / (precision + recall + eps)

    # Unweighted mean over the columns
    return tf.reduce_mean(per_column_f1)


In [None]:
%pip install keras keras-tuner
import tensorflow as tf
from tensorflow import keras
from keras_tuner import RandomSearch
import pandas as pd
import os

def build_model(hp):
    """Build and compile the multi-label network for one Keras Tuner trial.

    Architecture: input -> Dense(units, relu) -> Dense(units, relu) ->
    Dense(n_labels, sigmoid), trained with binary cross-entropy and Adam.

    Args:
        hp: Keras Tuner hyperparameter container. Two values are drawn from it:
            'units' (32..512 in steps of 32) and 'learning_rate'
            (1e-4..1e-2, log-sampled, default 1e-3).

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    # Registered once and reused: both hidden layers share the single 'units'
    # hyperparameter, so they always have the same width.
    hidden_units = hp.Int('units', min_value=32, max_value=512, step=32)
    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG', default=1e-3)

    # One sigmoid output per label column of the encoded target, instead of a
    # hard-coded 6 that breaks as soon as the label set changes.
    n_labels = y_train.shape[1]

    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=(X_train.shape[1],)),
        keras.layers.Dense(units=hidden_units, activation='relu'),
        keras.layers.Dense(units=hidden_units, activation='relu'),
        keras.layers.Dense(n_labels, activation='sigmoid')
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=[weighted_f1_score])
    return model

# Random search over build_model's search space. Trials are ranked by
# validation loss; each configuration is trained 3 times.
# NOTE(review): trial state is stored under my_dir/keras_tuning. Keras Tuner
# reloads an existing project of the same name unless overwrite=True is passed,
# so delete that directory (or pass overwrite=True) before re-running after a
# change to the data or the search space.
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=3,
    directory='my_dir',
    project_name='keras_tuning'
)

# Search on the standardized features: the network evaluated earlier in this
# notebook is trained on X_train_scaled, so tuning on the raw X_train would
# select hyperparameters for differently scaled inputs.
tuner.search(X_train_scaled, y_train, epochs=50, validation_split=0.2)

# Best hyperparameter set found by the search
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# One-row table with the winning values and the name of the tuned model
best_params = {
    'model': 'Neural Network',
    'best_units': best_hps.get('units'),
    'best_learning_rate': best_hps.get('learning_rate')
}
df_new_results = pd.DataFrame([best_params])

# Append to the shared results file, or start it if it does not exist yet
results_file = 'hyperparameter_tuning_results.csv'
if os.path.exists(results_file):
    existing_results = pd.read_csv(results_file)
    combined_results = pd.concat([existing_results, df_new_results], ignore_index=True)
else:
    combined_results = df_new_results

# Write the accumulated results back
combined_results.to_csv(results_file, index=False)

print(f"Best number of units: {best_hps.get('units')}")
print(f"Best learning rate: {best_hps.get('learning_rate')}")



### KNN

In [None]:
import os

import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import make_scorer, f1_score  # used below; previously only in scope if another tuning cell had run first
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

# Search space for KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance']
}

# 5-fold cross-validation stratified on the multi-label target
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The classifier is passed to the search directly (no MultiOutputClassifier
# wrapper), which is why the grid keys carry no 'estimator__' prefix.
knn_model = KNeighborsClassifier()

# Candidates are ranked by weighted-average F1
f1_weighted_scorer = make_scorer(f1_score, average='weighted')

# KNN is distance-based, so it is tuned on the standardized features, the same
# representation the evaluated KNN model is trained on. On the raw X_train the
# large-range columns would dominate the neighbor distances.
grid_search_knn = GridSearchCV(knn_model, param_grid_knn, cv=mskf, scoring=f1_weighted_scorer, verbose=1)
grid_search_knn.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best parameters:", grid_search_knn.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search_knn.best_score_))

# One-row table: the best parameters plus the name of the tuned model
df_new_results = pd.DataFrame([grid_search_knn.best_params_], index=[0])
df_new_results['model'] = 'KNN'

# Append to the shared results file, or start it if it does not exist yet
results_file = 'hyperparameter_tuning_results.csv'
if os.path.exists(results_file):
    existing_results = pd.read_csv(results_file)
    combined_results = pd.concat([existing_results, df_new_results], ignore_index=True)
else:
    combined_results = df_new_results

# Write the accumulated results back
combined_results.to_csv(results_file, index=False)


# Save RF with hyperparameters using pickle

In [None]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, hamming_loss, jaccard_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler  # used below; previously relied on an earlier cell's import

# --- Feature matrix --------------------------------------------------------
# The 72 feature columns follow a regular naming scheme, so they are generated
# rather than typed out. The resulting order is: the two presence flags for
# URL1..URL5, the two name-length columns, then the twelve similarity metrics
# for URL1..URL5.
url_positions = range(1, 6)

presence_features = [
    f'URL{i}_{flag}'
    for i in url_positions
    for flag in ('has_official_word', 'has_abbreviation')
]

length_features = ['OfficialName_cleaned_length', 'Abbreviation_length']

similarity_metrics = [
    'official_jaccard', 'abbrev_jaccard',
    'official_is_subsequence', 'abbrev_is_subsequence',
    'official_seq_match', 'abbrev_seq_match',
    'official_levenshtein', 'abbrev_levenshtein',
    'official_cosine_similarity', 'abbrev_cosine_similarity',
    'hamming_distance', 'ngram_overlap',
]
similarity_features = [
    f'URL{i}_clean_domain_before_tld_{metric}'
    for i in url_positions
    for metric in similarity_metrics
]

feature_columns = presence_features + length_features + similarity_features
X = merged_dataset[feature_columns]

# Encode the multi-label target as a binary indicator matrix
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(merged_dataset['domain_matches'])

# 80/20 split, stratified on the encoded target
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with fixed hyperparameters, one forest per label column
rf_classifier = RandomForestClassifier(max_depth=20, min_samples_leaf=2, n_estimators=200, random_state=42)
rf_model = MultiOutputClassifier(rf_classifier)
rf_model.fit(X_train_scaled, y_train)

# Persist the fitted model.
# NOTE(review): the model was trained on *scaled* features, but only the model
# is pickled here. Whoever loads it needs the same fitted `scaler` (and `mlb`
# to decode predictions) - confirm these are persisted or re-created elsewhere.
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)


## Gradient boost model save

In [35]:
import pickle

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler  # used below; previously relied on an earlier cell's import

# --- Feature matrix --------------------------------------------------------
# The 72 feature columns follow a regular naming scheme, so they are generated
# rather than typed out. The resulting order is: the two presence flags for
# URL1..URL5, the two name-length columns, then the twelve similarity metrics
# for URL1..URL5.
url_positions = range(1, 6)

presence_features = [
    f'URL{i}_{flag}'
    for i in url_positions
    for flag in ('has_official_word', 'has_abbreviation')
]

length_features = ['OfficialName_cleaned_length', 'Abbreviation_length']

similarity_metrics = [
    'official_jaccard', 'abbrev_jaccard',
    'official_is_subsequence', 'abbrev_is_subsequence',
    'official_seq_match', 'abbrev_seq_match',
    'official_levenshtein', 'abbrev_levenshtein',
    'official_cosine_similarity', 'abbrev_cosine_similarity',
    'hamming_distance', 'ngram_overlap',
]
similarity_features = [
    f'URL{i}_clean_domain_before_tld_{metric}'
    for i in url_positions
    for metric in similarity_metrics
]

feature_columns = presence_features + length_features + similarity_features
X = merged_dataset[feature_columns]

# Encode the multi-label target as a binary indicator matrix
mlb = MultiLabelBinarizer()
y_encoded = mlb.fit_transform(merged_dataset['domain_matches'])

# 80/20 split, stratified on the encoded target
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient boosting with fixed hyperparameters, one model per label column.
# random_state is fixed (as in the tuning run and the saved random forest) so
# that re-running this cell reproduces the same pickled model.
gb_model = MultiOutputClassifier(
    GradientBoostingClassifier(learning_rate=0.1, max_depth=5, n_estimators=100, random_state=42)
)
gb_model.fit(X_train_scaled, y_train)

# Persist the fitted model.
# NOTE(review): the model was trained on *scaled* features, but only the model
# is pickled here. Whoever loads it needs the same fitted `scaler` (and `mlb`
# to decode predictions) - confirm these are persisted or re-created elsewhere.
with open('gradient_boosting_classifier.pkl', 'wb') as file:
    pickle.dump(gb_model, file)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------