# Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
import re
import random

# # Set random seeds
# seed = 42
# torch.manual_seed(seed)
# np.random.seed(seed)
# random.seed(seed)
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(seed)

# Importing dataset and pre-processing

In [2]:
# This is the dataset obtained by removing the counties whose details are not availabel from the paper submitted to ICAPS 2025 details are provided in the paper
df= pd.read_csv("/home/kamal/Desktop/ICNS-2025/Data/df_finalDatasetCombinedFeaturesRemoving.csv")

In [3]:
for col in df.columns:
    # Remove commas from column values
    df[col] = df[col].astype(str).str.replace(',', '')
    
    # Attempt to convert to numeric, handling exceptions explicitly
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError:
        print(f"Column '{col}' contains non-numeric values that could not be converted.")

Column 'day' contains non-numeric values that could not be converted.


In [4]:
# Map 'Weekday' to 0 and 'Weekend' to 1 in the 'day' column
df['day'] = df['day'].map({'Weekday': 0, 'Weekend': 1})

In [5]:
# Working on Origin POIs
df.rename(columns={
    'Public Services and Infrastructure_Origin': 'PublicServicesPOIs_Origin',
    'Healthcare and Well-being_Origin': 'HealthcarePOIs_Origin',
    'Educational and Learning_Origin': 'EducationalPOIs_Origin',
    'Retail and Commercial_Origin': 'CommercialPOIs_Origin'
}, inplace=True)

# Summing columns and creating new ones as specified
df['RecreationalAndCulturalPOIs_Origin'] = df['Recreational and Leisure_Origin'] + df['Cultural and Historical_Origin']
df['MiscellaniousPOIs_Origin'] = df['Transport and Vehicle Services_Origin'] + df['Miscellaneous and Utility_Origin']

# Dropping the original columns that were summed if no longer needed
df.drop(['Recreational and Leisure_Origin', 'Cultural and Historical_Origin', 
         'Transport and Vehicle Services_Origin', 'Miscellaneous and Utility_Origin'], axis=1, inplace=True)

In [6]:
# Working on Destination POIs
df.rename(columns={
    'Public Services and Infrastructure_Destination': 'PublicServicesPOIs_Destination',
    'Healthcare and Well-being_Destination': 'HealthcarePOIs_Destination',
    'Educational and Learning_Destination': 'EducationalPOIs_Destination',
    'Retail and Commercial_Destination': 'CommercialPOIs_Destination'
}, inplace=True)

# Summing columns and creating new ones as specified
df['RecreationalAndCulturalPOIs_Destination'] = df['Recreational and Leisure_Destination'] + df['Cultural and Historical_Destination']
df['MiscellaniousPOIs_Destination'] = df['Transport and Vehicle Services_Destination'] + df['Miscellaneous and Utility_Destination']

# Dropping the original columns that were summed if no longer needed
df.drop(['Recreational and Leisure_Destination', 'Cultural and Historical_Destination', 
         'Transport and Vehicle Services_Destination', 'Miscellaneous and Utility_Destination'], axis=1, inplace=True)

In [7]:
df['POIs_Origin']= df['RecreationalAndCulturalPOIs_Origin'] + df['MiscellaniousPOIs_Origin'] + df['PublicServicesPOIs_Origin'] + df['HealthcarePOIs_Destination'] + df['EducationalPOIs_Origin'] + df['CommercialPOIs_Origin']

# Dropping the original columns that were summed if no longer needed
df.drop(['RecreationalAndCulturalPOIs_Origin', 'MiscellaniousPOIs_Origin', 
         'PublicServicesPOIs_Origin', 'HealthcarePOIs_Origin', 'EducationalPOIs_Origin', 'CommercialPOIs_Origin'], axis=1, inplace=True)


df['POIs_Destination']= df['RecreationalAndCulturalPOIs_Destination'] + df['MiscellaniousPOIs_Destination'] + df['PublicServicesPOIs_Destination'] + df['HealthcarePOIs_Destination'] + df['EducationalPOIs_Destination'] + df['CommercialPOIs_Destination']

# Dropping the original columns that were summed if no longer needed
df.drop(['RecreationalAndCulturalPOIs_Destination', 'MiscellaniousPOIs_Destination', 
         'PublicServicesPOIs_Destination', 'HealthcarePOIs_Destination', 'EducationalPOIs_Destination', 'CommercialPOIs_Destination'], axis=1, inplace=True)

In [8]:
df.rename(columns={
    'Residential_Origin': 'ResidentialCounts_Origin',
    'Public_Origin': 'PublicCounts_Origin'
}, inplace=True)

# Summing columns and creating new ones as specified
df['NaturalAreaCounts_Origin'] = df['Natural_Origin'] + df['Agricultural_Origin']
df['CommercialCounts_Origin'] = df['Commercial_Origin'] + df['Industrial_Origin']  # Summing Commercial and Industrial

# Dropping the original columns that were summed and the 'Military' column as specified
df.drop(['Natural_Origin', 'Agricultural_Origin', 'Commercial_Origin', 'Industrial_Origin', 'Military_Origin'], axis=1, inplace=True)

In [9]:
df.rename(columns={
    'Residential_Destination': 'ResidentialCounts_Destination',
    'Public_Destination': 'PublicCounts_Destination'
}, inplace=True)

# Summing columns and creating new ones as specified
df['NaturalAreaCounts_Destination'] = df['Natural_Destination'] + df['Agricultural_Destination']
df['CommercialCounts_Destination'] = df['Commercial_Destination'] + df['Industrial_Destination']  # Summing Commercial and Industrial

# Dropping the original columns that were summed and the 'Military' column as specified
df.drop(['Natural_Destination', 'Agricultural_Destination', 'Commercial_Destination', 'Industrial_Destination', 'Military_Destination'], axis=1, inplace=True)

In [10]:
df['PublicAreaCounts_Origin'] = df['ResidentialCounts_Origin'] + df['PublicCounts_Origin'] + df['CommercialCounts_Origin'] + df['buildings_counts_Origin']
# Dropping the original columns 
df.drop(['ResidentialCounts_Origin', 'PublicCounts_Origin', 'CommercialCounts_Origin', 'buildings_counts_Origin'], axis=1, inplace=True)

df['PublicAreaCounts_Destination'] = df['ResidentialCounts_Destination'] + df['PublicCounts_Destination'] + df['CommercialCounts_Destination'] + df['buildings_counts_Destination']
# Dropping the original columns 
df.drop(['ResidentialCounts_Destination', 'PublicCounts_Destination', 'CommercialCounts_Destination', 'buildings_counts_Destination'], axis=1, inplace=True)


In [11]:
# Summing columns for 'Major Roads' and renaming 'Other Roads_Origin' as specified
df['MajorRoads_Origin'] = df['Primary and Major Roads_Origin'] + df['Secondary and Tertiary Roads_Origin']
df.rename(columns={'Other Roads_Origin': 'OtherRoads_Origin'}, inplace=True)

# Dropping the original columns used in the summation if no longer needed
df.drop(['Primary and Major Roads_Origin', 'Secondary and Tertiary Roads_Origin'], axis=1, inplace=True)


In [12]:
# Summing columns for 'Major Roads' and renaming 'Other Roads_Origin' as specified
df['MajorRoads_Destination'] = df['Primary and Major Roads_Destination'] + df['Secondary and Tertiary Roads_Destination']
df.rename(columns={'Other Roads_Destination': 'OtherRoads_Destination'}, inplace=True)

# Dropping the original columns used in the summation if no longer needed
df.drop(['Primary and Major Roads_Destination', 'Secondary and Tertiary Roads_Destination'], axis=1, inplace=True)

In [13]:
# Dropping the original columns used in the summation if no longer needed
df.drop(['Economic Well-Being_Origin', 'Education_Origin', 'Health_Origin', 'Family & Community_Origin', 'Economic Well-Being_Destination', 'Education_Destination', 'Health_Destination', 'Family & Community_Destination'], axis=1, inplace=True)
df.rename(columns={'Overall Ranking_Origin': 'Ranking_Origin'}, inplace=True)
df.rename(columns={'Overall Ranking_Destination': 'Ranking_Destination'}, inplace=True)

In [14]:
df.rename(columns={
    'Unemployment Rate_Origin': 'UnemploymentRate_Origin',
    'Unemployment Rate_Destination': 'UnemploymentRate_Destination',
    'Employed People_Origin': 'EmployedPeople_Origin',
    'Employed People_Destination': 'EmployedPeople_Destination',
    'Sales Tax Revenue_Origin' : 'SalesTaxRevenue_Origin',
    'Sales Tax Revenue_Destination' : 'SalesTaxRevenue_Destination'
}, inplace=True)

df.columns

Index(['distance_miles', 'duration_seconds', 'pop_flows', 'day',
       'UnemploymentRate_Origin', 'EmployedPeople_Origin',
       'SalesTaxRevenue_Origin', 'Ranking_Origin', 'Population_Origin',
       'OtherRoads_Origin', 'Transport_Origin', 'UnemploymentRate_Destination',
       'EmployedPeople_Destination', 'SalesTaxRevenue_Destination',
       'Ranking_Destination', 'Population_Destination',
       'OtherRoads_Destination', 'Transport_Destination', 'POIs_Origin',
       'POIs_Destination', 'NaturalAreaCounts_Origin',
       'NaturalAreaCounts_Destination', 'PublicAreaCounts_Origin',
       'PublicAreaCounts_Destination', 'MajorRoads_Origin',
       'MajorRoads_Destination'],
      dtype='object')

In [None]:
df_Allfeatures=df.copy()
df_Allfeatures # Contain 25 features + 1

# df_Allfeatures.to_csv("/home/kamal/Desktop/ICNS-2025/Data/df_Allfeatures.csv", index=False)

Unnamed: 0,distance_miles,duration_seconds,pop_flows,day,UnemploymentRate_Origin,EmployedPeople_Origin,SalesTaxRevenue_Origin,Ranking_Origin,Population_Origin,OtherRoads_Origin,...,OtherRoads_Destination,Transport_Destination,POIs_Origin,POIs_Destination,NaturalAreaCounts_Origin,NaturalAreaCounts_Destination,PublicAreaCounts_Origin,PublicAreaCounts_Destination,MajorRoads_Origin,MajorRoads_Destination
0,178.568231,14515.8,42.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
1,178.568231,14515.8,56.0,1,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
2,178.568231,14515.8,27.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
3,178.568231,14515.8,56.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
4,178.568231,14515.8,41.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115143,106.025528,8971.4,24.0,0,2.7,127667,16592251,1,256065,36818,...,2270,3,661,79,714,91,20549,1686,2534,221
115144,106.025528,8971.4,36.0,0,2.7,127667,16592251,1,256065,36818,...,2270,3,661,79,714,91,20549,1686,2534,221
115145,106.025528,8971.4,75.0,0,2.7,127667,16592251,1,256065,36818,...,2270,3,661,79,714,91,20549,1686,2534,221
115146,106.025528,8971.4,12.0,0,2.7,127667,16592251,1,256065,36818,...,2270,3,661,79,714,91,20549,1686,2534,221


# Generating the rules from Decision Tree using the original dataset

In [16]:
df.head()

Unnamed: 0,distance_miles,duration_seconds,pop_flows,day,UnemploymentRate_Origin,EmployedPeople_Origin,SalesTaxRevenue_Origin,Ranking_Origin,Population_Origin,OtherRoads_Origin,...,OtherRoads_Destination,Transport_Destination,POIs_Origin,POIs_Destination,NaturalAreaCounts_Origin,NaturalAreaCounts_Destination,PublicAreaCounts_Origin,PublicAreaCounts_Destination,MajorRoads_Origin,MajorRoads_Destination
0,178.568231,14515.8,42.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
1,178.568231,14515.8,56.0,1,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
2,178.568231,14515.8,27.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
3,178.568231,14515.8,56.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343
4,178.568231,14515.8,41.0,0,4.2,33277,4273753,42,77615,18502,...,7244,10,607,103,350,335,23660,10948,599,343


In [17]:
# Function to calculate VIF and filter features
def calculate_vif(df, threshold=10.0):
    # Create a DataFrame to hold features and VIF values
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    
    # Iteratively remove features with VIF > threshold
    while vif_data["VIF"].max() > threshold:
        # Find the feature with the highest VIF
        highest_vif_feature = vif_data.sort_values("VIF", ascending=False).iloc[0]
        print(f"Dropping '{highest_vif_feature['Feature']}' with VIF = {highest_vif_feature['VIF']}")
        
        # Drop the feature with the highest VIF from the DataFrame
        df = df.drop(columns=[highest_vif_feature["Feature"]])
        
        # Recalculate VIF for remaining features
        vif_data = pd.DataFrame()
        vif_data["Feature"] = df.columns
        vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    
    return df, vif_data

# Example usage with your DataFrame (assuming df is the DataFrame of features)
df_TopTenFeatures, final_vif_data = calculate_vif(df, threshold=10.0)

# Display the final features and their VIF values
print("Selected features after removing high VIF columns:")
print(final_vif_data)


Dropping 'EmployedPeople_Destination' with VIF = 486.40765761878686
Dropping 'EmployedPeople_Origin' with VIF = 484.70115822967256
Dropping 'duration_seconds' with VIF = 376.5874933475497
Dropping 'Population_Origin' with VIF = 79.8312239210273
Dropping 'Population_Destination' with VIF = 73.27270190918432
Dropping 'SalesTaxRevenue_Origin' with VIF = 52.96815253989416
Dropping 'SalesTaxRevenue_Destination' with VIF = 51.4741628811059
Dropping 'OtherRoads_Destination' with VIF = 42.23629729210593
Dropping 'OtherRoads_Origin' with VIF = 40.14953761455752
Dropping 'UnemploymentRate_Origin' with VIF = 35.78916329651481
Dropping 'PublicAreaCounts_Origin' with VIF = 22.280647058934516
Dropping 'PublicAreaCounts_Destination' with VIF = 21.718548732603736
Dropping 'MajorRoads_Origin' with VIF = 13.372327444626713
Dropping 'MajorRoads_Destination' with VIF = 12.750711829576433
Dropping 'UnemploymentRate_Destination' with VIF = 10.88330813294752
Selected features after removing high VIF columns:

In [18]:
# df_TopTenFeatures.to_csv("/home/kamal/Desktop/ICNS-2025/Data/df_TopTenFeatures.csv", index=False)
df_TopTenFeatures # Ten columns + 1

Unnamed: 0,distance_miles,pop_flows,day,Ranking_Origin,Transport_Origin,Ranking_Destination,Transport_Destination,POIs_Origin,POIs_Destination,NaturalAreaCounts_Origin,NaturalAreaCounts_Destination
0,178.568231,42.0,0,42,2,78,10,607,103,350,335
1,178.568231,56.0,1,42,2,78,10,607,103,350,335
2,178.568231,27.0,0,42,2,78,10,607,103,350,335
3,178.568231,56.0,0,42,2,78,10,607,103,350,335
4,178.568231,41.0,0,42,2,78,10,607,103,350,335
...,...,...,...,...,...,...,...,...,...,...,...
115143,106.025528,24.0,0,1,2,35,3,661,79,714,91
115144,106.025528,36.0,0,1,2,35,3,661,79,714,91
115145,106.025528,75.0,0,1,2,35,3,661,79,714,91
115146,106.025528,12.0,0,1,2,35,3,661,79,714,91


In [19]:
# Create a new dataframe with only the top features and target column
X_top = df_TopTenFeatures.drop('pop_flows', axis=1)
y = df_TopTenFeatures['pop_flows']  # assuming 'pop_flows' is your target

# # Mapping 'Weekday' to 0 and 'Weekend' to 1
# X_top.loc[:, 'day'] = X_top['day'].map({'Weekday': 0, 'Weekend': 1})

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_top, y, test_size=0.3, random_state=42)

# Train the decision tree model
# Set max_depth to control the depth of the tree (e.g., max_depth=3 for simpler rules)
dt = DecisionTreeRegressor(max_depth=3, random_state=42)
dt.fit(X_train, y_train)

# Generate rules
tree_rules = export_text(dt, feature_names=X_top.columns.tolist())
# print(tree_rules)

In [20]:
n_nodes = dt.tree_.node_count
children_left = dt.tree_.children_left
children_right = dt.tree_.children_right
feature = dt.tree_.feature
threshold = dt.tree_.threshold

# Function to extract rules
def extract_rules(tree, feature_names):
    rules = []

    def recurse(node, conditions):
        if children_left[node] != children_right[node]:  # not a leaf
            # Get the feature and threshold for the current node
            name = feature_names[feature[node]]
            threshold_value = threshold[node]

            # Create conditions for left and right child nodes
            left_condition = conditions + [f"{name} <= {threshold_value}"]
            right_condition = conditions + [f"{name} > {threshold_value}"]

            # Recurse for both children
            recurse(children_left[node], left_condition)
            recurse(children_right[node], right_condition)
        else:
            # Leaf node: store the path conditions and leaf value
            leaf_value = tree.tree_.value[node]
            rules.append((conditions, leaf_value))

    recurse(0, [])
    return rules

# Extract rules from the decision tree
rules = extract_rules(dt, X_top.columns.tolist())

In [21]:
# Number of rules
num_rules = len(rules)
print(f"Total number of rules: {num_rules}")

Total number of rules: 8


In [22]:
# Sort rules by the mean of leaf values in descending order
sorted_rules = sorted(rules, key=lambda x: abs(np.mean(x[1])), reverse=True)

# Select the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules
print("Top 10 rules based on mean leaf values:")
for i, (conditions, leaf_value) in enumerate(top_10_rules, start=1):
    print(f"Rule {i}: Conditions - {conditions}, Mean Leaf Value - {np.mean(leaf_value)}")

Top 10 rules based on mean leaf values:
Rule 1: Conditions - ['distance_miles <= 46.075748443603516', 'POIs_Destination > 323.0', 'POIs_Origin > 307.0'], Mean Leaf Value - 31867.8064516129
Rule 2: Conditions - ['distance_miles <= 46.075748443603516', 'POIs_Destination > 323.0', 'POIs_Origin <= 307.0'], Mean Leaf Value - 9709.8055028463
Rule 3: Conditions - ['distance_miles <= 46.075748443603516', 'POIs_Destination <= 323.0', 'POIs_Destination > 243.0'], Mean Leaf Value - 6492.031900138696
Rule 4: Conditions - ['distance_miles > 46.075748443603516', 'distance_miles <= 58.773027420043945', 'NaturalAreaCounts_Destination > 935.5'], Mean Leaf Value - 2728.7040498442366
Rule 5: Conditions - ['distance_miles <= 46.075748443603516', 'POIs_Destination <= 323.0', 'POIs_Destination <= 243.0'], Mean Leaf Value - 1648.2219548527557
Rule 6: Conditions - ['distance_miles > 46.075748443603516', 'distance_miles > 58.773027420043945', 'POIs_Destination > 773.5'], Mean Leaf Value - 544.0879545210769
Rul

In [96]:
# Select the top 100 rules (assuming `rules` contains all extracted rules sorted by importance)
top_10_rules = sorted_rules[:]

# Function to apply a single rule to a row
def apply_rule(row, conditions):
    # Regular expression to capture feature, operator, and threshold, allowing for negative values
    condition_pattern = re.compile(r"(\S+)\s*(<=|>)\s*(-?[\d.]+)")
    
    # Evaluate each condition in the rule for the row
    for condition in conditions:
        match = condition_pattern.match(condition)
        if match:
            feature, op, threshold = match.groups()
            threshold = float(threshold)
            
            # Check the row's feature value against the condition
            if op == "<=":
                if not row[feature] <= threshold:
                    return 0  # Condition not met, rule is not satisfied
            elif op == ">":
                if not row[feature] > threshold:
                    return 0  # Condition not met, rule is not satisfied
        else:
            print(f"Warning: Condition '{condition}' could not be parsed.")
            return 0  # Return 0 if the condition format is unexpected
    return 1  # All conditions in the rule are satisfied

# Create binary features for the top 10 rules
rule_features = np.zeros((X_top.shape[0], len(top_10_rules)))


# Apply each of the top 10 rules to all rows in X_top
for i, rule in enumerate(top_10_rules):
    # Assuming `conditions` is the first element in each rule tuple
    conditions = rule[0]
    rule_features[:, i] = X_top.apply(apply_rule, axis=1, args=(conditions,))


# Convert rule features to a DataFrame
df_RulesDepth9_324 = pd.DataFrame(rule_features, columns=[f"rule_{i+1}" for i in range(len(top_10_rules))])


In [97]:
df_RulesDepth9_324.to_csv("/home/kamal/Desktop/ICNS-2025/Data/RulesFromDT/df_RulesDepth15_2628.csv", index=False)
df_RulesDepth9_324 

Unnamed: 0,rule_1,rule_2,rule_3,rule_4,rule_5,rule_6,rule_7,rule_8,rule_9,rule_10,...,rule_2619,rule_2620,rule_2621,rule_2622,rule_2623,rule_2624,rule_2625,rule_2626,rule_2627,rule_2628
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# Step 1: Preprocess the data

# Drop the target variable from the features
X = df.drop(columns=['pop_flows'])
y = df['pop_flows']

# Train-test split before scaling
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalize the data (all columns in X are numerical, so no need for 'numerical_cols')
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Normalize the target variable
y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_val = y_scaler.transform(y_val.values.reshape(-1, 1)).flatten()
y_test = y_scaler.transform(y_test.values.reshape(-1, 1)).flatten()

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)


In [55]:
# Step 4: Instantiate the model, define loss function and optimizer
input_dim = X_train.shape[1]  # Number of features
nn_model = PopulationFlowNN(input_dim)

# Use MAE loss function to minimize absolute error
criterion = nn.L1Loss()

# Use Adam optimizer with weight decay (L2 regularization)
optimizer = optim.Adam(nn_model.parameters(), lr=0.0007386446079089292, weight_decay=1e-5)

# Step 5: Train the model with early stopping
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

num_epochs = 100  # Maximum epochs for training
min_epochs = 50  # Minimum number of epochs before early stopping is considered
patience = 5  # Early stopping patience
best_val_loss = float('inf')
epochs_without_improvement = 0

for epoch in range(num_epochs):
    nn_model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = nn_model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Evaluate on validation set
    nn_model.eval()
    with torch.no_grad():
        val_outputs = nn_model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor)

    # Print progress every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {running_loss / len(train_loader):.4f}, Validation Loss: {val_loss.item():.4f}')
    
    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0  # Reset counter if validation loss improves
    else:
        epochs_without_improvement += 1

    if epoch + 1 >= min_epochs and epochs_without_improvement >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs.")
        break

# Step 6: Evaluate the model on the test set
nn_model.eval()
with torch.no_grad():
    y_pred = nn_model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f'Test Loss (MAE): {test_loss.item():.4f}')


Epoch [10/100], Training Loss: 0.0904, Validation Loss: 0.0984
Epoch [20/100], Training Loss: 0.0841, Validation Loss: 0.1001
Epoch [30/100], Training Loss: 0.0811, Validation Loss: 0.0954
Epoch [40/100], Training Loss: 0.0785, Validation Loss: 0.0910
Epoch [50/100], Training Loss: 0.0765, Validation Loss: 0.0924
Early stopping triggered after 50 epochs.
Test Loss (MAE): 0.0874


In [56]:
# Calculate predictions on test set
with torch.no_grad():
    y_pred = nn_model(X_test_tensor)

# Convert to numpy arrays for comparison
y_pred_np = y_pred.numpy()
y_test_np = y_test_tensor.numpy()

# Calculate MAE and R²
mae = mean_absolute_error(y_test_np, y_pred_np)
r2 = r2_score(y_test_np, y_pred_np)
cpc = sorensen_dice_index(y_test_np, y_pred_np)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")
print(f'CPC: {cpc:.4f}')

Mean Absolute Error (MAE): 0.0874246135354042
R-squared (R²): 0.7682051062583923
CPC: 0.7737
