In [1]:
import pandas as pd  # For data handling
import numpy as np  # For numerical operations
import networkx as nx  # For graph algorithms
import ast  # To safely parse edge list strings into Python lists
import matplotlib.pyplot as plt  # For plotting
import seaborn as sns  # For advanced plots
from sklearn.model_selection import GroupKFold, GridSearchCV  # Grouped cross-validation and hyperparameter tuning
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [5]:
# read training data
df = pd.read_csv('data/train.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.csv'

## Inspect the Data

We inspect the training dataset to check its structure and understand it better. We look at the summary statistics and watch for possible errors, missing values, outliers, or duplicates. We also check the data type of each variable and make sure it is appropriate.

In [None]:
df.head()

In [None]:
# get number of rows and columns
df.shape

Target variable: root. 
It identifies the node that is the root of the tree. To make the task simpler, we turn it into a binary classification task: each row represents a single node, and the target variable is `is_root`, where 1 indicates that the node is the root and 0 indicates that it is not.

This makes the dataset heavily imbalanced: each sentence contributes exactly one root node (class 1), while all of its other nodes belong to class 0.

In [None]:
# check other variables in the data
df.columns

In [None]:
# Basic info
df.info()

In [None]:
# Check for missing values
df.isnull().sum()

In [None]:
# Check for duplicates
df.duplicated().sum()

Training data contains no missing values or duplicated rows, which is good


In [None]:
# check the data types
df.dtypes

In [None]:
# Confirm the type of object
type(df['edgelist'].iloc[0])

Edgelist is of the datatype object (string).

It needs to be converted to a python edgelist which can be used to create the networkx tree

In [None]:
df.describe()

There are sentences with as few as 3 nodes (words) and some with as many as 70 nodes. This should be taken into consideration when normalizing: because tree sizes vary so much, it is advisable to normalize per sentence.

In [None]:
# Sentence length distribution
plt.figure(figsize=(10,5))
sns.histplot(data=df, x='n', bins=30, kde=True)
plt.title('Distribution of Tree Sizes (Number of Nodes)')
plt.xlabel('Number of nodes')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Language distribution (check number of languages and number of sentences per language)
lang = df['language'].nunique()
print(f'There are {lang} languages. Each language has the following number of sentences:')
df['language'].value_counts()

In [None]:
# Root node analysis
print(df['root'].describe())
plt.figure(figsize=(10,5))
sns.histplot(data=df, x='root', bins=30)
plt.title('Distribution of Root Nodes')
plt.show()

In [None]:
def visualize_tree(edges, root, title=""):
    """
    Draw one sentence tree and highlight its root node.

    Parameters
    ----------
    edges : iterable of (u, v) pairs
        Edge list of the tree (already parsed, not the raw string).
    root : hashable
        Node id of the root. It is shown in the title and drawn in a
        different colour so it can be spotted in the figure.
    title : str, optional
        Extra text appended to the figure title.
    """
    # NOTE(review): the graph is drawn as directed, so each arrow simply follows
    # the order of the pair in `edges`; preprocess() treats the same edges as
    # undirected (nx.from_edgelist) - confirm the arrows carry real meaning.
    G = nx.DiGraph()
    G.add_edges_from(edges)

    # Prefer a planar layout; the spring-layout fallback is seeded so that
    # re-running the cell reproduces the same picture.
    pos = nx.planar_layout(G) if nx.is_planar(G) else nx.spring_layout(G, seed=42)

    # Colour the root differently from every other node.
    node_colors = ['orange' if node == root else 'lightblue' for node in G.nodes()]

    plt.figure(figsize=(8, 6))
    nx.draw(G, pos, with_labels=True, arrows=True, node_size=800, node_color=node_colors)
    plt.title(f"Tree Visualization (Root: {root}) {title}")
    plt.show()

# Visualize first few trees
for i, row in df.head(3).iterrows():
    visualize_tree(ast.literal_eval(row['edgelist']), row['root'], f"Language: {row['language']}, Sentence: {row['sentence']}")

## Data Preprocessing

In [None]:
import ast
import pandas as pd
import networkx as nx
import numpy as np
from scipy import stats
from collections import Counter

# Names of the per-node features, in the exact order _node_features() emits them.
_NODE_FEATURE_NAMES = (
    'degree', 'harmonic', 'betweenness', 'pagerank', 'closeness',
    'katz', 'load', 'avg_neighbor_degree',
    'avg_distance', 'sum_distance',
    'distance_from_periphery',
    'subtree_size',
    'neighbor_degrees', 'neighbor_centralities', 'second_order_neighbors',
    'relative_degree', 'relative_centrality',
    'depth_from_proxy', 'is_articulation', 'bridge_count',
    'local_degree_variance',
)


def _node_features(edgelist):
    """
    Compute the structural features of every node of one tree.

    Parameters
    ----------
    edgelist : list of (u, v) pairs
        Parsed edge list; the graph is built as an undirected networkx Graph.

    Returns
    -------
    dict
        Maps each node to a tuple of values ordered like _NODE_FEATURE_NAMES.
    """
    T = nx.from_edgelist(edgelist)
    nodes = list(T.nodes())
    degree = dict(T.degree())
    neighbors = {node: list(T.neighbors(node)) for node in nodes}

    # Standard centrality measures.
    dc = nx.degree_centrality(T)
    cc = nx.harmonic_centrality(T)
    bc = nx.betweenness_centrality(T)
    pc = nx.pagerank(T)
    clc = nx.closeness_centrality(T)
    kz = nx.katz_centrality_numpy(T, alpha=0.005, beta=1.0)
    lc = nx.load_centrality(T)
    andc = nx.average_neighbor_degree(T)

    # All pairwise hop distances, computed once and reused below.
    shortest_paths = dict(nx.all_pairs_shortest_path_length(T))
    periphery_nodes = nx.periphery(T)

    # Normalisers for the "relative" features.
    max_degree = max(degree.values())
    max_dc = max(dc.values())

    # Depth measured from the highest-degree-centrality node, used as a stand-in root.
    proxy_root = max(dc, key=dc.get)
    depths_from_proxy = nx.single_source_shortest_path_length(T, proxy_root)

    articulation_points = set(nx.articulation_points(T))

    # Count bridges per endpoint in a single pass over the bridges.
    bridge_count = dict.fromkeys(nodes, 0)
    for u, v in nx.bridges(T):
        bridge_count[u] += 1
        bridge_count[v] += 1

    features = {}
    for node in nodes:
        # Distances include the node itself (distance 0), as before.
        distances = list(shortest_paths[node].values())

        # Hop distance to the closest periphery node, read from the cached table.
        distance_from_periphery = min(shortest_paths[node][p] for p in periphery_nodes)

        # Size of the largest connected component left after deleting the node.
        remaining = T.copy()
        remaining.remove_node(node)
        subtree_size = max((len(comp) for comp in nx.connected_components(remaining)), default=0)

        nbrs = neighbors[node]
        nbr_degrees = [degree[n] for n in nbrs]
        if nbrs:
            mean_nbr_degree = np.mean(nbr_degrees)
            mean_nbr_centrality = np.mean([dc[n] for n in nbrs])
            # Neighbours of neighbours, excluding the node itself.
            second_order = set()
            for nbr in nbrs:
                second_order.update(neighbors[nbr])
            second_order.discard(node)
            n_second_order = len(second_order)
        else:
            mean_nbr_degree = 0
            mean_nbr_centrality = 0
            n_second_order = 0

        # Variance of neighbour degrees is only defined for 2+ neighbours.
        degree_variance = np.var(nbr_degrees) if len(nbrs) > 1 else 0

        # Order must match _NODE_FEATURE_NAMES.
        features[node] = (
            dc[node], cc[node], bc[node], pc[node], clc[node],
            kz[node], lc[node], andc[node],
            np.mean(distances),
            sum(distances),
            distance_from_periphery,
            subtree_size,
            mean_nbr_degree,
            mean_nbr_centrality,
            n_second_order,
            degree[node] / max_degree,
            dc[node] / max_dc,
            depths_from_proxy[node],
            1 if node in articulation_points else 0,
            bridge_count[node],
            degree_variance,
        )

    return features


def preprocess(df):
    """
    Expand a sentence-level frame into a node-level frame for root prediction.

    Each input row holds one tree (columns 'language', 'sentence', 'n',
    'edgelist' and optionally 'root'). The result has one row per node with
    the identifying columns, the features named in _NODE_FEATURE_NAMES and,
    when 'root' is present, the 'root' column plus the binary target
    'is_root' (1 where vertex == root).

    The input frame is not modified, so the function can safely be called
    again on the same frame. 'edgelist' may hold either the raw string form
    or an already-parsed list.

    Parameters
    ----------
    df : pandas.DataFrame
        Sentence-level data.

    Returns
    -------
    pandas.DataFrame
        Node-level data sorted by language, sentence and vertex.
    """
    has_root = 'root' in df.columns

    records = []
    for _, row in df.iterrows():
        edgelist = row['edgelist']
        # Parse only when still a string; literal_eval rejects parsed lists.
        if isinstance(edgelist, str):
            edgelist = ast.literal_eval(edgelist)

        for vertex, feature_tuple in _node_features(edgelist).items():
            record = {
                'language': row['language'],
                'sentence': row['sentence'],
                'n': row['n'],
                'vertex': vertex,
            }
            record.update(zip(_NODE_FEATURE_NAMES, feature_tuple))
            if has_root:
                record['root'] = row['root']
            records.append(record)

    # Explicit columns keep the layout stable even when there are no records.
    columns = ['language', 'sentence', 'n', 'vertex', *_NODE_FEATURE_NAMES]
    if has_root:
        columns.append('root')

    binary_df = pd.DataFrame(records, columns=columns)
    binary_df = binary_df.sort_values(['language', 'sentence', 'vertex']).reset_index(drop=True)

    if has_root:
        binary_df['is_root'] = (binary_df['vertex'] == binary_df['root']).astype(int)

    return binary_df

In [None]:
df = preprocess(df)

# check with project guideline
df[(df['language'] == 'Arabic') & (df['sentence'] == 62)]

#n number of nodes, 

In [None]:
#check columns where all values are 0
df.columns[(df == 0).all()]


In [None]:
# Analyze centrality distributions: one KDE curve per centrality on shared axes
plt.figure(figsize=(12,8))
for feature in ['degree', 'harmonic', 'betweenness', 'pagerank']:
    sns.kdeplot(df[feature], label=feature)
plt.title('Centrality Distributions')
plt.legend()
plt.xlim(0, 1)  # Limit x-axis to [0, 1]; values above 1 are cut off from view
plt.show()


In [None]:
# drop root column
df = df.drop('root', axis=1)

## Data Exploration of the Expanded Data

In [None]:
# save expanded data
df.to_csv('binary_dataset.csv', index=False)

In [None]:
train_df = pd.read_csv('binary_dataset.csv')

In [None]:
train_df.info()

- change language to categorical variable


In [None]:
train_df['language'] = train_df['language'].astype('category')

In [None]:
# get number of rows and columns
train_df.shape

In [None]:
# confirm there are no missing values
train_df.isnull().sum()

In [None]:
# Check for duplicates
train_df.duplicated().sum()

In [None]:
train_df.describe()

In [None]:
# Check the correlation between the numeric columns of the reloaded node-level data.
# Uses train_df (the frame this section works on) rather than the earlier `df`.
plt.figure(figsize=(10, 8))
sns.heatmap(train_df.drop(['sentence', 'language'], axis=1).corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

In [None]:
import pandas as pd

# Drop the identifier columns, then correlate the remaining columns.
# Uses train_df (the frame this section works on) rather than the earlier `df`.
correlation_matrix = train_df.drop(['sentence', 'language'], axis=1).corr()

# Display as table and keep a copy on disk
print(correlation_matrix.round(2))
correlation_matrix.round(2).to_csv("correlation_matrix.csv")



## Resampling: Splitting Data into Train and Validation Set

To estimate the generalization error, we split the data into train and validation set. We will then use a 5-fold cross validation over the train set so we train over a good sample of the data

In [None]:
# separate features and target value
X = train_df.drop('is_root', axis=1)
y = train_df['is_root']


In [None]:
# to avoid data leakage use group split
from sklearn.model_selection import GroupShuffleSplit

# Grouping by sentence ID: all nodes that share a sentence id stay on the same
# side of the split (presumably this also keeps the same sentence in different
# languages together - verify sentence ids are shared across languages).
groups = train_df['sentence'] 

# Perform group split (80% train, 20% val); the fraction applies to groups, not rows
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

# train_idx / val_idx are positional, hence .iloc
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Verify no sentence appears in both splits
train_sentences = set(groups.iloc[train_idx])
val_sentences = set(groups.iloc[val_idx])
assert train_sentences.isdisjoint(val_sentences), "Data leakage detected!"

## Per Sentence Normalization

It is better to normalize the data since the features have different ranges. We normalize at the sentence level because different sentences have different numbers of nodes.

Note: we group by language and then by sentence. Grouping by sentence alone would scale the trees of the same sentence from all languages as one group. However, the Arabic and Turkish trees for the same sentence, for example, may have different structures, so we normalize within each (language, sentence) pair; treating them as a single group would mix languages during normalization.

In [None]:
from sklearn.preprocessing import MinMaxScaler

features = X_train.drop(['language', 'sentence', 'n', 'vertex'], axis=1).columns

def scaling(X, feature_cols=None):
    """
    Min-max scale feature columns to [0, 1] within each (language, sentence) tree.

    Every tree is scaled on its own: for each feature, the smallest value in
    the tree maps to 0 and the largest to 1. A feature that is constant within
    a tree maps to 0 (the same convention MinMaxScaler uses).

    Parameters
    ----------
    X : pandas.DataFrame
        Node-level data containing 'language', 'sentence' and the feature columns.
    feature_cols : sequence of str, optional
        Columns to scale. Defaults to the notebook-level `features` so existing
        calls keep working; pass it explicitly to avoid relying on that global.

    Returns
    -------
    pandas.DataFrame
        The untouched columns of X followed by the scaled feature columns.
        X itself is not modified.
    """
    cols = list(features if feature_cols is None else feature_cols)

    # Per-tree minimum and range, broadcast back to the original rows.
    per_tree = X.groupby(['language', 'sentence'], observed=True)[cols]
    lowest = per_tree.transform('min')
    span = (per_tree.transform('max') - lowest).astype(float)

    # Treat (near-)zero ranges as 1 so constant features become 0 rather than NaN.
    span = span.mask(span < 10 * np.finfo(float).eps, 1.0)

    normalized_ft = (X[cols] - lowest) / span
    X_ft = X.drop(columns=cols)

    return pd.concat([X_ft, normalized_ft], axis=1)

X_train_normalized = scaling(X_train)
X_val_normalized = scaling(X_val)

we fit and transform the test set independently because the sentences are different in the two sets, so we can't use the scalers used for train to transform the test set, as we performed a within sentence normalization

We only performed normalization on the centrality and structural features; the number of nodes `n` is excluded from `features` and is therefore left unscaled.

In [None]:
# drop columns not useful
X_train_normalized = X_train_normalized.drop(['language', 'sentence', 'vertex'], axis=1)
X_val_normalized = X_val_normalized.drop(['language', 'sentence', 'vertex'], axis=1)

# Imbalance Handling 

We tried undersampling and oversampling the data, but neither was helpful, so that code remains commented out here

## Undersampling 

In [None]:
"""
import ast
import networkx as nx
from tqdm import tqdm

# Function to remove bottom 25% nodes by degree
def prune_low_degree_nodes(edgelist_str, retain_percent=0.75):
    edges = ast.literal_eval(edgelist_str)
    G = nx.Graph(edges)

    if len(G.nodes) == 0:
        return edges  # nothing to do

    # Sort nodes by degree
    degrees = dict(G.degree())
    sorted_nodes = sorted(degrees.items(), key=lambda x: x[1])
    
    # Keep top N% nodes
    keep_n = int(len(sorted_nodes) * retain_percent)
    keep_nodes = set([node for node, _ in sorted_nodes[-keep_n:]])
    
    # Filter edges
    pruned_edges = [(u, v) for u, v in edges if u in keep_nodes and v in keep_nodes]
    
    return pruned_edges

tqdm.pandas()
X_train_normalized = X_train_normalized.copy()
X_train_normalized['edgelist'] = X_train_normalized.progress_apply(
    lambda row: prune_low_degree_nodes(row['edgelist']) if row['language'] == major_class else row['edgelist'],
    axis=1
)
)
"""

## Oversampling

In [None]:
"""
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Step 1: Separate features and target
X_features = X_train_normalized.drop(columns=['target', 'language', 'sentence', 'node', 'group'])  # Keep only numerical features
y_target = train_df['target']

# Save identifier columns for later merge
identifier_cols = train_df[['language', 'sentence', 'node']].reset_index(drop=True)

# Step 2: Apply RandomOverSampler to balance the minority class (target == 1)
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_balanced, y_balanced = ros.fit_resample(X_features, y_target)

# Step 3: Reattach metadata using sample indices
resampled_ids = identifier_cols.iloc[ros.sample_indices_].reset_index(drop=True)
resampled_data = pd.concat([
    resampled_ids,
    pd.DataFrame(X_balanced, columns=X_features.columns),
    pd.Series(y_balanced, name='target')
], axis=1)
"""


## Modeling

### Linear Classifiers

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score

# Logistic regression; class_weight='balanced' reweights the rare root class
lg_model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

param_grid = {
    'C': [0.01, 0.1, 1],             # Inverse regularization strength (smaller = stronger penalty)
    'penalty': ['l2'],                  # l1 requires solver='liblinear' or 'saga'
    'solver': ['lbfgs', 'liblinear'],                # both solvers support the l2 penalty
}

# Grouped 5-fold CV: folds are formed from sentence ids (see `groups` in fit below)
grid_search = GridSearchCV(
    estimator=lg_model,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),
    scoring='f1',   
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_normalized, y_train, groups=X_train['sentence'])


# Best configuration, already refit on the whole training split by GridSearchCV
logreg = grid_search.best_estimator_

# Predict on the validation split
y_pred = logreg.predict(X_val_normalized)
y_probs = logreg.predict_proba(X_val_normalized)[:, 1]  # Probabilities for root class

In [None]:
# Evaluation
print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_val, y_pred))
print("Test F1 Score:", f1_score(y_val, y_pred))


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

lda = LinearDiscriminantAnalysis()

# The 'svd' solver does not support shrinkage, so crossing every solver with
# every shrinkage value makes the svd + 'auto' fits fail. Use one sub-grid per
# solver so that only valid combinations are tried.
param_grid = [
    {'solver': ['svd']},
    {'solver': ['lsqr'], 'shrinkage': [None, 'auto']},  # float values are also allowed with 'lsqr'
]

# Grouped 5-fold CV: folds are formed from sentence ids (see `groups` in fit below)
grid_search = GridSearchCV(
    estimator=lda,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_normalized, y_train, groups=X_train['sentence'])

# Best model, already refit on the whole training split by GridSearchCV
lda_best = grid_search.best_estimator_

# Predictions on the validation split
y_pred = lda_best.predict(X_val_normalized)
y_probs = lda_best.predict_proba(X_val_normalized)[:, 1]  # Probability of positive class


In [None]:
from sklearn.metrics import classification_report, f1_score

# Evaluation for LDA
print("📌 Best Hyperparameters:", grid_search.best_params_)

print("\n📊 Classification Report (LDA):")
print(classification_report(y_val, y_pred, digits=4))

f1 = f1_score(y_val, y_pred)
print(f"\n🎯 F1 Score on Validation Set (LDA): {f1:.4f}")


## Non Linear Classifiers

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score

# Define KNN model
knn_model = KNeighborsClassifier()

# Hyperparameter grid for KNN.
# 'minkowski' is omitted: with the default p=2 it is the same distance as
# 'euclidean', so it only repeated identical fits.
param_grid = {
    'n_neighbors': [1, 3, 5],
    'metric': ['euclidean', 'manhattan']
}

# Grid search with GroupKFold (folds are formed from sentence ids)
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

# Fit model
# NOTE(review): 'n' is not scaled while the other features lie in [0, 1], so it
# may dominate the distance computation - confirm this is intended.
grid_search.fit(X_train_normalized, y_train, groups=X_train['sentence'])

# Best estimator, already refit on the whole training split by GridSearchCV
knn = grid_search.best_estimator_

# Predict on the validation split
y_pred = knn.predict(X_val_normalized)

# Probabilities are neighbour vote fractions (only 0 or 1 when n_neighbors == 1)
y_probs = knn.predict_proba(X_val_normalized)[:, 1]

# Evaluation
print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_val, y_pred))
print("Test F1 Score:", f1_score(y_val, y_pred))


In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron; random_state fixes the weight initialisation
mlp_model = MLPClassifier(random_state=42, max_iter=500)

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],  # one or two hidden layers
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],  # L2 penalty strength
    'solver': ['adam']
}

# Grouped 5-fold CV: folds are formed from sentence ids (see `groups` in fit below)
grid_search = GridSearchCV(
    estimator=mlp_model,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_normalized, y_train, groups=X_train['sentence'])

# Best configuration, already refit on the whole training split by GridSearchCV
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_val_normalized)
y_probs = best_mlp.predict_proba(X_val_normalized)[:, 1]

# Scores below are computed on the validation split (X_val_normalized / y_val)
print("Best Parameters (MLP):", grid_search.best_params_)
print(classification_report(y_val, y_pred))
print("Test F1 Score (MLP):", f1_score(y_val, y_pred))


In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis
qda_model = QuadraticDiscriminantAnalysis()

param_grid = {
    'reg_param': [0.0, 0.1, 0.5]  # Regularization of the per-class covariance estimates
}

# Grouped 5-fold CV: folds are formed from sentence ids (see `groups` in fit below)
grid_search = GridSearchCV(
    estimator=qda_model,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_normalized, y_train, groups=X_train['sentence'])

# Best configuration, already refit on the whole training split by GridSearchCV
best_qda = grid_search.best_estimator_
y_pred = best_qda.predict(X_val_normalized)
y_probs = best_qda.predict_proba(X_val_normalized)[:, 1]

# Scores below are computed on the validation split (X_val_normalized / y_val)
print("Best Parameters (QDA):", grid_search.best_params_)
print(classification_report(y_val, y_pred))
print("Test F1 Score (QDA):", f1_score(y_val, y_pred))


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes
gnb_model = GaussianNB()

# Nothing is tuned here (var_smoothing stays at its default); the empty grid
# yields a single candidate so the cell follows the same CV pattern as the others
param_grid = {}

# Grouped 5-fold CV: folds are formed from sentence ids (see `groups` in fit below)
grid_search = GridSearchCV(
    estimator=gnb_model,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_normalized, y_train, groups=X_train['sentence'])

# The single candidate, already refit on the whole training split by GridSearchCV
best_gnb = grid_search.best_estimator_
y_pred = best_gnb.predict(X_val_normalized)
y_probs = best_gnb.predict_proba(X_val_normalized)[:, 1]

# Scores below are computed on the validation split (X_val_normalized / y_val)
print("Best Parameters (GNB):", grid_search.best_params_)
print(classification_report(y_val, y_pred))
print("Test F1 Score (GNB):", f1_score(y_val, y_pred))


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree; class_weight='balanced' reweights the rare root class
dt_model = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# NOTE(review): a split needs at least 2 * min_samples_leaf samples, so with
# min_samples_leaf >= 5 the min_samples_split values 5 and 10 likely never
# bind and give duplicate candidates - confirm before extending the grid.
param_grid = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 10]
}

# Grouped 5-fold CV: folds are formed from sentence ids (see `groups` in fit below)
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_normalized, y_train, groups=X_train['sentence'])

# Best configuration, already refit on the whole training split by GridSearchCV
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_val_normalized)
y_probs = best_dt.predict_proba(X_val_normalized)[:, 1]

# Scores below are computed on the validation split (X_val_normalized / y_val)
print("Best Parameters (Decision Tree):", grid_search.best_params_)
print(classification_report(y_val, y_pred))
print("Test F1 Score (Decision Tree):", f1_score(y_val, y_pred))


## Ensemble Methods

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import Pipeline

# Initialize Random Forest with balanced class weights
rf_model = RandomForestClassifier(
    class_weight='balanced_subsample',  # Handles imbalance (weights recomputed per bootstrap sample)
    random_state=42,
    n_jobs=-1  # Parallel processing
)

# Parameter grid for tuning
# NOTE(review): a split needs at least 2 * min_samples_leaf samples, so with
# min_samples_leaf >= 6 the min_samples_split values 5 and 10 likely never
# bind and give duplicate candidates - confirm before extending the grid.
param_grid = {
    'n_estimators': [200, 250],       # Number of trees
    'max_depth': [20, 30, 50],      # Tree depth
    'min_samples_split': [5, 10],      # Minimum samples to split
    'min_samples_leaf': [6, 10],       # Minimum samples in a leaf
}

# Group-aware GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=5),  # 5-fold grouped CV
    scoring='f1',                # Focus on F1 for root class
    n_jobs=-1,                   # Parallelize
    verbose=1
)

# Fit on the feature matrix; group ids come from the un-normalized X_train,
# whose rows line up with X_train_normalized
grid_search.fit(
    X_train_normalized,  # Identifier columns were already dropped from this frame
    y_train,
    groups=X_train['sentence']  # Grouping key
)

# Best model, already refit on the whole training split by GridSearchCV
best_rf = grid_search.best_estimator_

# Predictions on the validation split
y_pred = best_rf.predict(X_val_normalized)
y_probs = best_rf.predict_proba(X_val_normalized)[:, 1]

# Evaluation (scores are computed on the validation split)
print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_val, y_pred))
print("Test F1 Score:", f1_score(y_val, y_pred))

# Feature Importance (iterating the DataFrame yields its column names)
importances = best_rf.feature_importances_
print("Feature Importances:", dict(zip(X_train_normalized, importances)))

## Fit the Chosen Model on the Entire Training Set

In [None]:
X_normalized = scaling(X).drop(['language', 'sentence', 'vertex'], axis=1)
X_normalized.head(1)

In [None]:
# Fit whole data
best_rf.fit(X_normalized, y)

## Make Predictions on Test Set

In [None]:
# read test data
# NOTE(review): the training file is read from 'data/train.csv' but this one
# from 'test.csv' - confirm which location is correct.
test_df = pd.read_csv('test.csv')
# Expand to one row per node and scale each tree on its own, as done for training
X_test = preprocess(test_df)
test_normalized = scaling(X_test).drop(['language', 'sentence', 'vertex'], axis=1)

# Predictions
test_pred = best_rf.predict(test_normalized)
test_probs = best_rf.predict_proba(test_normalized)[:, 1]  # probability of the root class

X_test['probability'] = test_probs

# For every (language, sentence) tree keep the node with the highest root
# probability, so each tree gets exactly one predicted root
df_max = X_test.loc[X_test.groupby(['language', 'sentence'])['probability'].idxmax()]

df_max = df_max[['language','sentence', 'vertex']].rename(columns={'vertex': 'root'}).reset_index(drop=True)


# Attach the predicted root to the original test rows and keep the submission columns
submission_df = test_df.merge(df_max, on=['language', 'sentence'], how='left')
submission_df = submission_df[['id', 'root']]

submission_df

Feature Importance for Random Forests

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Get feature importances and names.
# Names come from X_normalized, the matrix best_rf was last fitted on, so they
# are guaranteed to line up with the importances.
importances = best_rf.feature_importances_
feature_names = X_normalized.columns

# Create a DataFrame for better plotting
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))  # Show top 20
plt.title('Top Feature Importances (Random Forest)')
plt.tight_layout()
plt.show()


In [None]:

submission_df.to_csv('submission_randomforest_features.csv', index=False)
print("Submission file created: submission_randomforest_features.csv")


Accuracy shouldn't be used as a metric when there is class imbalance, which is why the models above are tuned and evaluated with the F1 score.