In [9]:
# First cell: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Optional
import warnings
warnings.filterwarnings('ignore')


## 1. Data Preprocessing, Scaling, and Encoding

We first load the dataset, check for missing values or irregularities, and apply scaling where needed. 
Since the dataset is numerical and clean, no encoding is needed. Only scaling is performed where necessary 
(for example, SVM benefits from feature scaling).

We assume the target variable is `conflict` (0 or 1).


In [19]:
# Load dataset
df = pd.read_csv("MergeConflictsDataset.csv")  # Replace with your actual CSV file name

# Drop identifiers and keep only relevant features
df = df.drop(columns=["commit", "parent1", "parent2", "ancestor"])

# Check for missing values
print(df.isnull().sum())

# Feature and target separation
X = df.drop("conflict", axis=1)
y = df["conflict"]




is pr              0
added lines        0
deleted lines      0
devs parent1       0
devs parent2       0
time               0
nr files           0
added files        0
deleted files      0
renamed files      0
copied files       0
modified files     0
nr commits1        0
nr commits2        0
density1           0
density2           0
fix                0
bug                0
feature            0
improve            0
document           0
refactor           0
update             0
add                0
remove             0
use                0
delete             0
change             0
messages_min       0
messages_max       0
messages_mean      0
messages_median    0
conflict           0
dtype: int64


## 2. Train/Test Split (with Stratification)

To ensure balanced class distribution in training and test sets, we use stratified splitting. This helps in
cases where classes are imbalanced.


In [16]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features for models like SVM
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
print(X_train.shape)


(21578, 32)


## 3. Performance Metric Selection

We evaluate using accuracy, precision, recall, F1-score, and ROC-AUC. Since this is a binary classification
task, ROC-AUC is particularly useful to evaluate how well the model separates the two classes.


## 4. Feature Selection

We can use feature importance from Decision Trees to identify top features. This helps build interpretable
and slim models, especially useful when performance is close to the full model.


In [12]:
# Train a basic decision tree
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

# Get feature importances (using original column names)
importances = pd.Series(tree.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(10)
print("Top 10 important features:")
print(top_features)

# Convert scaled arrays back to DataFrames for column name access
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

# Slice by top features
top_feature_names = top_features.index.tolist()
X_train_slim = X_train_df[top_feature_names].values
X_test_slim = X_test_df[top_feature_names].values

Top 10 important features:
nr files           0.347054
is pr              0.161846
nr commits2        0.041396
messages_min       0.035067
messages_max       0.033217
deleted lines      0.030806
messages_median    0.028409
messages_mean      0.027895
time               0.027664
add                0.026304
dtype: float64


## 5. Model Selection, Evaluation, Optimization

We evaluate three models: Decision Tree, SVM (or Naive Bayes), and an ensemble method (Random Forest or AdaBoost).
For each, we perform hyperparameter tuning using GridSearchCV and avoid overfitting by using validation metrics.


In [13]:
tree_params = {'max_depth': [3, 5, 10, None]}
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_params, cv=5)
grid_tree.fit(X_train, y_train)
y_pred_tree = grid_tree.predict(X_test)

print("Best Tree Params:", grid_tree.best_params_)
print(classification_report(y_test, y_pred_tree))


Best Tree Params: {'max_depth': 5}
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5101
           1       0.67      0.61      0.64       294

    accuracy                           0.96      5395
   macro avg       0.82      0.80      0.81      5395
weighted avg       0.96      0.96      0.96      5395



In [7]:
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_svm = GridSearchCV(SVC(probability=True), svm_params, cv=5)
grid_svm.fit(X_train, y_train)
y_pred_svm = grid_svm.predict(X_test)

print("Best SVM Params:", grid_svm.best_params_)
print(classification_report(y_test, y_pred_svm))


Best SVM Params: {'C': 1, 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5101
           1       0.76      0.31      0.44       294

    accuracy                           0.96      5395
   macro avg       0.86      0.65      0.71      5395
weighted avg       0.95      0.96      0.95      5395



In [8]:
rf_params = {'n_estimators': [50, 100], 'max_depth': [5, 10, None]}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5)
grid_rf.fit(X_train, y_train)
y_pred_rf = grid_rf.predict(X_test)

print("Best RF Params:", grid_rf.best_params_)
print(classification_report(y_test, y_pred_rf))


Best RF Params: {'max_depth': None, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5101
           1       0.75      0.56      0.64       294

    accuracy                           0.97      5395
   macro avg       0.86      0.78      0.81      5395
weighted avg       0.96      0.97      0.96      5395



## 6. New Feature Proposals

We propose the following derived features:
- **commit_activity_diff** = abs(nr_commits1 - nr_commits2)
- **keyword_sum** = sum of all keyword-related counts (fix, bug, feature, ...)
- **message_length_range** = messages_max - messages_min

We re-train the best model with these features added to the dataset.


In [9]:
df["commit_activity_diff"] = abs(df["nr commits1"] - df["nr commits2"])
df["keyword_sum"] = df[["fix","bug","feature","improve","document","refactor","update","add","remove","use","delete","change"]].sum(axis=1)
df["message_length_range"] = df["messages_max"] - df["messages_min"]

# Add to X and re-scale
X_new = df.drop("conflict", axis=1)
X_new_scaled = scaler.fit_transform(X_new)

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Retrain best model (e.g., Random Forest)
grid_rf.fit(X_train_new, y_train_new)
y_pred_new = grid_rf.predict(X_test_new)

print("Performance with new features:")
print(classification_report(y_test_new, y_pred_new))


Performance with new features:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      5101
           1       0.76      0.56      0.65       294

    accuracy                           0.97      5395
   macro avg       0.87      0.78      0.82      5395
weighted avg       0.96      0.97      0.96      5395

