In [21]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import KBinsDiscretizer

# Load the data
df = pd.read_csv('road_transport_records.csv')

# Feature selection
X = df.drop(['Road_ID', 'AccidentRisk'], axis=1)
y = df['AccidentRisk']
mi_scores = mutual_info_classif(X, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
selected_features = mi_scores[mi_scores > mi_scores.median()].index.tolist()

# Discretization
kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
X_binned = pd.DataFrame(kbd.fit_transform(X[selected_features]), columns=selected_features)

# ID3 Algorithm Implementation
class Node:
    def __init__(self, attribute=None, label=None, branches=None):
        self.attribute = attribute
        self.label = label
        self.branches = branches or {}

def entropy(y):
    counter = Counter(y)
    probs = [count / len(y) for count in counter.values()]
    return -sum(p * np.log2(p) for p in probs if p > 0)

def information_gain(X, y, attribute):
    total_entropy = entropy(y)
    weighted_entropy = 0
    for value in X[attribute].unique():
        subset = y[X[attribute] == value]
        weighted_entropy += len(subset) / len(y) * entropy(subset)
    return total_entropy - weighted_entropy

def id3(X, y, attributes, max_depth=3, min_samples=5):
    if len(set(y)) == 1:
        return Node(label=y.iloc[0])
    
    if not attributes or len(y) < min_samples or max_depth == 0:
        return Node(label=Counter(y).most_common(1)[0][0])
    
    best_attribute = max(attributes, key=lambda a: information_gain(X, y, a))
    node = Node(attribute=best_attribute)
    
    for value in X[best_attribute].unique():
        subset_X = X[X[best_attribute] == value].drop(columns=[best_attribute])
        subset_y = y[X[best_attribute] == value]
        new_attributes = [a for a in attributes if a != best_attribute]
        node.branches[value] = id3(subset_X, subset_y, new_attributes, max_depth-1, min_samples)
    
    return node

def predict(node, instance):
    if node.label is not None:
        return node.label
    value = instance[node.attribute]
    if value not in node.branches:
        return max(Counter(df['AccidentRisk']).items(), key=lambda x: x[1])[0]
    return predict(node.branches[value], instance)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_binned, y, test_size=0.2, random_state=42)

# Train the model
attributes = list(X_train.columns)
root = id3(X_train, y_train, attributes, max_depth=3)

# Make predictions
y_pred = [predict(root, row) for _, row in X_test.iterrows()]

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print summary and results
print("Selected Features:", selected_features)
print("Data Summary:")
print(df[selected_features + ['AccidentRisk']].describe())
print("\nDecision Tree Accuracy:", accuracy)
print("\nComparison of Predicted vs Original Values:")
comparison = pd.DataFrame({'Original': y_test, 'Predicted': y_pred})
print(comparison.head(20))


Selected Features: ['Number of Bends']
Data Summary:
       Number of Bends
count        50.000000
mean          5.100000
std           1.897904
min           2.000000
25%           4.000000
50%           5.000000
75%           7.000000
max           8.000000

Decision Tree Accuracy: 1.0

Comparison of Predicted vs Original Values:
   Original Predicted
13     High      High
39   Medium    Medium
30     High      High
45      Low       Low
17      Low       Low
48     High      High
26      Low       Low
25   Medium    Medium
32   Medium    Medium
19   Medium    Medium


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('road_transport_records.csv')

# Prepare the data
X = df.drop(['Road_ID', 'AccidentRisk'], axis=1)
y = df['AccidentRisk']

# Encode categorical variables
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model using sklearn's DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Visualize the decision tree using matplotlib
plt.figure(figsize=(20,10))
plot_tree(clf, 
          feature_names=X.columns,  
          class_names=le.classes_,
          filled=True,
          rounded=True,
          fontsize=10)
plt.title("Decision Tree for Accident Risk", fontsize=20)
plt.savefig("decision_tree.png", dpi=300, bbox_inches="tight")
plt.close()