In [3]:
# Optional one-time setup: uncomment the line below to install csrgraph.
# NOTE(review): presumably needed by the embedding helpers imported from `utils` - confirm,
# and prefer `%pip` with a pinned version so the install targets this kernel's environment.
# !pip install csrgraph

In [4]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from utils import load_dataset, create_graph_from_dataframe, fit_and_evaluate, grid_search_embedding_size

# Dataset selection: "CA" is active; "MHD" is the alternative noted by the author.
dataset_name = "CA"

# Embedding dimensions to evaluate. A larger candidate grid, [8, 16, 32, 64],
# was left commented out by the author.
embedding_sizes = [2, 5]


In [5]:
# Load the dataset chosen by `dataset_name`. `load_dataset` returns three values: the
# DataFrame plus `numeric_features` and `threshold`, which are forwarded unchanged to
# the graph builder.
# NOTE(review): presumably `threshold` decides which rows get connected by an edge - confirm in utils.
df, numeric_features, threshold = load_dataset(dataset_name)
G = create_graph_from_dataframe(df, numeric_features, threshold)

# Sanity summary: row count next to the graph's node and edge counts.
print(f"Dataset {dataset_name}: {df.shape[0]} samples, {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
# Last expression of the cell, so the first rows are rendered as its output.
df.head()


Building graph: 100%|██████████| 534477/534477 [00:12<00:00, 42167.60it/s]


Dataset CA: 20433 samples, 20433 nodes, 534477 edges


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,price,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,id
0,-122.23,37.88,0.982163,-0.803813,-0.970325,-0.97332,-0.976833,2.345163,452600,False,False,True,False,0
1,-122.22,37.86,-0.60621,2.04213,1.348276,0.861339,1.670373,2.332632,358500,False,False,True,False,1
2,-122.24,37.85,1.855769,-0.535189,-0.825561,-0.819769,-0.843427,1.782939,352100,False,False,True,False,2
3,-122.25,37.85,1.855769,-0.62351,-0.718768,-0.765056,-0.733562,0.93297,341300,False,False,True,False,3
4,-122.25,37.85,1.855769,-0.46197,-0.611974,-0.758879,-0.62893,-0.013143,342200,False,False,True,False,4


In [6]:
# Baseline: fit each regressor on the tabular columns only (no graph embeddings).
# The target `price` and the `id` column are removed from the feature matrix.
y = df['price']
X_base = df.drop(columns=['price', 'id'])
X_train_base, X_test_base, y_train, y_test = train_test_split(
    X_base,
    y,
    test_size=0.1,
    random_state=42,
)

# (display name, unfitted estimator) pairs, all scored on the same split.
baseline_models = (
    ("GradientBoosting", GradientBoostingRegressor(random_state=42)),
    ("LinearRegression", LinearRegression()),
    ("RandomForest", RandomForestRegressor(random_state=42)),
)

# Start the shared results list afresh; each row is [label, *metrics].
results = []
for model_name, model in baseline_models:
    metrics = fit_and_evaluate(
        model, X_train_base, y_train, X_test_base, y_test, verbose=True
    )
    results.append([f"{model_name} (Raw)", *metrics])


R2: 0.787, MAPE: 0.211, RMSE: 54214.653, Acc: 0.607
R2: 0.662, MAPE: 0.287, RMSE: 68215.591, Acc: 0.473
R2: 0.838, MAPE: 0.166, RMSE: 47200.554, Acc: 0.727


In [None]:
# Run the embedding-size search with method="deepwalk" over `embedding_sizes`.
# Four values come back: the size later marked as "Best" in the plot, the feature
# matrix and target reused for the final model comparison, and a results table that
# is indexed below by "Embedding Size" and "rmse".
# NOTE(review): the captured progress log shows roughly 10 minutes per size on CA -
# consider caching the result so a full re-run stays feasible.
best_dw_size, X_dw, y_dw, dw_results = grid_search_embedding_size(df, G, embedding_sizes, method="deepwalk", dataset_name=dataset_name)
# Last expression of the cell, so the results table is rendered as its output.
dw_results

[deepwalk] Evaluating embedding size: 2


DeepWalk Nodes: 100%|██████████| 20433/20433 [09:52<00:00, 34.51it/s] 


[deepwalk] Evaluating embedding size: 5


DeepWalk Nodes:  14%|█▍        | 2840/20433 [01:23<08:19, 35.22it/s]

In [None]:
# DeepWalk: RMSE for each evaluated embedding size; the dashed red line marks the
# size returned as best by the search.
fig, ax = plt.subplots()
ax.plot(dw_results["Embedding Size"], dw_results["rmse"], marker="o")
ax.axvline(best_dw_size, color="r", linestyle="--", label=f"Best={best_dw_size}")
ax.set_title("DeepWalk - Embedding Size vs RMSE")
ax.set_xlabel("Embedding Size")
ax.set_ylabel("RMSE")
ax.legend()
plt.show()

In [None]:
# Run the embedding-size search with method="node2vec" over `embedding_sizes`.
# Four values come back: the size later marked as "Best" in the plot, the feature
# matrix and target reused for the final model comparison, and a results table that
# is indexed below by "Embedding Size" and "rmse".
best_n2v_size, X_n2v, y_n2v, n2v_results = grid_search_embedding_size(df, G, embedding_sizes, method="node2vec", dataset_name=dataset_name)
# Last expression of the cell, so the results table is rendered as its output.
n2v_results

In [None]:
# Node2Vec: RMSE for each evaluated embedding size; the dashed red line marks the
# size returned as best by the search.
fig, ax = plt.subplots()
ax.plot(n2v_results["Embedding Size"], n2v_results["rmse"], marker="o")
ax.axvline(best_n2v_size, color="r", linestyle="--", label=f"Best={best_n2v_size}")
ax.set_title("Node2Vec - Embedding Size vs RMSE")
ax.set_xlabel("Embedding Size")
ax.set_ylabel("RMSE")
ax.legend()
plt.show()


In [None]:
# Score the same three regressors on each embedding-based feature set and append one
# row per (model, feature set) to the shared `results` list.

# Keep the cell idempotent: remove rows that an earlier execution of this cell added,
# otherwise every re-run would append duplicate rows that end up in the exported table.
embedding_tags = ("(DeepWalk-", "(Node2Vec-")
results[:] = [
    row for row in results
    if not any(tag in row[0] for tag in embedding_tags)
]

# NOTE: this loop rebinds the notebook-level names X, y, X_train, X_test, y_train and
# y_test; after the cell they refer to the last (Node2Vec) feature set.
for (label, X, y) in [
    (f"DeepWalk-{best_dw_size}", X_dw, y_dw),
    (f"Node2Vec-{best_n2v_size}", X_n2v, y_n2v),
]:
    # Fixed test fraction and seed so every feature set is split the same way.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    # Fresh estimator instances are created for every feature set.
    for model_name, model in [
        ("GradientBoosting", GradientBoostingRegressor(random_state=42)),
        ("LinearRegression", LinearRegression()),
        ("RandomForest", RandomForestRegressor(random_state=42)),
    ]:
        metrics = fit_and_evaluate(model, X_train, y_train, X_test, y_test, verbose=True)
        results.append([f"{model_name} ({label})", *metrics])

In [None]:
# Collect every evaluation row into one table, save it to Excel, then display it.
# NOTE(review): the column order must match the order of the values returned by
# `fit_and_evaluate`. Its verbose log prints R2, MAPE, RMSE, Acc in that order, while
# the columns below list Accuracy before RMSE - confirm against utils before trusting
# the Accuracy/RMSE columns.
results_columns = ["Model", "R2", "MAPE", "Accuracy", "RMSE", "MSE_log"]
results_df = pd.DataFrame(results, columns=results_columns)

# Create the output folder if it is missing so the export cannot fail on a fresh checkout.
output_path = Path("results") / dataset_name / "final_results_notebook.xlsx"
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_excel(output_path, index=False)

# Must be the last expression: a bare name in the middle of a cell is not rendered.
results_df
