In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import networkx as nx  
from scipy.spatial.distance import pdist, squareform  
from tqdm import tqdm  
from pyproj import Transformer, CRS  
# import seaborn as sns
# import regex as re

# from sklearn.neighbors import KernelDensity
# pd.options.display.max_columns = None
# from sklearn import preprocessing

In [3]:

from sklearn.model_selection import train_test_split #, cross_validate 
# from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression #, Ridge  , LogisticRegression , QuantileRegressor
from sklearn.ensemble import GradientBoostingRegressor # , RandomForestRegressor , HistGradientBoostingRegressor 
from sklearn.metrics import mean_absolute_percentage_error , r2_score, mean_squared_error
# from xgboost import XGBRegressor
# from geopy.geocoders import Nominatim
# from concurrent.futures import ThreadPoolExecutor

###########################################################################################################################
from gensim.models import Word2Vec
# from gensim.scripts.glove2word2vec import glove2word2vec
# from gensim.models import KeyedVectors
# import re
# from sklearn.decomposition import PCA
# from matplotlib import pyplot as plt
# from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
import json
import plotly.express as px
import networkx as nx
from geopy.distance import geodesic
from scipy.spatial import cKDTree

# Load the real estate dataset for Mashhad, Iran

In [3]:
data= pd.read_excel('Mashhad_Real_Estate_Data.xlsx')
data.head()

Unnamed: 0,type,area_sq_m,latitude,longitude,age_years,floor_number,number_of_bedrooms,price,region,elevator,...,stone_façade,garden,direction,furnished,renovated,closet,em_mos,naghashi,property_document_type,utilities_included
0,0,73.0,36.373776,59.47582,11,1,2,2200000000,12,1,...,0,0,0,0,0,0,0,0,1,4
1,0,100.0,36.32767,59.54838,7,1,2,4800000000,2,1,...,0,0,0,0,0,0,0,0,0,3
2,0,140.0,36.310605,59.504477,0,2,2,10200000000,9,1,...,0,1,1,0,0,0,0,0,1,4
3,0,65.0,36.40054,59.388382,6,4,1,870000000,12,0,...,0,0,0,0,0,1,1,1,2,0
4,0,200.0,36.373776,59.47582,0,0,3,5350000000,12,1,...,0,0,1,0,0,0,0,0,1,4


In [4]:
data.columns

Index(['type', 'area_sq_m', 'latitude', 'longitude', 'age_years',
       'floor_number', 'number_of_bedrooms', 'price', 'region', 'elevator',
       'parking', 'storage', 'shop_or_space', 'balcony', 'full_amenities',
       'parquet', 'ceramic_flooring', 'stone_façade', 'garden', 'direction',
       'furnished', 'renovated', 'closet', 'em_mos', 'naghashi',
       'property_document_type', 'utilities_included'],
      dtype='object')

In [5]:
data.shape

(44781, 27)

In [6]:
# Check if there are any NaN values in the DataFrame
has_nan = data.isnull().any().any()
print(has_nan)  # True if there are any NaN values

False


## Filter the data to focus on properties within a specific geographic region of Mashhad
### The latitude and longitude ranges likely correspond to the city's central or most relevant area

In [7]:
filtered_data = data[(data['longitude'] >= 59.45) & (data['longitude'] <= 59.7) & 
                 (data['latitude'] >= 36.2) & (data['latitude'] <= 36.4)]
filtered_data.shape

(44670, 27)

Randomly sample 10,000 data points from the filtered dataset

This creates a smaller, manageable dataset for analysis while maintaining diversity and reducing potential bias from any inherent order in the original data

In [8]:
np.random.seed(42)
shuffle_indices = np.random.choice(np.arange(filtered_data.shape[0]), size=10000, replace=False,)
df = filtered_data.iloc[shuffle_indices].reset_index(drop=True)
len(df) #, df.head(2)

10000

Visualize the spatial distribution of the selected properties using a scatter plot

In [9]:
import plotly.express as px
px.scatter(df , x="longitude", y="latitude",  width=400, height=400)#.write_image("img/spatial_distribution.png")

Creates a NetworkX graph of houses in Mashhad

In [10]:
import numpy as np  
import pandas as pd  
import networkx as nx  
from pyproj import CRS, Transformer  
from scipy.spatial.distance import pdist, squareform  
from tqdm import tqdm  # For progress bar  

def create_mashhad_house_graph(df, distance_threshold_meters=200):  
    """  
    Creates a NetworkX graph of houses in Mashhad, using UTM projection.  

    Args:  
        df: Pandas DataFrame with 'latitude', 'longitude', and 'type' columns.  
        distance_threshold_meters: Maximum distance (in meters) between houses to be connected.  

    Returns:  
        A NetworkX graph, or None if an error occurs.  
    """  
    # Define input and output CRS (Coordinate Reference Systems)  
    in_crs = CRS.from_epsg(4326)  # WGS 84 (lat/long)  
    out_crs = CRS.from_epsg(32640)  # UTM zone 40N  

    transformer = Transformer.from_crs(in_crs, out_crs, always_xy=True)  

    # Initialize an empty graph  
    G = nx.Graph()  

    # Transform coordinates to UTM  
    coords_utm = np.array([transformer.transform(lon, lat) for lat, lon in zip(df['longitude'], df['latitude'])])  # longitude, then latitude  

    # Calculate Euclidean distances in meters  
    distances = squareform(pdist(coords_utm))  

    # Add nodes to the graph with their respective attributes from the DataFrame  
    nodes = {}  
    for index, row in df.iterrows():  
        G.add_node(index, pos=(row['latitude'], row['longitude']), Type=row['type'])  # Adjust attributes if needed  
        nodes[index] = len(nodes)  

    # Create edges based on the distance threshold  
    num_houses = len(nodes)  
    node_list = list(nodes.keys())  
    for i in tqdm(range(num_houses), desc="Building graph"):  
        for j in range(i + 1, num_houses):  
            index1 = node_list[i]  
            index2 = node_list[j]  
            if distances[i, j] <= distance_threshold_meters:  
                G.add_edge(index1, index2, weight=distances[i, j])  

    return G

# Example usage:  
# df = pd.DataFrame({...})  # Your DataFrame with required columns  
# graph = create_mashhad_house_graph(df, distance_threshold_meters=500)  
# print(f"Number of edges in the graph: {graph.number_of_edges()}")

In [11]:
# Example usage  
G = create_mashhad_house_graph(df)  
num_edges = G.number_of_edges()  
print(f"Number of edges in the graph: {num_edges}")
for node in list(G.nodes(data=True))[:5]:  # Display the first 5 nodes and their attributes  
    print(node)

Building graph: 100%|██████████| 10000/10000 [00:20<00:00, 476.73it/s]

Number of edges in the graph: 2756904
(0, {'pos': (36.290244424527, 59.507193767606), 'Type': 0.0})
(1, {'pos': (36.36002, 59.5174), 'Type': 0.0})
(2, {'pos': (36.32581, 59.58692), 'Type': 0.0})
(3, {'pos': (36.31845, 59.58768), 'Type': 0.0})
(4, {'pos': (36.30685, 59.57511), 'Type': 1.0})





In [12]:
# nx.write_sparse6(G, "mashhad_graph.s6")

In [13]:
# G = nx.read_sparse6("mashhad_graph.s6")

## Function: Fit and Evaluate a Regression Model

This function takes a regression model and training/testing data as input, fits the model, and evaluates its performance using several metrics. Here's a breakdown:

*   **Model Fitting:** The function first fits the provided `model` to the training data (`X_train`, `y_train`). This step involves the model learning patterns from the training data to make predictions.
*   **Predictions:** It then uses the fitted model to predict the target variable (`y_pred`) for the test data (`X_test`).
*   **Performance Evaluation:** The function calculates several metrics to assess the model's performance:
    *   **R-squared (R2):** Measures the proportion of variance in the target variable explained by the model. Higher R2 values indicate better fit. 
    *   **Mean Absolute Percentage Error (MAPE):** Measures the average percentage difference between the predicted and actual values. Lower MAPE values indicate better accuracy.
    *   **Mean Squared Error (MSE):** Measures the average squared difference between the predicted and actual values. Lower MSE values indicate better accuracy.
    *   **Range Accuracy:** This metric is defined within the function. It calculates the proportion of predictions that fall within a certain tolerance range of the actual values. The tolerance is set by the `tolerance` parameter (default is 0.2, or 20%). 
*   **Visualization:** The function creates a scatterplot of the actual target values (`y_test`) against the predicted values (`y_pred`). This visualization helps to visually assess the model's performance and identify any potential patterns or outliers.

## Function: Random Walk in a Graph

This function performs a random walk on a graph `G`, starting from a given node `start` and continuing for a specified `length`. Here's a breakdown:

*   **Initialization:**
    *   `walk = [str(start)]`: The function initializes a list called `walk` to store the nodes visited during the walk. It starts by adding the `start` node to the list as a string.
*   **Iteration:**
    *   `for i in range(length):`: The function iterates `length` times, representing the steps in the random walk.
        *   `neighbors = [node for node in G.neighbors(start)]`: It gets a list of neighbors of the current node `start` from the graph `G`.
        *   `next_node = np.random.choice(neighbors, 1)`: It randomly selects one of the neighbors as the `next_node` using the `np.random.choice` function.
        *   `walk.append(str(next_node))`: The `next_node` is added to the `walk` list as a string.
        *   `start = next_node`: The `next_node` becomes the new `start` node for the next step in the walk.
*   **Return:**
    *   `return walk`: The function returns the `walk` list, which contains the sequence of nodes visited during the random walk.

In [14]:
def random_walk(start, length):
    walk = [str(start)]  # starting node
    
    for i in range(length):
        neighbors = [node for node in G.neighbors(start)]
        next_node = np.random.choice(neighbors, 1)[0]
        walk.append(str(next_node))
        start = next_node
    
    return walk

In [15]:
nx.is_connected(G)

False

After sampling the dataset, we created a graph of 10,000 properties for the random walk algorithm. To ensure applicability, the graph must be connected, so we identified the largest connected subgraph. The process for extracting this component is described below:

In [16]:
import networkx as nx

# Find the largest connected component
largest_cc = max(nx.connected_components(G), key=len)

# Create a subgraph from the largest connected component
largest_subgraph = G.subgraph(largest_cc).copy()

# If you want to remove the disconnected nodes from the original graph
# G = largest_subgraph
# nx.is_connected(G)
nx.is_connected(largest_subgraph)

True

In [17]:
# Get the nodes in the largest connected component  
nodes_in_largest_cc = set(largest_subgraph.nodes())  

# Use boolean indexing to filter the DataFrame  
df_filtered = df[df.index.isin(nodes_in_largest_cc)] 

In [18]:
# Use the largest connected component graph  
# largest_subgraph = G.subgraph(largest_cc).copy()  

# You can now work with largest_subgraph for graph-related operations, such as:  
print(f"Number of nodes in the largest subgraph: {largest_subgraph.number_of_nodes()}")  
print(f"Number of edges in the largest subgraph: {largest_subgraph.number_of_edges()}")  

# And you can work with df_filtered for DataFrame-related operations:  
print(df_filtered.shape)

Number of nodes in the largest subgraph: 2262
Number of edges in the largest subgraph: 1949364
(2262, 27)


In [19]:
#Verify the number of rows:  
print(f"Original DataFrame shape: {df.shape}")  
print(f"Filtered DataFrame shape: {df_filtered.shape}")  
G.number_of_nodes(), G.number_of_edges(), \
largest_subgraph.number_of_nodes(), largest_subgraph.number_of_edges()

Original DataFrame shape: (10000, 27)
Filtered DataFrame shape: (2262, 27)


(10000, 2756904, 2262, 1949364)

In [20]:
px.scatter(df_filtered , x="longitude", y="latitude",  width=400, height=400)#.write_image("img/spatial_distribution.png")

In [21]:
# df_filtered.head(5)

In [22]:
G.nodes, largest_subgraph.nodes

(NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,

In [23]:
%%time
from tqdm import tqdm  

def generate_random_walks(subgraph, num_walks=80, walk_length=10):  
    """  
    Generate random walks for each node in the given subgraph.  

    Parameters:  
    - subgraph: The graph from which to generate random walks.  
    - num_walks: The number of random walks to generate for each node.  
    - walk_length: The length of each random walk.  

    Returns:  
    - A list of random walks.  
    """  
    print(subgraph.number_of_nodes())
    walks = []  
    for node in tqdm(subgraph.nodes, desc="Nodes"):  
        for _ in range(num_walks):  
            walks.append(random_walk(node, walk_length))  
    return walks  

CPU times: total: 0 ns
Wall time: 0 ns


In [24]:
# # Example usage:  
# # Assuming 'largest_subgraph' is your subgraph, you can call the function like this:  
# num_walks = 80  # Number of walks per node  
# walk_length = 10  # Length of each walk  
# walks = generate_random_walks(largest_subgraph, num_walks, walk_length)  

# # Print the first random walk  
# print(walks[0])

In [25]:
import gzip
import pickle

# # Save walks to a compressed file
# with gzip.open('../walks.pkl.gz', 'wb') as f:
#     pickle.dump(walks, f)


In [6]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt  
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error  
from sklearn.model_selection import train_test_split, KFold  
from sklearn.ensemble import GradientBoostingRegressor  
from gensim.models.word2vec import Word2Vec  

def fit_and_evaluate(model, X_train, y_train, X_test, y_test, filename=None, verbose=True, tolerance=0.2):  
    # Fit the model  
    model.fit(X_train, y_train)  

    # Make predictions  
    y_pred = model.predict(X_test)  

    # Metrics  
    r2 = r2_score(y_test, y_pred)  
    mape = mean_absolute_percentage_error(y_test, y_pred)  
    mse = mean_squared_error(y_test, y_pred)  

    # Range accuracy  
    correct_count = np.sum(np.abs(y_test - y_pred) <= tolerance * y_test)  
    accuracy = correct_count / len(y_test)  

    # Scatterplot  
    if verbose:  
        plt.scatter(y_test, y_pred)  
        plt.xlabel('Actual Prices')  
        plt.ylabel('Predicted Prices')  
        plt.title('Actual vs Predicted Prices')  
        if filename:  # Save only if filename is provided  
            plt.savefig(filename, format='png')  
        plt.close()  

    if verbose:  
        print(f"R2 Score: {r2}, MAPE: {mape}, MSE: {mse}, Accuracy: {accuracy}")  

    return r2, mape, accuracy, mse  

def create_word2vec_model(walks, vector_size):  
    model = Word2Vec(walks,  
                     hs=1,   # Hierarchical softmax  
                     sg=1,   # Skip-gram  
                     vector_size=vector_size,  
                     window=5,  
                     workers=4,  
                     seed=1)  
    return model  

def get_embeddings(model, G):  
    return np.array([model.wv[str(i)] for i in G.nodes()])  

def grid_search_embedding_size(df_train, embedding_sizes, random_state=42):  
    best_score = -np.inf  
    best_params = None  

    for size in embedding_sizes:  
        print(f"Evaluating embedding size: {size}")  
        G = create_mashhad_house_graph(df_train)
        walks = generate_random_walks(G)
        # Create Word2Vec model with the given embedding size  
        wv_model = create_word2vec_model(walks, size)  
        # print(G, G.nodes)
        # Get embeddings  
        embeddings = get_embeddings(wv_model, G)  

        # Merge embeddings with training features  
        embeddings_df = pd.DataFrame(embeddings, columns=[f'embedding_{i}' for i in range(embeddings.shape[1])])  
        df_with_embeddings = pd.concat([df_train.reset_index(drop=True), embeddings_df], axis=1)  

        # Prepare for Regression  
        X = df_with_embeddings.drop(['price'], axis=1)  
        y = df_with_embeddings['price']  

        # Use cross-validation to evaluate the model  
        kf = KFold(n_splits=5, shuffle=True, random_state=random_state)  
        scores = []  

        for train_index, val_index in kf.split(X):  
            X_train_kf, X_val_kf = X.iloc[train_index], X.iloc[val_index]  
            y_train_kf, y_val_kf = y.iloc[train_index], y.iloc[val_index]  

            model = GradientBoostingRegressor(loss='huber', n_estimators=100, max_depth=10, random_state=random_state)  
            r2, _, _, _ = fit_and_evaluate(model, X_train_kf, y_train_kf, X_val_kf, y_val_kf, verbose=False)  
            scores.append(r2)  

        mean_r2 = np.mean(scores)  
        print(f"Mean R2 score for embedding size {size}: {mean_r2}")  

        if mean_r2 > best_score:  
            best_score = mean_r2  
            best_params = size  

    print(f"Best embedding size: {best_params} with R2: {best_score}")  
    return best_params  


In [None]:
%%time

# print(df_filtered.shape)  
df_train, df_test = train_test_split(df_filtered, test_size=0.05, random_state=42)  

# Perform grid search on the training set  
embedding_sizes = [2, 3, 5, 10, 20, 30]  # Example sizes to test  
best_embedding_size = grid_search_embedding_size(df_train, embedding_sizes)  
best_embedding_size

Evaluating embedding size: 2


Building graph: 100%|██████████| 2148/2148 [00:04<00:00, 523.65it/s] 


2148


Nodes: 100%|██████████| 2148/2148 [05:25<00:00,  6.59it/s]


Mean R2 score for embedding size 2: 0.8151194977830711
Evaluating embedding size: 3


Building graph: 100%|██████████| 2148/2148 [00:03<00:00, 668.09it/s] 


2148


Nodes: 100%|██████████| 2148/2148 [04:58<00:00,  7.21it/s]


Mean R2 score for embedding size 3: 0.8306642741088277
Evaluating embedding size: 5


Building graph: 100%|██████████| 2148/2148 [00:03<00:00, 619.07it/s] 


2148


Nodes: 100%|██████████| 2148/2148 [05:20<00:00,  6.71it/s]


Mean R2 score for embedding size 5: 0.8191517190613291
Evaluating embedding size: 10


Building graph: 100%|██████████| 2148/2148 [00:03<00:00, 620.60it/s] 


2148


Nodes: 100%|██████████| 2148/2148 [05:15<00:00,  6.81it/s]


Mean R2 score for embedding size 10: 0.8111303568129988
Evaluating embedding size: 20


Building graph: 100%|██████████| 2148/2148 [00:03<00:00, 586.67it/s] 


2148


Nodes: 100%|██████████| 2148/2148 [05:20<00:00,  6.71it/s]


Mean R2 score for embedding size 20: 0.7710333205221941
Evaluating embedding size: 30


Building graph: 100%|██████████| 2148/2148 [00:03<00:00, 644.18it/s] 


2148


Nodes: 100%|██████████| 2148/2148 [05:10<00:00,  6.91it/s]


Mean R2 score for embedding size 30: 0.7817405813830655
Best embedding size: 3 with R2: 0.8306642741088277
CPU times: total: 44min 45s
Wall time: 40min 56s


3

In [27]:
# import pandas as pd  
# from sklearn.model_selection import train_test_split  
# from sklearn.ensemble import GradientBoostingRegressor  

# # Assuming df_filtered is already defined and includes a 'price' column  
# df_train, df_test = train_test_split(df_filtered, test_size=0.05, random_state=42)  

# Store the original indices of the train and test sets  
df_train_indices = df_train.index  
df_test_indices = df_test.index  

# We create embeddings on all train and test data  
G = create_mashhad_house_graph(df_filtered)  
walks = generate_random_walks(G)  
# Create Word2Vec model with the best embedding size  
wv_model = create_word2vec_model(walks, best_embedding_size)  

# Get embeddings from the trained Word2Vec model  
embeddings = get_embeddings(wv_model, G)  

# Create embeddings DataFrame and set the original index to match df_filtered  
embeddings_df = pd.DataFrame(embeddings, index=df_filtered.index, columns=[f'embedding_{i}' for i in range(embeddings.shape[1])])  
# Merge embeddings with the entire DataFrame without resetting the index  
df_with_embeddings = df_filtered.join(embeddings_df)  

# No need to reset indices since we keep the original ones  
# Extract the train and test data with embeddings using the original indices  
train_with_embeddings = df_with_embeddings.loc[df_train_indices]  
test_with_embeddings = df_with_embeddings.loc[df_test_indices]  

# Prepare final features and target variables based on the train_with_embeddings  
X_train_w_embeddings = train_with_embeddings.drop(['price'], axis=1)  
y_train_w_embeddings = train_with_embeddings['price']  

# Set up the test set for final evaluation using the corresponding test set  
X_test_w_embeddings = test_with_embeddings.drop(['price'], axis=1)  
y_test_w_embeddings = test_with_embeddings['price']   

Building graph: 100%|██████████| 2262/2262 [00:03<00:00, 615.32it/s] 


2262


Nodes: 100%|██████████| 2262/2262 [05:13<00:00,  7.21it/s]


In [28]:
# Drop thw last wv columns (word2vec features)
X_train_wo_embeddings = X_train_w_embeddings.iloc[:, :-best_embedding_size]
X_test_wo_embeddings = X_test_w_embeddings.iloc[:, :-best_embedding_size]

In [29]:
# List of variables to save  
data_to_save = {  
    'X_train_wo_embeddings': X_train_wo_embeddings,  
    'X_train_w_embeddings': X_train_w_embeddings,  
    'X_test_wo_embeddings': X_test_wo_embeddings,  
    'X_test_w_embeddings': X_test_w_embeddings,  
    'y_train_w_embeddings': y_train_w_embeddings,  
    'y_test_w_embeddings': y_test_w_embeddings,  
    'best_embedding_size': best_embedding_size,  
}  

# Save the variables to a pickle file  
with open('var/data.pkl', 'wb') as f:  
    pickle.dump(data_to_save, f)  

print("Variables have been saved to var/data.pkl")

Variables have been saved to var/data.pkl


In [4]:
import pickle
# Load the variables from the pickle file  
with open('var/data.pkl', 'rb') as f:  
    loaded_data = pickle.load(f)  

# Unpack the loaded data  
X_train_wo_embeddings = loaded_data['X_train_wo_embeddings']  
X_train_w_embeddings = loaded_data['X_train_w_embeddings']  
X_test_wo_embeddings = loaded_data['X_test_wo_embeddings']  
X_test_w_embeddings = loaded_data['X_test_w_embeddings']  
y_train_w_embeddings = loaded_data['y_train_w_embeddings']  
y_test_w_embeddings = loaded_data['y_test_w_embeddings']  
best_embedding_size = loaded_data['best_embedding_size']  

print("Variables have been loaded successfully!")

Variables have been loaded successfully!


In [7]:

# Fit the model on the entire training set  
model_final = GradientBoostingRegressor(loss='huber', n_estimators=100, max_depth=10, random_state=5)  
r2_with_deepwalk, mape_with_deepwalk, accuracy_with_deepwalk, mse_with_deepwalk = fit_and_evaluate(model_final,\
            X_train_w_embeddings, y_train_w_embeddings, X_test_w_embeddings, y_test_w_embeddings, 'deepwalk_pred_final.png', verbose=True)

R2 Score: 0.8792358717797251, MAPE: 0.09551626279979879, MSE: 5.065530867848121e+17, Accuracy: 0.8859649122807017


In [8]:
print(f"R2 Score: {r2_with_deepwalk:.2f}")
print(f"MAPE: {mape_with_deepwalk:.2f}")
print(f"RMSE: {np.sqrt(mse_with_deepwalk):.2f}")
print(f"Accuracy: {accuracy_with_deepwalk:.2f}")
final_score_deepwalk = (r2_with_deepwalk + (1 - mape_with_deepwalk) + accuracy_with_deepwalk) / 3
print(f'Final score: {final_score_deepwalk:.2f}')

R2 Score: 0.88
MAPE: 0.10
RMSE: 711725429.35
Accuracy: 0.89
Final score: 0.89


## Without Word2Vec

In [9]:
model = GradientBoostingRegressor( loss = 'huber' , n_estimators=100 ,  max_depth=10 , random_state=5 )
r2, mape , accuracy,mse = fit_and_evaluate(model, X_train_wo_embeddings, y_train_w_embeddings, X_test_wo_embeddings, y_test_w_embeddings,'wo-deepwalk.png')

R2 Score: 0.8699285909147727, MAPE: 0.09981387919584465, MSE: 5.45593089153026e+17, Accuracy: 0.8947368421052632


In [10]:
print(f"R2 Score: {r2:.2f}")
print(f"MAPE: {mape:.2f}")
print(f"RMSE: {np.sqrt(mse):.2f}")
print(f"acc: {accuracy:.2f}")
final_score = (r2 + (1 - mape) + accuracy) / 3
print(f'Final score: {final_score:.2f}')

R2 Score: 0.87
MAPE: 0.10
RMSE: 738642734.45
acc: 0.89
Final score: 0.89


In [None]:
(0.11- 0.09)/0.11

0.18181818181818185

In [20]:
(X_train_wo_embeddings.values.shape)

(2148, 26)

In [22]:
import numpy as np  
from scipy import stats  

# این مقادیر از مدل ها به دست آمده اند.  
n_samples = X_train_w_embeddings.shape[0]  #تعداد نمونه های داده ها  
mse_without_deepwalk = mse  # MSE مدل بدون DeepWalk  
# mse_with_deepwalk = 0.8195510061449558 # MSE مدل با DeepWalk  
num_features_without = X_train_wo_embeddings.shape[1] #تعداد ویژگی های مدل بدون DeepWalk  
num_features_with = X_train_w_embeddings.shape[1]    #تعداد ویژگی های مدل با DeepWalk  
num_added_params = num_features_with - num_features_without #تعداد پارامترهای اضافه شده (10)  


# محاسبه درجات آزادی (Degrees of Freedom)  
df_without = n_samples - num_features_without -1 # برای رگرسیون خطی  
df_with = n_samples - num_features_with -1      # برای رگرسیون خطی  

#محاسبه آماره F  
F = ((mse_without_deepwalk - mse_with_deepwalk) / num_added_params) / (mse_with_deepwalk / df_with)  

# محاسبه‌ی p-value  
p_value = 1 - stats.f.cdf(F, num_added_params, df_with)  

print(f"F-statistic: {F:.2f}")  
print(f"P-value: {p_value:.3f}")  

# تفسیر نتایج  
alpha = 0.05  
if p_value < alpha:  
    print("تفاوت بین دو مدل معنی‌دار است (p-value < 0.05).")  
else:  
    print("تفاوت بین دو مدل معنی‌دار نیست (p-value >= 0.05).")  

F-statistic: 54.41
P-value: 0.000
تفاوت بین دو مدل معنی‌دار است (p-value < 0.05).


In [24]:
import pandas as pd  
from sklearn.ensemble import RandomForestRegressor  
from sklearn.svm import SVR  

# Gradient Boosting Regressor  
model_gb = GradientBoostingRegressor(loss='huber', n_estimators=100, max_depth=10, random_state=5)  
r2_gb, mape_gb, accuracy_gb, mse_gb = fit_and_evaluate(model_gb, X_train_wo_embeddings, y_train_w_embeddings,  
                                                         X_test_wo_embeddings, y_test_w_embeddings, 'wo-deepwalk.png')  

r2_gb_deepwalk, mape_gb_deepwalk, accuracy_gb_deepwalk, mse_gb_deepwalk = fit_and_evaluate(model_gb,  
            X_train_w_embeddings, y_train_w_embeddings, X_test_w_embeddings, y_test_w_embeddings,  
            'deepwalk_pred_final.png', verbose=True)  

# Support Vector Regressor (SVR)  
model_svr = SVR(kernel='rbf')  
r2_svr, mape_svr, accuracy_svr, mse_svr = fit_and_evaluate(model_svr, X_train_wo_embeddings, y_train_w_embeddings,  
                                                             X_test_wo_embeddings, y_test_w_embeddings, 'wo-svr.png')  

r2_svr_deepwalk, mape_svr_deepwalk, accuracy_svr_deepwalk, mse_svr_deepwalk = fit_and_evaluate(model_svr,  
            X_train_w_embeddings, y_train_w_embeddings, X_test_w_embeddings, y_test_w_embeddings,  
            'deepwalk_svr_pred_final.png', verbose=True)  

# Random Forest Regressor  
model_rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=5)  
r2_rf, mape_rf, accuracy_rf, mse_rf = fit_and_evaluate(model_rf, X_train_wo_embeddings, y_train_w_embeddings,  
                                                         X_test_wo_embeddings, y_test_w_embeddings, 'wo-rf.png')  

r2_rf_deepwalk, mape_rf_deepwalk, accuracy_rf_deepwalk, mse_rf_deepwalk = fit_and_evaluate(model_rf,  
            X_train_w_embeddings, y_train_w_embeddings, X_test_w_embeddings, y_test_w_embeddings,  
            'deepwalk_rf_pred_final.png', verbose=True)  

# Create a DataFrame to store results  
results_df = pd.DataFrame({  
    'Model': ['Gradient Boosting', 'Gradient Boosting (DeepWalk)', 'SVR', 'SVR (DeepWalk)', 'Random Forest', 'Random Forest (DeepWalk)'],  
    'R2 Score': [r2_gb, r2_gb_deepwalk, r2_svr, r2_svr_deepwalk, r2_rf, r2_rf_deepwalk],  
    'MAPE': [mape_gb, mape_gb_deepwalk, mape_svr, mape_svr_deepwalk, mape_rf, mape_rf_deepwalk],  
    'Accuracy': [accuracy_gb, accuracy_gb_deepwalk, accuracy_svr, accuracy_svr_deepwalk, accuracy_rf, accuracy_rf_deepwalk],  
    'MSE': [mse_gb, mse_gb_deepwalk, mse_svr, mse_svr_deepwalk, mse_rf, mse_rf_deepwalk]  
})  

# Save the DataFrame to an Excel file  
results_df.to_excel('var/model_results.xlsx', index=False)  

print("Results have been saved to model_results.xlsx")

R2 Score: 0.8699285909147727, MAPE: 0.09981387919584465, MSE: 5.45593089153026e+17, Accuracy: 0.8947368421052632
R2 Score: 0.8792358717797251, MAPE: 0.09551626279979879, MSE: 5.065530867848121e+17, Accuracy: 0.8859649122807017
R2 Score: -0.030967822220895336, MAPE: 0.32391838121439576, MSE: 4.3244624079862446e+18, Accuracy: 0.4473684210526316
R2 Score: -0.0309678229751853, MAPE: 0.3239183814487645, MSE: 4.3244624111501635e+18, Accuracy: 0.4473684210526316
R2 Score: 0.8955686624480639, MAPE: 0.08836123622718511, MSE: 4.380441209951829e+17, Accuracy: 0.9298245614035088
R2 Score: 0.8941550035226185, MAPE: 0.0909502617038732, MSE: 4.439738064315662e+17, Accuracy: 0.9122807017543859
Results have been saved to model_results.xlsx


In [25]:
import pandas as pd  

# Load results from the Excel file  
results_df = pd.read_excel('var/model_results.xlsx')  

# Generate LaTeX longtable format  
latex_table = results_df.to_latex(index=False, escape=False, longtable=True, caption='Results of Regression Models', label='tab:regression_results')  

# Specify the filename for the LaTeX table  
latex_filename = 'var/results_table.tex'  

# Save the LaTeX table to a file  
with open(latex_filename, 'w') as f:  
    f.write(latex_table)  

print(f"LaTeX table has been saved to {latex_filename}")

LaTeX table has been saved to var/results_table.tex


In [26]:
import matplotlib.pyplot as plt  

# Set up the environment for PGF  
import matplotlib as mpl  
mpl.use('pgf')  

# You can set font and other parameters as needed  
mpl.rcParams.update({  
    "pgf.texsystem": "pdflatex",  
    "font.family": "serif",  
    "font.serif": [],  
    "font.size": 12,  
    "pgf.rcfonts": False,  
})  

# Create and save plots  
def plot_results(results_df):  
    metrics = ['R2 Score', 'MAPE', 'Accuracy', 'MSE']  
    models = results_df['Model']  

    for metric in metrics:  
        plt.figure(figsize=(10, 5))  
        plt.bar(models, results_df[metric], color='skyblue')  
        plt.title(f'{metric} of Regression Models')  
        plt.xticks(rotation=45)  
        plt.ylabel(metric)  
        plt.tight_layout()  
        plt.savefig(f'var/{metric.replace(" ", "_").lower()}.pgf')  # Save as PGF files  
        plt.close()  

plot_results(results_df)