In [2]:
import requests
from bs4 import BeautifulSoup
import ast

### Web Scraping

In [6]:
page = requests.get('https://webgia.com').text
soup = BeautifulSoup(page, 'html.parser')

artists = soup.find_all('a', string='Biểu đồ giá vàng SJC 1 năm')
for artist in artists:
    names = artist.contents[0]
    fullLink = artist.get('href')
    print(names)
    print(fullLink)

    url = fullLink

<span itemprop="name">Biểu đồ giá vàng SJC 1 năm</span>
https://webgia.com/gia-vang/sjc/bieu-do-1-nam.html


In [7]:
gimme_da_data = requests.get(url)
marrow_soup = BeautifulSoup(gimme_da_data.content, 'html.parser')

scripts = marrow_soup.find_all('script', type='text/javascript')

data = scripts[1].text
i = data.index('data:[')
j = data.index(',tooltip:')
gia_vang = data[i+6:j-1]

gia_vang = ast.literal_eval(gia_vang)

true_gia_vang = []

for i in gia_vang:
    true_gia_vang.append(float(i[1]))

In [8]:
import re
import json
import pandas as pd

# The data is assumed to be stored in a variable `script_text`
script_text = data

# Step 2: Extract the `seriesOptions` data using regex
pattern = r'seriesOptions\s*=\s*(\[\{.*?\}\]);'
match = re.search(pattern, script_text, re.DOTALL)

if match:
    series_data = match.group(1)

    # Further processing to handle JavaScript-style JSON
    # 1. Replace single quotes with double quotes
    series_data = series_data.replace("'", '"')
    
    # 2. Add double quotes around unquoted keys (e.g., tooltip, shadow)
    series_data = re.sub(r'(\b\w+\b):', r'"\1":', series_data)

    # Now attempt to parse the JSON
    try:
        series_data = json.loads(series_data)

        # Step 3: Convert to a DataFrame
        data_frames = []
        for series in series_data:
            name = series['name']
            data = series['data']
            # Create a DataFrame from the data
            df = pd.DataFrame(data, columns=['Timestamp', 'Price'])
            df['Type'] = name  # Add a column for the type of data (e.g., 'Bán ra' or 'Mua vào')
            data_frames.append(df)

        # Combine both 'Bán ra' and 'Mua vào' data into a single DataFrame
        df = pd.concat(data_frames, ignore_index=True)

        # Convert the timestamp from milliseconds to a readable date format
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='ms')

        # Display the final data table
        print(df)
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", e)
else:
    print("No data found in the script text.")


              Timestamp  Price     Type
0   2023-11-13 17:00:00  70.30   Bán ra
1   2023-11-14 17:00:00  70.50   Bán ra
2   2023-11-15 17:00:00  70.35   Bán ra
3   2023-11-16 17:00:00  70.40   Bán ra
4   2023-11-17 17:00:00  70.60   Bán ra
..                  ...    ...      ...
559 2024-11-06 17:00:00  87.00  Mua vào
560 2024-11-07 17:00:00  85.50  Mua vào
561 2024-11-08 17:00:00  82.00  Mua vào
562 2024-11-09 17:00:00  82.00  Mua vào
563 2024-11-11 17:00:00  82.00  Mua vào

[564 rows x 3 columns]


### Data Preprocessing

In [9]:
df.head()

Unnamed: 0,Timestamp,Price,Type
0,2023-11-13 17:00:00,70.3,Bán ra
1,2023-11-14 17:00:00,70.5,Bán ra
2,2023-11-15 17:00:00,70.35,Bán ra
3,2023-11-16 17:00:00,70.4,Bán ra
4,2023-11-17 17:00:00,70.6,Bán ra


In [10]:
print('types of data:')
print(df.value_counts('Type'))
print('---------------------------------------------------')
print('null values:')
print(df.isnull().sum())
print('---------------------------------------------------')
print('data types:')
print(df.dtypes)

types of data:
Type
Bán ra     282
Mua vào    282
Name: count, dtype: int64
---------------------------------------------------
null values:
Timestamp    0
Price        0
Type         0
dtype: int64
---------------------------------------------------
data types:
Timestamp    datetime64[ns]
Price               float64
Type                 object
dtype: object


In [11]:
# Convert the Timestamp to datetime if it isn't already
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='ms')

# Display only the date part
df['Timestamp'] = df['Timestamp'].dt.date

# Display the DataFrame
print(df)

      Timestamp  Price     Type
0    2023-11-13  70.30   Bán ra
1    2023-11-14  70.50   Bán ra
2    2023-11-15  70.35   Bán ra
3    2023-11-16  70.40   Bán ra
4    2023-11-17  70.60   Bán ra
..          ...    ...      ...
559  2024-11-06  87.00  Mua vào
560  2024-11-07  85.50  Mua vào
561  2024-11-08  82.00  Mua vào
562  2024-11-09  82.00  Mua vào
563  2024-11-11  82.00  Mua vào

[564 rows x 3 columns]


In [12]:
dummy_variable = df['Type'].str.get_dummies()
dummy_variable.rename(columns={'Bán ra': 'Selling', 'Mua vào': 'Buying'}, inplace=True)
df = pd.concat([df, dummy_variable], axis=1)
df.drop('Type', axis=1, inplace=True)
df.head()

Unnamed: 0,Timestamp,Price,Selling,Buying
0,2023-11-13,70.3,1,0
1,2023-11-14,70.5,1,0
2,2023-11-15,70.35,1,0
3,2023-11-16,70.4,1,0
4,2023-11-17,70.6,1,0


### Model Training

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Assuming `final_df` is the DataFrame you've already processed
# Convert the Timestamp to ordinal date format for regression models
df['Date_Ordinal'] = pd.to_datetime(df['Timestamp']).map(pd.Timestamp.toordinal)

# Define features (X) and target (y)
X = df[['Date_Ordinal', 'Selling', 'Buying']]
y = df['Price']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set the degree of the polynomial features
degree = 2  # You can experiment with different degrees

# Create the polynomial regression pipeline
polynomial_regression = Pipeline([
    ("polynomial_features", PolynomialFeatures(degree=degree)),
    ("linear_regression", LinearRegression())
])

# Fit the model
polynomial_regression.fit(X_train, y_train)

# Make predictions on the test set
y_pred = polynomial_regression.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Performance on Test Set:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

# Perform cross-validation to assess model stability
cv_scores = cross_val_score(polynomial_regression, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)

print(f"Cross-Validation RMSE: {cv_rmse.mean()} ± {cv_rmse.std()}")


Model Performance on Test Set:
Mean Absolute Error (MAE): 2.6098371742494337
Root Mean Squared Error (RMSE): 3.5075522438258546
R-squared (R²): 0.4835012491909094
Cross-Validation RMSE: 5.944261653390738 ± 1.3260309412734554


In [14]:
# Example of predicting the gold price for a future date
from datetime import datetime

# Replace 'YYYY-MM-DD' with the actual date you want to predict
new_date = '2024-11-12'
date_ordinal = pd.Timestamp(new_date).toordinal()
new_data = pd.DataFrame({
    'Date_Ordinal': [date_ordinal],
    'Selling': [1],  # Adjust based on your feature
    'Buying': [0]     # Adjust based on your feature
})

predicted_price = polynomial_regression.predict(new_data)
print(f"Predicted Gold Price on {new_date}: {predicted_price[0]}")

Predicted Gold Price on 2024-11-12: 83.44285611063242
