In [1]:
import numpy as np
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler  
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession

In [2]:
# Single-JVM ("local") Spark session whose default Hadoop filesystem is the
# HDFS namenode, so scheme-less paths used later are resolved on HDFS.
spark = (
    SparkSession.builder
    .appName("Read HDFS Weather Data")
    .master("local")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)

In [3]:


def data_preprocess(location, target_variable='all', lag_steps=3):
    """
    Load one location's weather history from HDFS and build next-step training tables.

    The CSV is read with every column as a string, unit markers are stripped,
    the measurements are cast to float, the textual ``Weather`` description is
    bucketed into three integer classes, and ``lag_steps`` lagged copies of each
    measurement are added. Every feature row is then paired with the value
    observed in the *following* row, which becomes the prediction target.

    Args:
        location: File stem of the CSV under
            ``hdfs://namenode:9000/tmp/weather_data/history/``.
        target_variable: ``'weather'``, ``'temperature'``, ``'cloud'`` or
            ``'all'`` (default) to select which training tables are built.
        lag_steps: Number of lagged copies of each measurement to add
            (default 3, the value that used to be hard-coded).

    Returns:
        dict mapping ``'weather'`` / ``'temperature'`` / ``'cloud'`` to a pandas
        DataFrame. The weather table carries the next-step class in ``Weather``
        (replacing the current-step value); the other two keep the current
        ``Weather`` class as a feature and add ``Future_Temp`` /
        ``Future_Cloud`` respectively.

    Raises:
        ValueError: If ``target_variable`` is not one of the accepted values
            (previously an empty dict was returned and callers failed later
            with a ``KeyError``), or if ``lag_steps`` is negative.
    """
    valid_targets = ('all', 'weather', 'temperature', 'cloud')
    if target_variable not in valid_targets:
        raise ValueError(
            f"Unknown target_variable {target_variable!r}; expected one of {valid_targets}"
        )
    if lag_steps < 0:
        raise ValueError(f"lag_steps must be non-negative, got {lag_steps}")

    # Read data from HDFS. Everything is loaded as strings (inferSchema off)
    # because the measurement cells carry unit text that is stripped below.
    # multiLine is on; the 'Rain' cleanup removes a "\nmm" marker, so cells
    # apparently contain embedded newlines.
    df = (
        spark.read
        .option("multiLine", True)
        .option("header", True)
        .option("inferSchema", False)
        .option("encoding", "utf-8")
        .csv(f"hdfs://namenode:9000/tmp/weather_data/history/{location}.csv")
        .toPandas()
    )

    # Unit text to remove from each raw measurement column before casting.
    unit_markers = {
        'Temp': '°c',
        'Rain': '\nmm',
        'Cloud': '%',
        'Pressure': 'mb',
        'Wind': 'km/h',
        'Gust': 'km/h',
    }
    for column, marker in unit_markers.items():
        # regex=False: the markers are literal text; being explicit avoids the
        # pandas-version-dependent default of `regex`.
        df[column] = df[column].str.replace(marker, '', regex=False).str.strip()

    df = df.astype({
        'Date': 'datetime64[ns]',
        **{column: 'float64' for column in unit_markers},
    })

    # 'Gust' keeps its original capitalised name.
    df = df.rename(columns={
        'Temp': 'temp',
        'Rain': 'rain',
        'Cloud': 'cloud',
        'Pressure': 'pressure',
        'Wind': 'windspeed',
    })

    # Bucket the textual description into three integer classes.
    weather_classes = {
        0: ['Sunny', 'Clear', 'Partly cloudy'],
        1: ['Overcast', 'Cloudy', 'Patchy rain possible', 'Light drizzle',
            'Light rain shower', 'Patchy light rain with thunder'],
        2: ['Heavy rain at times', 'Moderate or heavy rain shower',
            'Moderate rain at times', 'Moderate rain'],
    }
    conditions = [df['Weather'].isin(names) for names in weather_classes.values()]
    # NOTE(review): descriptions not listed above fall through to class 0,
    # i.e. they are labelled like 'Sunny'/'Clear' — confirm this is intended.
    df['Weather'] = np.select(conditions, list(weather_classes), default=0)

    # Lagged copies of every measurement. Rows are lagged in file order; the
    # frame is not sorted by Date/Time here, so the CSV is assumed to be
    # chronological — TODO confirm.
    lagged_columns = ['temp', 'rain', 'cloud', 'pressure', 'windspeed', 'Gust']
    for lag in range(1, lag_steps + 1):
        for column in lagged_columns:
            df[f'{column}_lag_{lag}'] = df[column].shift(lag)

    # Targets: the value one row after each feature row. Features start at row
    # `lag_steps` (earlier rows have NaN lags) and stop one row before the end,
    # so both slices have the same length and line up after the index reset.
    future_temp = df['temp'].iloc[lag_steps + 1:].reset_index(drop=True)
    future_cloud = df['cloud'].iloc[lag_steps + 1:].reset_index(drop=True)
    future_weather = df['Weather'].iloc[lag_steps + 1:].reset_index(drop=True)

    # Date/time columns are not used as features.
    features = (
        df.drop(columns=['Date', 'Time'])
        .iloc[lag_steps:-1]
        .reset_index(drop=True)
    )

    result = {}

    # Prepare datasets for each requested target variable.
    if target_variable in ('all', 'weather'):
        # Overwrites the current-step class with the next-step class (the label).
        result['weather'] = features.assign(Weather=future_weather)

    if target_variable in ('all', 'temperature'):
        result['temperature'] = features.assign(Future_Temp=future_temp)

    if target_variable in ('all', 'cloud'):
        result['cloud'] = features.assign(Future_Cloud=future_cloud)

    return result


In [4]:
def build_weather_prediction_model(provinces, seed=42):
    """
    Train and evaluate a random-forest classifier for the next-step weather class.

    The 'weather' table from ``data_preprocess`` is built for every location,
    the tables are stacked, and a pipeline (VectorAssembler -> StandardScaler
    -> RandomForestClassifier) is fitted with 5-fold cross-validation on an
    80 % training split. Accuracy is reported on the held-out 20 %.

    Args:
        provinces: Iterable of location names accepted by ``data_preprocess``.
        seed: Seed passed to the train/test split, the forest and the
            cross-validator so that repeated runs on the same data give the
            same metrics (default 42, the value the split already used).

    Returns:
        ``(cv_model, accuracy)`` — the fitted CrossValidatorModel and its
        accuracy on the test split.

    Raises:
        ValueError: If ``provinces`` is empty.
    """
    # Build every per-location table first and concatenate once; growing a
    # DataFrame with concat inside the loop is quadratic and starts from an
    # empty frame, which newer pandas versions warn about.
    frames = [data_preprocess(location, 'weather')['weather'] for location in provinces]
    if not frames:
        raise ValueError("provinces must contain at least one location")
    df_pd = pd.concat(frames, ignore_index=True)

    df = spark.createDataFrame(df_pd)

    # Every column except the label is a feature.
    feature_cols = [col for col in df_pd.columns if col != 'Weather']

    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_assembled")
    scaler = StandardScaler(inputCol="features_assembled", outputCol="features")

    # Random Forest classifier for weather condition prediction
    rf = RandomForestClassifier(labelCol="Weather", featuresCol="features", seed=seed)

    # Define pipeline
    pipeline = Pipeline(stages=[assembler, scaler, rf])

    # The grid holds a single combination, so cross-validation here only
    # estimates that configuration's score rather than choosing between options.
    paramGrid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [50])
        .addGrid(rf.maxDepth, [5])
        .addGrid(rf.minInstancesPerNode, [2])
        .build())

    # Evaluation & cross validation
    evaluator = MulticlassClassificationEvaluator(labelCol="Weather", predictionCol="prediction", metricName="accuracy")
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5,
                        seed=seed)

    # Split train/test
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=seed)

    # Fit model
    cv_model = cv.fit(train_data)

    # Evaluate model on the held-out split
    predictions = cv_model.transform(test_data)
    accuracy = evaluator.evaluate(predictions)
    print(f"Weather Prediction Accuracy = {accuracy}")

    return cv_model, accuracy

In [5]:
def build_temperature_prediction_model(provinces, seed=42):
    """
    Train and evaluate a random-forest regressor for the next-step temperature.

    The 'temperature' table from ``data_preprocess`` is built for every
    location, the tables are stacked, and a pipeline (VectorAssembler ->
    StandardScaler -> RandomForestRegressor) is fitted with 5-fold
    cross-validation on an 80 % training split. RMSE is reported on the
    held-out 20 %.

    Args:
        provinces: Iterable of location names accepted by ``data_preprocess``.
        seed: Seed passed to the train/test split, the forest and the
            cross-validator so that repeated runs on the same data give the
            same metrics (default 42, the value the split already used).

    Returns:
        ``(cv_model, rmse)`` — the fitted CrossValidatorModel and its RMSE on
        the test split.

    Raises:
        ValueError: If ``provinces`` is empty.
    """
    # Build every per-location table first and concatenate once; growing a
    # DataFrame with concat inside the loop is quadratic and starts from an
    # empty frame, which newer pandas versions warn about.
    frames = [data_preprocess(location, 'temperature')['temperature'] for location in provinces]
    if not frames:
        raise ValueError("provinces must contain at least one location")
    df_pd = pd.concat(frames, ignore_index=True)

    # Convert pandas DataFrame to Spark DataFrame
    df = spark.createDataFrame(df_pd)

    # Define feature columns (all except 'Future_Temp')
    feature_cols = [col for col in df_pd.columns if col != 'Future_Temp']

    # Create feature vector
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_assembled")
    scaler = StandardScaler(inputCol="features_assembled", outputCol="features")

    # Random Forest regressor for temperature prediction
    rf = RandomForestRegressor(labelCol="Future_Temp", featuresCol="features", seed=seed)

    # Define pipeline
    pipeline = Pipeline(stages=[assembler, scaler, rf])

    # The grid holds a single combination, so cross-validation here only
    # estimates that configuration's score rather than choosing between options.
    paramGrid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [50])
        .addGrid(rf.maxDepth, [5])
        .addGrid(rf.minInstancesPerNode, [2])
        .build())

    # Evaluation & cross validation
    evaluator = RegressionEvaluator(labelCol="Future_Temp", predictionCol="prediction", metricName="rmse")
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5,
                        seed=seed)

    # Split train/test
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=seed)

    # Fit model
    cv_model = cv.fit(train_data)

    # Evaluate model on the held-out split
    predictions = cv_model.transform(test_data)
    rmse = evaluator.evaluate(predictions)
    print(f"Temperature Prediction RMSE = {rmse}")

    return cv_model, rmse

In [6]:
def build_cloud_prediction_model(provinces, seed=42):
    """
    Train and evaluate a random-forest regressor for the next-step cloud cover.

    The 'cloud' table from ``data_preprocess`` is built for every location,
    the tables are stacked, and a pipeline (VectorAssembler -> StandardScaler
    -> RandomForestRegressor) is fitted with 5-fold cross-validation on an
    80 % training split. RMSE is reported on the held-out 20 %.

    Args:
        provinces: Iterable of location names accepted by ``data_preprocess``.
        seed: Seed passed to the train/test split, the forest and the
            cross-validator so that repeated runs on the same data give the
            same metrics (default 42, the value the split already used).

    Returns:
        ``(cv_model, rmse)`` — the fitted CrossValidatorModel and its RMSE on
        the test split.

    Raises:
        ValueError: If ``provinces`` is empty.
    """
    # Build every per-location table first and concatenate once; growing a
    # DataFrame with concat inside the loop is quadratic and starts from an
    # empty frame, which newer pandas versions warn about.
    frames = [data_preprocess(location, 'cloud')['cloud'] for location in provinces]
    if not frames:
        raise ValueError("provinces must contain at least one location")
    df_pd = pd.concat(frames, ignore_index=True)

    # Convert pandas DataFrame to Spark DataFrame
    df = spark.createDataFrame(df_pd)

    # Define feature columns (all except 'Future_Cloud')
    feature_cols = [col for col in df_pd.columns if col != 'Future_Cloud']

    # Create feature vector
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_assembled")
    scaler = StandardScaler(inputCol="features_assembled", outputCol="features")

    # Random Forest regressor for cloud cover prediction
    rf = RandomForestRegressor(labelCol="Future_Cloud", featuresCol="features", seed=seed)

    # Define pipeline
    pipeline = Pipeline(stages=[assembler, scaler, rf])

    # The grid holds a single combination, so cross-validation here only
    # estimates that configuration's score rather than choosing between options.
    paramGrid = (ParamGridBuilder()
        .addGrid(rf.numTrees, [50])
        .addGrid(rf.maxDepth, [5])
        .addGrid(rf.minInstancesPerNode, [2])
        .build())

    # Evaluation & cross validation
    evaluator = RegressionEvaluator(labelCol="Future_Cloud", predictionCol="prediction", metricName="rmse")
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5,
                        seed=seed)

    # Split train/test
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=seed)

    # Fit model
    cv_model = cv.fit(train_data)

    # Evaluate model on the held-out split
    predictions = cv_model.transform(test_data)
    rmse = evaluator.evaluate(predictions)
    print(f"Cloud Cover Prediction RMSE = {rmse}")

    return cv_model, rmse

In [7]:
def build_combined_weather_models(provinces):
    """
    Train the weather-class, temperature and cloud-cover models in that order,
    print a one-line-per-model performance summary, and return the fitted
    cross-validated models keyed by "weather_model", "temperature_model" and
    "cloud_model".
    """
    weather_cv, weather_acc = build_weather_prediction_model(provinces)
    temperature_cv, temperature_err = build_temperature_prediction_model(provinces)
    cloud_cv, cloud_err = build_cloud_prediction_model(provinces)

    # Metrics are formatted to four decimals; the unit suffix is part of the text.
    summary = (
        "Model Performance Summary:",
        f"Weather Condition Prediction Accuracy: {weather_acc:.4f}",
        f"Temperature Prediction RMSE: {temperature_err:.4f}°C",
        f"Cloud Cover Prediction RMSE: {cloud_err:.4f}%",
    )
    for line in summary:
        print(line)

    fitted = {}
    fitted["weather_model"] = weather_cv
    fitted["temperature_model"] = temperature_cv
    fitted["cloud_model"] = cloud_cv
    return fitted


if __name__ == "__main__":
    # Locations to train on; each name must match a CSV file stem readable by
    # data_preprocess. Replace with the actual province names as needed.
    provinces = [
        "ha-noi",
        "ho-chi-minh-city",
        "vinh",
    ]
    # `models` is kept at module level because the saving cells below read it.
    models = build_combined_weather_models(provinces)

Weather Prediction Accuracy = 0.7251527939464494
Temperature Prediction RMSE = 1.9277850996300645
Cloud Cover Prediction RMSE = 19.25699001619655
Model Performance Summary:
Weather Condition Prediction Accuracy: 0.7252
Temperature Prediction RMSE: 1.9278°C
Cloud Cover Prediction RMSE: 19.2570%


In [8]:
# Persist the best pipeline of every cross-validated model under one HDFS directory.
model_path_org = "hdfs://namenode:9000/model/rf_model"

for name, model in models.items():
    model_path = f"{model_path_org}/{name}"
    best_model = model.bestModel
    # A plain save() raises once the target path exists, which made this cell
    # fail on every re-run; overwrite() keeps it repeatable.
    best_model.write().overwrite().save(model_path)
    print(f"Đã lưu {name}")

Đã lưu weather_model
Đã lưu temperature_model
Đã lưu cloud_model


In [19]:
# NOTE(review): despite the variable name, this scheme-less relative path is
# handed to Spark as-is. The session sets spark.hadoop.fs.defaultFS to HDFS, so
# the models presumably end up on HDFS rather than on the local disk — confirm.
# Use an explicit file:// URI when a local copy is wanted.
model_path_local = "model"

for name, model in models.items():
    model_path = f"{model_path_local}/{name}"
    best_model = model.bestModel
    # A plain save() raises once the target path exists, which made this cell
    # fail on every re-run; overwrite() keeps it repeatable.
    best_model.write().overwrite().save(model_path)
    print(f"Đã lưu {name}")

Đã lưu weather_model
Đã lưu temperature_model
Đã lưu cloud_model


In [20]:
import os

# Save a copy of every best pipeline on the local filesystem of the notebook container.
model_path_local = "/home/jovyan/notebooks/model"  # or another absolute path inside the container
os.makedirs(model_path_local, exist_ok=True)

for name, model in models.items():
    # Note the explicit file:// scheme on the target path.
    model_path = f"file://{model_path_local}/{name}"
    best_model = model.bestModel
    # A plain save() raises once the target path exists, which made this cell
    # fail on every re-run; overwrite() keeps it repeatable.
    best_model.write().overwrite().save(model_path)
    print(f"Đã lưu {name} vào {model_path}")


Đã lưu weather_model vào file:///home/jovyan/notebooks/model/weather_model
Đã lưu temperature_model vào file:///home/jovyan/notebooks/model/temperature_model
Đã lưu cloud_model vào file:///home/jovyan/notebooks/model/cloud_model
