In [4]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
import pandas as pd
import numpy as np

In [None]:
# Create (or reuse) the Spark session: connect to the master at spark-master:7077
# and make the HDFS namenode the default Hadoop filesystem.
spark = (
    SparkSession.builder
    .appName("Read HDFS Weather Data")
    .master("spark://spark-master:7077")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .getOrCreate()
)

In [None]:
def data_preprocess(location, lag_steps=6):
    """Load one location's weather history from HDFS and build a lagged feature table.

    The CSV at ``/tmp/weather_data/history/{location}.csv`` is read through the
    module-level ``spark`` session with every column as a string, converted to
    pandas, and then:

    1. unit suffixes are stripped from the measurement columns and the values
       are cast to ``float64``;
    2. the ``Weather`` description is mapped to a class id (0, 1 or 2);
    3. for every measurement, ``lag_steps`` previous readings are appended as
       ``<Name>_t-<k>`` columns;
    4. each row is paired with the ``Weather`` class of the *following* row.

    NOTE(review): rows are used in file order; nothing here sorts by
    ``Date``/``Time``, so the lag columns assume the file is already
    chronological -- confirm.

    Args:
        location: File stem of the CSV to load (e.g. ``'vinh'``).
        lag_steps: Number of previous readings to add per measurement.
            Defaults to 6, the value that used to be hard-coded here.

    Returns:
        pandas.DataFrame holding the six measurement columns, their lag
        columns and a trailing ``Weather`` label column. It has
        ``n_rows - lag_steps - 1`` rows (empty when the file is too short).

    Raises:
        ValueError: If ``lag_steps`` is negative.
    """
    if lag_steps < 0:
        raise ValueError(f"lag_steps must be non-negative, got {lag_steps}")

    df = (
        spark.read.option("multiLine", True)
        .option("header", True)
        .option("inferSchema", False)
        .option("encoding", "utf-8")
        .csv(f"hdfs://namenode:9000/tmp/weather_data/history/{location}.csv")
        .toPandas()
    )

    # Raw column -> (unit suffix to strip, name of the cleaned column).
    measurements = {
        'Temp': ('°c', 'Temp(°c)'),
        'Rain': ('\nmm', 'Rain(nmm)'),
        'Cloud': ('%', 'Cloud(%)'),
        'Pressure': ('mb', 'Pressure(mb)'),
        'Wind': ('km/h', 'Wind(km/h)'),
        'Gust': ('km/h', 'Gust(km/h)'),
    }

    # Remove the unit text and keep only the numeric part. regex=False makes
    # the literal match explicit; the default differs across pandas versions.
    for raw_name, (unit, _) in measurements.items():
        df[raw_name] = df[raw_name].str.replace(unit, '', regex=False).str.strip()

    df = df.astype({
        'Date': 'datetime64[ns]',
        **{raw_name: 'float64' for raw_name in measurements},
    })
    df = df.rename(columns={
        raw_name: clean_name for raw_name, (_, clean_name) in measurements.items()
    })

    # Group the weather descriptions; the position of a group is its class id.
    weather_groups = [
        ['Sunny', 'Clear', 'Partly cloudy'],
        ['Overcast', 'Cloudy', 'Patchy rain possible', 'Light drizzle', 'Light rain shower', 'Patchy light rain with thunder'],
        ['Heavy rain at times', 'Moderate or heavy rain shower', 'Moderate rain at times', 'Moderate rain'],
    ]
    conditions = [df['Weather'].isin(group) for group in weather_groups]
    choices = list(range(len(weather_groups)))

    # NOTE(review): any description not listed above falls back to class 0,
    # i.e. it is treated like the first group -- confirm this is intended.
    df['Weather'] = np.select(conditions, choices, default=0)

    # Add the previous `lag_steps` readings of every measurement as extra features.
    for lag in range(1, lag_steps + 1):
        for raw_name, (_, clean_name) in measurements.items():
            df[f'{raw_name}_t-{lag}'] = df[clean_name].shift(lag)

    df = df.drop(columns=['Date', 'Time'])

    # The first `lag_steps` rows have incomplete lag columns, and the last row
    # has no following label; drop both and pair row t with the label of t+1.
    features = df.drop(columns=['Weather']).iloc[lag_steps:-1].reset_index(drop=True)
    labels = df['Weather'].iloc[lag_steps + 1:].reset_index(drop=True)
    features['Weather'] = labels

    return features

In [None]:
# Build and train the MLP model
def build_mlp_model(provinces):
    """Train and evaluate a multilayer-perceptron weather classifier.

    Every location in ``provinces`` is loaded with ``data_preprocess``; the
    resulting tables are stacked, converted to a Spark DataFrame and split
    80/20 into train and test sets. A pipeline of VectorAssembler ->
    StandardScaler -> MultilayerPerceptronClassifier is fitted on the train
    set with 3-fold cross-validation, and accuracy is measured on the test set.

    Args:
        provinces: Iterable of location names accepted by ``data_preprocess``.

    Returns:
        Tuple ``(cv_model, acc)``: the fitted CrossValidator model and its
        accuracy on the held-out test split.

    Raises:
        ValueError: If ``provinces`` is empty.
    """
    # Collect the per-location frames first and concatenate once; growing a
    # DataFrame inside the loop re-copies all previous rows on every iteration.
    frames = [data_preprocess(location) for location in provinces]
    if not frames:
        raise ValueError("provinces must contain at least one location")
    df_pd = pd.concat(frames, ignore_index=True)

    # Convert pandas -> Spark DataFrame
    df_spark = spark.createDataFrame(df_pd)

    feature_cols = [c for c in df_pd.columns if c != 'Weather']
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features_assembled')
    scaler = StandardScaler(inputCol='features_assembled', outputCol='features')

    # Layer layout: input + 2 hidden + output.
    input_dim = len(feature_cols)
    # Labels are class indices, so the output layer must cover 0..max(label).
    # Counting distinct labels would undersize it whenever a class is absent
    # from the selected locations (e.g. labels {0, 2} need 3 outputs, not 2).
    output_dim = int(df_pd['Weather'].max()) + 1
    layers = [input_dim, 100, 50, output_dim]

    mlp = MultilayerPerceptronClassifier(
        labelCol='Weather',
        featuresCol='features',
        maxIter=100,
        layers=layers,
        blockSize=128,
        seed=42
    )

    pipeline = Pipeline(stages=[assembler, scaler, mlp])

    # Hyper-parameter grid (currently a single combination).
    # NOTE(review): per the PySpark docs stepSize only applies to solver 'gd';
    # with the default solver this entry is likely a no-op -- confirm.
    paramGrid = (ParamGridBuilder()
        .addGrid(mlp.maxIter, [100])
        .addGrid(mlp.stepSize, [0.01])
        .build())

    evaluator = MulticlassClassificationEvaluator(
        labelCol='Weather',
        predictionCol='prediction',
        metricName='accuracy'
    )

    # Seed the fold assignment so repeated runs are comparable, matching the
    # seeded classifier and train/test split.
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        numFolds=3,
        seed=42
    )

    # 80/20 train/test split
    train_df, test_df = df_spark.randomSplit([0.8, 0.2], seed=42)

    cv_model = cv.fit(train_df)
    preds = cv_model.transform(test_df)
    acc = evaluator.evaluate(preds)
    print(f"MLP Classifier Accuracy = {acc}")

    return cv_model, acc

In [None]:
# Location names grouped by region. Presumably each name is the file stem of a
# history CSV that data_preprocess can load -- verify against the HDFS folder.
northern_provinces = [
    'bac-can', 'bac-giang', 'bac-ninh', 'dien-bien', 'ha-giang',
    'ha-noi', 'hai-duong', 'hai-phong', 'hoa-binh', 'hong-gai',
    'lang-son', 'lao-cai', 'nam-dinh', 'ninh-binh', 'phu-ly',
    'son-la', 'son-tay', 'thai-binh', 'thai-nguyen', 'tuyen-quang',
    'uong-bi', 'viet-tri', 'vinh-yen',
]

central_provinces = [
    'da-lat', 'dong-hoi', 'ha-tinh', 'hoi-an', 'hue',
    'kon-tum', 'nha-trang', 'phan-rang', 'phan-thiet', 'play-cu',
    'quang-ngai', 'qui-nhon', 'tam-ky', 'thanh-hoa', 'tuy-hoa',
    'vinh', 'buon-me-thuot', 'cam-ranh',
]

southern_provinces = [
    'bac-lieu', 'ben-tre', 'bien-hoa', 'ca-mau', 'chau-doc',
    'dong-xoai', 'ho-chi-minh-city', 'long-xuyen', 'my-tho', 'rach-gia',
    'soc-trang', 'tan-an', 'tay-ninh', 'tra-vinh', 'vinh-long',
    'vung-tau',
]

MLP Classifier Accuracy = 0.7668228036101908
Finished MLP training. Accuracy: 0.7668228036101908


In [None]:
# Locations to train on; build_mlp_model loads each one through data_preprocess.
list_provinces = ['vinh','dong-hoi']

# Fit the cross-validated MLP pipeline and report accuracy on the held-out test split.
mlp_model, mlp_acc = build_mlp_model(list_provinces)
print("Finished MLP training. Accuracy:", mlp_acc)
# Optional: persist the fitted model to HDFS.
# mlp_model.save("hdfs://namenode:9000/tmp/models/mlp_model")

In [None]:
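# Save the model (uncomment and set a real path to use).
# The fitted model in this notebook is `mlp_model`; `lr_model` is not defined here.
#model_path = "path"
#mlp_model.write().overwrite().save(model_path)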

In [None]:
# Reload the saved model
#from pyspark.ml.tuning import CrossValidatorModel
#loaded_model = CrossValidatorModel.load("path")