In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vietnam-weather-data/weather.csv


In [2]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix
)

import xgboost as xgb
import joblib


In [6]:
import pandas as pd

# Location of the weather CSV inside the Kaggle input mount.
DATA_PATH = "/kaggle/input/vietnam-weather-data/weather.csv"

# Read the whole file into `df`, the frame the following cells work on.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)


In [7]:
# Cell 3: Basic cleaning & feature/label engineering

print("Columns trong dataset:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

# The raw file uses short lowercase headers; map them to descriptive names.
# Note: the humidity column in the raw file is spelled 'humidi' (see the
# printed column list), so that is the key that must be renamed.
column_renames = {
    'province': 'Province',
    'date':     'Date',
    'max':      'MaxTemp',
    'min':      'MinTemp',
    'rain':     'Rainfall',
    'humidi':   'Humidity',
}
df = df.rename(columns=column_renames)

# Columns every later step relies on; stop immediately if any is absent.
required_cols = ["Province", "Date", "MaxTemp", "MinTemp", "Rainfall", "Humidity"]
missing_cols = [col for col in required_cols if col not in df.columns]
print("\nMissing columns:", missing_cols)
if len(missing_cols) > 0:
    raise ValueError(f"Thiếu cột: {missing_cols}")

# Discard rows that lack a value in any of the required columns.
df = df.dropna(subset=required_cols).copy()
print(f"\nSau khi dropna: {len(df)} rows")

# Parse the date strings, add the daily mean temperature (midpoint of max and
# min), then order the rows by province and date with a fresh 0..n-1 index.
df = (
    df.assign(
        Date=pd.to_datetime(df["Date"]),
        TempMean=(df["MaxTemp"] + df["MinTemp"]) / 2.0,
    )
    .sort_values(["Province", "Date"])
    .reset_index(drop=True)
)

# RainTomorrow = rainfall recorded for the same province on the next calendar
# day. This relies on the rows already being sorted by Province then Date and
# on Date already being a datetime column (both done earlier in this cell).
by_province = df.groupby("Province")
next_rain = by_province["Rainfall"].shift(-1)
next_date = by_province["Date"].shift(-1)

# A plain shift(-1) pairs each row with the next *record*, which is only
# "tomorrow" when the province has no missing days. Keep the shifted value
# only if the next record is exactly one day later; otherwise leave it NaN.
is_next_day = (next_date - df["Date"]).eq(pd.Timedelta(days=1))
df["RainTomorrow"] = next_rain.where(is_next_day)

# Drop rows without a valid next-day observation: the last row of every
# province, plus any row that is followed by a gap in the dates.
df = df.dropna(subset=["RainTomorrow"]).copy()
print(f"Sau khi tạo RainTomorrow: {len(df)} rows")

# Binary label: next-day rainfall at or above this threshold counts as heavy rain.
HEAVY_RAIN_THRESHOLD = 20.0  # mm
df["HeavyRainTomorrow"] = (df["RainTomorrow"] >= HEAVY_RAIN_THRESHOLD).astype(int)

print("\n=== Sample data ===")
print(df[["Province", "Date", "TempMean", "Humidity",
          "Rainfall", "RainTomorrow", "HeavyRainTomorrow"]].head(10))

# Share of each class, to show how imbalanced the label is.
print("\n=== Label distribution ===")
print(df["HeavyRainTomorrow"].value_counts(normalize=True))


Columns trong dataset: ['province', 'max', 'min', 'wind', 'wind_d', 'rain', 'humidi', 'cloud', 'pressure', 'date']

First few rows:
   province  max  min  wind wind_d  rain  humidi  cloud  pressure        date
0  Bac Lieu   27   22    17    NNE   6.9      90     71      1010  2009-01-01
1  Bac Lieu   31   25    20    ENE   0.0      64     24      1010  2010-01-01
2  Bac Lieu   29   24    14      E   0.0      75     45      1008  2011-01-01
3  Bac Lieu   30   24    30      E   0.0      79     52      1012  2012-01-01
4  Bac Lieu   31   25    20    ENE   0.0      70     24      1010  2013-01-01

Missing columns: []

Sau khi dropna: 181960 rows
Sau khi tạo RainTomorrow: 181920 rows

=== Sample data ===
   Province       Date  TempMean  Humidity  Rainfall  RainTomorrow  \
0  Bac Lieu 2009-01-01      24.5        90       6.9           0.5   
1  Bac Lieu 2009-01-02      25.0        85       0.5          16.7   
2  Bac Lieu 2009-01-03      22.0        91      16.7           2.2   
3  Bac Lieu

In [9]:
# Cell 4: Train/Val/Test split (time-based)

# Order all rows chronologically (using the renamed "Date" column) so that the
# hold-out test set is made of the most recent observations.
df = df.sort_values("Date").reset_index(drop=True)

n = len(df)
test_size = int(n * 0.2)

# Slice on an explicit positive index. Slicing with -test_size is wrong when
# test_size == 0 (very small frames): -0 == 0, so `[:-0]` would be empty and
# `[-0:]` would be the whole frame, silently swapping the two sets.
split_idx = n - test_size
trainval_df = df.iloc[:split_idx]
test_df     = df.iloc[split_idx:]

print("Train+Val size:", len(trainval_df), "Test size:", len(test_df))

# Minimal feature set for the baseline model.
feature_cols_tiny = ["TempMean", "Humidity"]

X_trainval = trainval_df[feature_cols_tiny]
y_trainval = trainval_df["HeavyRainTomorrow"]

X_test = test_df[feature_cols_tiny]
y_test = test_df["HeavyRainTomorrow"]

from sklearn.model_selection import train_test_split

# Carve a validation set out of the train+val portion: 20% of the rows,
# stratified on the label, with a fixed seed. Note that this split is random,
# not chronological; only the test split is time-based.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    stratify=y_trainval,
    test_size=0.2,
    random_state=42,
)

# Cell output: row counts of the train, validation and test feature sets.
tuple(map(len, (X_train, X_val, X_test)))


Train+Val size: 145536 Test size: 36384


(116428, 29108, 36384)

In [10]:
from sklearn.linear_model import LogisticRegression

# Settings for the baseline classifier: up to 1000 solver iterations,
# "balanced" class weighting, and n_jobs=-1.
logreg_settings = dict(max_iter=1000, class_weight="balanced", n_jobs=-1)
log_reg = LogisticRegression(**logreg_settings)

# Fit on the training split; the fitted estimator is also the cell's output.
log_reg.fit(X_train, y_train)


In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

def eval_binary_clf(name, model, X_tr, y_tr, X_v, y_v, X_te, y_te, threshold=0.5):
    """Print accuracy and ROC AUC on train/val/test, plus a detailed test report.

    Args:
        name: Label printed in the section header.
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the positive-class score.
        X_tr, y_tr: Training features and labels.
        X_v, y_v: Validation features and labels.
        X_te, y_te: Test features and labels.
        threshold: Score at or above which a sample is predicted as class 1.
            Defaults to 0.5.

    Returns:
        None. All results are printed.
    """
    print(f"\n==== {name} ====")

    splits = [
        ("Train", X_tr, y_tr),
        ("Val",   X_v, y_v),
        ("Test",  X_te, y_te),
    ]

    y_pred_test = None
    for split_name, X_split, y_split in splits:
        y_prob = model.predict_proba(X_split)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        acc = accuracy_score(y_split, y_pred)
        auc = roc_auc_score(y_split, y_prob)

        print(f"{split_name}: Acc={acc:.3f}, AUC={auc:.3f}")

        # Keep the test predictions so the detailed report below does not
        # have to run the model on the test set a second time.
        if split_name == "Test":
            y_pred_test = y_pred

    print("\n--- Classification report (Test) ---")
    print(classification_report(y_te, y_pred_test))

    print("\n--- Confusion matrix (Test) ---")
    print(confusion_matrix(y_te, y_pred_test))


# Report metrics for the two-feature logistic baseline on all three splits.
eval_binary_clf(
    name="Logistic (TempMean + Humidity)",
    model=log_reg,
    X_tr=X_train, y_tr=y_train,
    X_v=X_val, y_v=y_val,
    X_te=X_test, y_te=y_test,
)



==== Logistic (TempMean + Humidity) ====
Train: Acc=0.648, AUC=0.758
Val: Acc=0.651, AUC=0.763
Test: Acc=0.694, AUC=0.771

--- Classification report (Test) ---
              precision    recall  f1-score   support

           0       0.97      0.69      0.81     33894
           1       0.15      0.71      0.24      2490

    accuracy                           0.69     36384
   macro avg       0.56      0.70      0.52     36384
weighted avg       0.91      0.69      0.77     36384


--- Confusion matrix (Test) ---
[[23470 10424]
 [  717  1773]]


In [12]:
# Flatten the fitted coefficient array to one weight per feature and pull out
# the scalar intercept.
coef = log_reg.coef_.reshape(-1)
intercept = float(log_reg.intercept_[0])

# Take the feature names from the fitted model (recorded by scikit-learn when
# it is fitted on a DataFrame) rather than a hand-typed list, so the labels
# cannot drift from the coefficient order. Fall back to the feature list used
# to build the training frame if the attribute is unavailable.
feature_names = list(getattr(log_reg, "feature_names_in_", feature_cols_tiny))
if len(feature_names) != len(coef):
    # zip() would silently truncate and mislabel weights on a mismatch.
    raise ValueError(
        f"Expected {len(coef)} feature names, got {len(feature_names)}: {feature_names}"
    )

print("Intercept (bias):", intercept)
for name, w in zip(feature_names, coef):
    print(f"Weight for {name}: {w:.6f}")


Intercept (bias): -18.49602386139671
Weight for TempMean: 0.235201
Weight for Humidity: 0.151022
