In [41]:
import pandas as pd
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np

In [42]:
#load data
df=pd.read_csv("/home/hungday/Downloads/paris_temperature.csv")

## data Preprocessing


In [43]:
#change datetime
df['datetime'] = pd.to_datetime(df['datetime'])
df['sunrise'] = pd.to_datetime(df['sunrise'], errors='coerce')
df['sunset'] = pd.to_datetime(df['sunset'], errors='coerce')

In [44]:
#find categorical features
categorical_features = df.select_dtypes(include=['object', 'category']).columns
print("Categorical features:", categorical_features.tolist())

Categorical features: ['name', 'preciptype', 'conditions', 'description', 'icon', 'stations']


In [45]:
#drop 3 columns that are not used
df.drop(columns=['description', 'stations', 'name','sunrise','sunset'], inplace=True)

In [46]:
#number of cells contain Nan in preciptype
#Nan_prec=df['preciptype'].isna().sum() "74 Nan cells"
#fill Nan with mode of the column
df['preciptype'].fillna("no_rain", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['preciptype'].fillna("no_rain", inplace=True)


In [47]:
#encoder 
label_encoders=LabelEncoder()
df['preciptype']=label_encoders.fit_transform(df['preciptype'])
df['conditions']=label_encoders.fit_transform(df['conditions'])
df['icon']=label_encoders.fit_transform(df['icon'])

## Training model with categorical features

In [48]:
#start in training data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error 

In [49]:
#split data
x=df.drop('temp',axis=1)
y=df['temp']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# #scale Data
# scaler=StandardScaler()
# x_train_scaled=scaler.fit_transform(x_train)
# x_test_scaled=scaler.fit_transform(x_test)

In [50]:
#scale Data
x_train_time=x_train['datetime']
x_test_time=x_test['datetime']
x_train.drop('datetime',axis=1,inplace=True)
x_test.drop('datetime',axis=1,inplace=True)
scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.fit_transform(x_test)

# linear regression

In [51]:
#linear reg
linear=LinearRegression()
param={
    'fit_intercept': [True, False]
}
grid_linear=GridSearchCV(estimator=linear, param_grid=param,cv=5, scoring='neg_mean_squared_error')

grid_linear.fit(x_train,y_train)
best_model=grid_linear.best_estimator_
y_pred = best_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) 
print('RMSE:', mse)
print('R2:', r2)
print('MAPE:', mape)

RMSE: 0.04656648208956753
R2: 0.9985648032578996
MAPE: 0.0403075800161505


# lgb

In [52]:
print(x_train.shape)
print(y_train.shape)

(305, 26)
(305,)


In [53]:
from sklearn.model_selection import KFold
param_grid = {
    'num_leaves': [31, 127],
    'reg_alpha': [0.1, 0.5],
    'min_data_in_leaf': [30, 50, 100, 300, 400],
    'lambda_l1': [0, 1, 1.5],
    'lambda_l2': [0, 1]
    }

lgb_model = lgb.LGBMRegressor()
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1  # Use all available cores
)
grid_search.fit(x_train, y_train)
best_model=grid_search.best_estimator_

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.134155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1215
[LightGBM] [Info] Number of data points in the train set: 244, number of used features: 23
[LightGBM] [Info] Start training from score 13.440984
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.185125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1223
[LightGBM] [Info] Number of data points in the train set: 244, number of used features: 23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.194753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1215
[LightGBM] [Info] Start training from score 13.652869
[LightGBM] [Info] Number of data points in the train set: 244, number of used features: 23
[LightGBM] [Info] Auto-choosing

In [54]:
#calculate metrics
y_pred=best_model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) 
print('RMSE:', mse)
print('R2:', r2)
print('MAPE:', mape)

RMSE: 0.5525300960663776
R2: 0.9829708116610228
MAPE: 0.19172382730724888


# XGboost


In [55]:
model=xgb.XGBRegressor()
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # Regression-specific metric
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1
)
grid_search.fit(x_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [56]:
#calculate metrics
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred) 
print('RMSE:', mse)
print('R2:', r2)
print('MAPE:', mape)

RMSE: 0.08648318944681174
R2: 0.997334555109794
MAPE: 0.06263095742883001
