# Prediccion de precipitaciones

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

## Preparar los datos
Se trabaja solo con barcelona

In [2]:
data = pd.read_csv('city_temperature.csv', header='infer',encoding='latin1')

ciudades = ["Barcelona"]
data_filtered = data[data['City'].isin(ciudades)]
data_filtered = data_filtered[['Day', 'Month', 'Year', 'AvgTemperature']]



  data = pd.read_csv('city_temperature.csv', header='infer',encoding='latin1')


## Sin procesamiento

### Lineal

In [3]:
X = data_filtered[['Day', 'Month', 'Year']]
Y = data_filtered["AvgTemperature"]

# Do the regression as usual.
lm = LinearRegression()
lm.fit(X,Y)
rsq2 = lm.score(X,Y)
rmse2 = np.sqrt(metrics.mean_squared_error(Y, lm.predict(X)))
print("RSQ2: " + str(rsq2))

RSQ2: 0.04276480453250264


### Random forest

In [4]:
# split the data into training and testing sets
X = data_filtered[['Day', 'Month', 'Year']]
Y = data_filtered['AvgTemperature']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# create a Random Forest regressor and fit the training data
rf = RandomForestRegressor(n_estimators=150, random_state=42)
rf.fit(X_train, Y_train)

# predict on the test data and calculate the evaluation metrics
y_pred = rf.predict(X_test)
rsq = rf.score(X_test, Y_test)
rmse = np.sqrt(metrics.mean_squared_error(Y_test, y_pred))
print("RSQ: " + str(rsq))
print("RMSE: " + str(rmse))

RSQ: 0.517665615267938
RMSE: 10.408211898813382


## Columnas normalizadas

In [5]:
scaler = MinMaxScaler()
scaled_data = data_filtered.copy()
scaled_data.Day = scaler.fit_transform(data_filtered[['Day']])
scaled_data.Month = scaler.fit_transform(data_filtered[['Month']])
scaled_data.Year = scaler.fit_transform(data_filtered[['Year']])

### Lineal

In [6]:
X = scaled_data[['Day', 'Month', 'Year']]
Y = scaled_data["AvgTemperature"]

# Do the regression as usual.
lm = LinearRegression()
lm.fit(X,Y)
rsq2 = lm.score(X,Y)
rmse2 = np.sqrt(metrics.mean_squared_error(Y, lm.predict(X)))
print("RSQ2: " + str(rsq2))

RSQ2: 0.04276480453250264


### Random forest

In [7]:
# split the data into training and testing sets
X = data_filtered[['Day', 'Month', 'Year']]
Y = data_filtered['AvgTemperature']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# create a Random Forest regressor and fit the training data
rf = RandomForestRegressor(n_estimators=150, random_state=42)
rf.fit(X_train, y_train)

# predict on the test data and calculate the evaluation metrics
y_pred = rf.predict(X_test)
rsq = rf.score(X_test, y_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RSQ: " + str(rsq))
print("RMSE: " + str(rmse))


RSQ: 0.517665615267938
RMSE: 10.408211898813382


## Random forest K optimizado

In [8]:
# Parameter grid.
k_grid = np.arange(80, 120, 1)
parameters = {'n_estimators':k_grid}

In [9]:
# split the data into training and testing sets
X = scaled_data[['Day', 'Month', 'Year']]
Y = scaled_data['AvgTemperature']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Optimize the k.
gridCV = GridSearchCV(RandomForestRegressor(), parameters, cv=10, n_jobs = -1)       # "n_jobs = -1" means "use all the CPU cores".
gridCV.fit(X_train, Y_train)
best_k = gridCV.best_params_['n_estimators']
print("Best k : " + str(best_k))

Best k : 80


In [10]:
# split the data into training and testing sets
X = data_filtered[['Day', 'Month', 'Year']]
Y = data_filtered['AvgTemperature']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# create a Random Forest regressor and fit the training data
rf = RandomForestRegressor(n_estimators=best_k, random_state=42)
rf.fit(X_train, y_train)

# predict on the test data and calculate the evaluation metrics
y_pred = rf.predict(X_test)
rsq = rf.score(X_test, y_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RSQ: " + str(rsq))
print("RMSE: " + str(rmse))

RSQ: 0.5185758379188519
RMSE: 10.398386491075712


## Sin el dia

In [11]:
# split the data into training and testing sets
X = scaled_data[['Month', 'Year']]
Y = scaled_data['AvgTemperature']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# create a Random Forest regressor and fit the training data
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# predict on the test data and calculate the evaluation metrics
y_pred = rf.predict(X_test)
rsq = rf.score(X_test, y_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RSQ: " + str(rsq))
print("RMSE: " + str(rmse))

RSQ: 0.4780520726710181
RMSE: 10.827186122339652


## Outlayers

In [12]:
data_filtered['AvgTemperature'] = data_filtered['AvgTemperature'].replace(-99, float('NaN'))
data_filtered = data_filtered.dropna(subset=['AvgTemperature'])

#### Lineal

In [13]:
X = data_filtered[['Day', 'Month', 'Year']]
Y = data_filtered["AvgTemperature"]

# Do the regression as usual.
lm = LinearRegression()
lm.fit(X,Y)
rsq2 = lm.score(X,Y)
rmse2 = np.sqrt(metrics.mean_squared_error(Y, lm.predict(X)))
print("RSQ2: " + str(rsq2))

RSQ2: 0.08098331470517284


#### Random Forest

In [14]:
# split the data into training and testing sets
X = data_filtered[['Day', 'Month', 'Year']]
Y = data_filtered['AvgTemperature']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# create a Random Forest regressor and fit the training data
rf = RandomForestRegressor(n_estimators=112, random_state=42)
rf.fit(X_train, y_train)

# predict on the test data and calculate the evaluation metrics
y_pred = rf.predict(X_test)
rsq = rf.score(X_test, y_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RSQ: " + str(rsq))
print("RMSE: " + str(rmse))

RSQ: 0.9536664752794375
RMSE: 2.2822734618103433


## Prueba real
Hoy 4 de mayo de 2023 hay una maxima de 23º y una minima de 13º, una media d 18º

In [16]:
# Crear una fila con los valores de las variables
new_data_dict = {
    'Day': 4,
    'Month': 5,
    'Year': 2023
}


# Convertir el diccionario en un dataframe
new_data = pd.DataFrame([new_data_dict])
print(f"Temperatura: {( rf.predict(new_data)[0] - 32) * 5 / 9}")

Temperatura: 19.28224206349212
