In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the CSV into a DataFrame and keep an untouched duplicate of it.
raw_data = pd.read_csv("Tractores_Matriculados.csv")
complete_data = raw_data.copy()

# Inspect the first rows and the dtype pandas inferred for each column.
first_rows = raw_data.head()
print(first_rows, "\n")
column_dtypes = raw_data.dtypes
print(column_dtypes)

        Fecha  Tractores
0  1986-01-31     1965.0
1  1986-02-28     1467.0
2  1986-03-31     1106.0
3  1986-04-30     1460.0
4  1986-05-31     1563.0 

Fecha         object
Tractores    float64
dtype: object


In [3]:
# Summary of the date column as loaded (count / unique / top / freq for strings).
# The result is printed because a bare expression that is not the last line of
# a cell is silently discarded.
print(raw_data.Fecha.describe(), "\n")

# Convert "Fecha" to datetime. pd.to_datetime returns a NEW Series and does not
# modify its argument, so the result has to be assigned back to the column;
# without the assignment the column keeps its object dtype.
raw_data["Fecha"] = pd.to_datetime(raw_data["Fecha"], yearfirst=True)

print(raw_data.dtypes, "\n")  # "Fecha" should now be reported as a datetime64 dtype

AttributeError: module 'pandas' has no attribute 'to_datetbime'

### Statistics

In [None]:
print(raw_data.describe(), "\n")  # summary statistics

# Check for missing values: report the number of NaNs per column. Printing the
# full boolean mask (raw_data.isna()) only shows a truncated table of
# True/False values, from which missing entries are easy to overlook.
print(raw_data.isna().sum())

"""
We can use raw_data.Fecha.describe() to get a summary of the date column.
"""

"""
We can check whether a specific column has missing values with:
raw_data['column_name'].isna().sum()

If we want to fill the missing values with something (in this case, with the previous value), we can use:
raw_data.Tractores = raw_data.Tractores.ffill()

(In older pandas this was written raw_data.Tractores.fillna(method="ffill"); that form is deprecated in recent versions.)
"""

In [None]:
# Keep the observations dated from 1986-01-31 through 1995-12-31 (both inclusive).
# NOTE(review): an earlier version of this comment said "1988 to 1994", which
# does not match these bounds -- confirm which window is intended.
window_start = pd.Timestamp("1986-01-31")
window_end = pd.Timestamp("1995-12-31")

# Parse the dates before comparing. If "Fecha" still holds plain strings, a
# test such as "1986-01-31" >= "1986-01-31 00:00:00" is lexicographic, evaluates
# to False, and silently drops the first observation. to_datetime leaves a
# column that is already datetime unchanged.
fecha = pd.to_datetime(raw_data["Fecha"])
in_window = fecha.between(window_start, window_end)

# .copy() makes `data` an independent frame, so later modifications neither
# raise SettingWithCopyWarning nor depend on raw_data.
data = raw_data.loc[in_window].copy()

In [None]:
# Show the time series against its dates.
# Series.plot() ignores the x=/y= keywords (they only apply to DataFrame.plot),
# so calling data.Tractores.plot(x="Fecha", ...) draws the values against the
# row position. Putting "Fecha" on the index first makes it the x axis. The
# guard keeps the cell working if "Fecha" has already been moved to the index.
if "Fecha" in data.columns:
    tractores_by_date = data.set_index("Fecha")["Tractores"]
else:
    tractores_by_date = data["Tractores"]

ax = tractores_by_date.plot(kind="line", title="Tractores matriculados por año")
ax.set_xlabel("Fecha")
ax.set_ylabel("Tractores")
plt.show()

In [None]:
import scipy.stats as stats

# Check for normality with a normal probability (Q-Q) plot.
# - probplot accepts the matplotlib.pyplot module as its `plot` target, so the
#   notebook's existing `plt` is used instead of the legacy `pylab` namespace.
# - Missing values are dropped first: probplot has no NaN handling of its own,
#   so any NaN in the input would propagate into the fitted reference line.
stats.probplot(data.Tractores.dropna(), dist="norm", plot=plt)
plt.show()

In [None]:
# Preview the first five rows of the filtered frame.
data.head(n=5)

In [None]:
# Set date as index column and sort by date.
# Reassigning (instead of inplace=True) avoids mutating a frame that may be a
# slice of raw_data, which triggers SettingWithCopyWarning. The guard keeps the
# cell re-runnable: once "Fecha" is the index it is no longer a column, and an
# unconditional set_index would raise KeyError.
if "Fecha" in data.columns:
    data = data.set_index("Fecha")

# Perform the sort explicitly; set_index alone does not reorder the rows.
data = data.sort_index()
data.head()

In [None]:
# data = data.asfreq("m")  # set frequency to monthly data
# data.head(50)

"""Does not work: the data cannot be converted to monthly frequency -- it raises an error."""

In [None]:
"""To delete a column:
del data['Tractores']

To add a column, for example:
data['name new column'] = data['Tractores']  # copy the column Tractores to the new column
"""

### Split the data into a training set (80% of the data) and a test set (20% of the data)

In [None]:
# Positional split: the first 80% of the rows form the training set and the
# remaining rows the test set. No shuffling, so the row order is preserved.
size_train = int(len(data) * 0.8)

# .copy() gives each subset its own data. Without it, adding a column to
# `train` later would be an assignment on a slice of `data`
# (SettingWithCopyWarning).
train = data.iloc[:size_train].copy()
test = data.iloc[size_train:].copy()

test.head()

#### White noise

In [None]:
# Create white noise for comparison: Gaussian draws with the same mean and
# standard deviation as the training series, one value per training row.
# A seeded generator makes the draws (and every figure or statistic derived
# from them) identical on each Restart & Run All.
rng = np.random.default_rng(42)
WN = rng.normal(
    loc=train.Tractores.mean(), scale=train.Tractores.std(), size=len(train)
)

# Add the white noise to the training data. assign() returns a new frame, so
# this neither writes into a slice of `data` (SettingWithCopyWarning) nor
# fails when the cell is re-run.
train = train.assign(WN=WN)
train.describe()  # check the values

In [None]:
# Compare the white noise with the real series on a shared x axis.
# WN is a bare NumPy array; plotted on its own it would be placed at positions
# 0..n-1 while train.Tractores is placed at its index values, so the two lines
# would not share x coordinates. Passing train.index for both aligns them
# (WN was generated with size=len(train), so the lengths match).
plt.figure(figsize=(15, 5))
plt.plot(train.index, WN, label="White Noise")
plt.plot(train.index, train.Tractores, label="Tractores")
plt.legend()
plt.show()

In [None]:
# Augmented Dickey-Fuller test on the training series.
from statsmodels.tsa import stattools as sts

adf_result = sts.adfuller(train.Tractores)
adf_result

## Determinamos si el modelo es aditivo o multiplicativo

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the "Tractores" series with a 12-observation period.
decomposition_add = seasonal_decompose(data["Tractores"], model="additive", period=12)

# One stacked panel per component: (series to draw, legend label).
additive_panels = [
    (data["Tractores"], "Original"),
    (decomposition_add.trend, "Tendencia"),
    (decomposition_add.seasonal, "Estacionalidad"),
    (decomposition_add.resid, "Residuos"),
]

# Plot the additive decomposition as a 4-row figure.
plt.figure(figsize=(12, 10))
for panel_number, (component, panel_label) in enumerate(additive_panels, start=1):
    plt.subplot(4, 1, panel_number)
    plt.plot(component, label=panel_label)
    plt.legend(loc="upper left")

plt.tight_layout()
plt.show()

In [None]:
# Multiplicative decomposition of the "Tractores" series with a 12-observation period.
decomposition_mult = seasonal_decompose(
    data["Tractores"], model="multiplicative", period=12
)

# One stacked panel per component: (series to draw, legend label).
multiplicative_panels = [
    (data["Tractores"], "Original"),
    (decomposition_mult.trend, "Tendencia"),
    (decomposition_mult.seasonal, "Estacionalidad"),
    (decomposition_mult.resid, "Residuos"),
]

# Plot the multiplicative decomposition as a 4-row figure.
plt.figure(figsize=(12, 10))
for panel_number, (component, panel_label) in enumerate(multiplicative_panels, start=1):
    plt.subplot(4, 1, panel_number)
    plt.plot(component, label=panel_label)
    plt.legend(loc="upper left")

plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Range-vs-mean analysis: for every 12-observation rolling window, compute the
# window mean and the window range (max - min), then fit a least-squares line
# of range on mean.
rolling_window = 12
tractores_rolling = data["Tractores"].rolling(window=rolling_window)
rolling_mean = tractores_rolling.mean()
# Vectorised max - min gives the same values as applying a Python lambda to
# every window, without calling back into Python once per window.
rolling_range = tractores_rolling.max() - tractores_rolling.min()

# Drop the NaN values produced by incomplete rolling windows. A boolean mask is
# used instead of label lookup (.loc[index]) so the selection stays one-to-one
# even if the index contains duplicate labels.
valid_mask = rolling_mean.notna()
valid_rolling_mean = rolling_mean[valid_mask]
valid_rolling_range = rolling_range[valid_mask]
valid_idx = valid_rolling_mean.index  # kept for any later cell that reads it

# Reshape data for linear regression: sklearn expects a 2-D feature matrix.
X = valid_rolling_mean.values.reshape(-1, 1)
y = valid_rolling_range.values

# Fit the linear regression model.
model = LinearRegression()
model.fit(X, y)

# Predict values with the fitted model (used to draw the regression line).
y_pred = model.predict(X)

# Plot range vs mean together with the least-squares line.
plt.figure(figsize=(10, 6))
plt.scatter(valid_rolling_mean, valid_rolling_range, alpha=0.5, label="Datos")
plt.plot(valid_rolling_mean, y_pred, color="red", label="Recta de Mínimos Cuadrados")
plt.title("Gráfico Rango vs Media con Recta de Mínimos Cuadrados")
plt.xlabel("Media (tractores)")
plt.ylabel("Rango (máximo - mínimo)")
plt.legend()
plt.grid(True)
plt.show()

print(f"Coeficiente de determinación R^2: {model.score(X, y)}")
print(f"Pendiente de la recta: {model.coef_[0]}")
print(f"Intercepto de la recta: {model.intercept_}")
print(
    f"Recta de mínimos cuadrados: y = {model.coef_[0]:.2f}x + {model.intercept_:.2f}\n"
)

# NOTE(review): the conclusion below is a fixed string; it is not derived from
# the slope or R^2 printed above, so re-check it whenever the data changes.
print(
    "Podemos concluir con este y los anteriores gráficos que el mejor modelo que se ajusta es uno multiplicativo "
)