In [61]:
# Imports.
import pandas as pd
from pandas.core.frame import DataFrame
from sklearn import linear_model

import utils.invariants as invariants
import utils.types as types

In [62]:
# Aliases.
LinearRegr = linear_model.LinearRegression

In [63]:
# Read data.
def get_dataframe(filepath: types.Path) -> DataFrame:
    return pd.read_csv(filepath)

In [64]:
# Regression function.
def do_regression(independent, dependent):
    # type: (types.IndependentVars, types.DependentVar) -> LinearRegr
    model = LinearRegr()
    model.fit(independent, dependent)
    return model

In [65]:
# Error discovering.
def get_average_err(model, content, affecting_keys, investigated_key):
    # type: (LinearRegr, DataFrame, list[str], str) -> float
    for i in range(len(content.index)):
        predicted = model.predict([
            [content[key].values[i] for key in affecting_keys]
        ])
        veritable = content[investigated_key].values[i]
        curr_err = abs(predicted - veritable)
        aver_err = curr_err if i == 0 else (aver_err + curr_err) / 2
    return aver_err.item()

In [66]:
# Entry point.
if __name__ == '__main__':
    train_dataframe = get_dataframe('data/weather.csv')
    tests_dataframe = get_dataframe('data/weather-test.csv')

    # Get all model variables.
    keys = invariants.feature_keys

    # Define from which key the indepenent variables are placed.
    key_from = 3  # Skip 'T (degC)', 'Tpot (K)' and 'Tdew (degC)'.

    # Define dependent and independent variables.
    independent = train_dataframe[keys[key_from:]]
    dependent = train_dataframe[keys[0]]

    # Define prediction model.
    model = do_regression(independent, dependent)

    # Get average error rate.
    error = get_average_err(
        model=model,
        content=tests_dataframe,
        affecting_keys=invariants.feature_keys[3:],
        investigated_key=invariants.investigated_key
    )

    # Get model accuracy.
    accuracy = (1 - error) * 100

    # Outputs.
    print(f'Train dataset size: {len(train_dataframe.index)}')
    print(f'Tests dataset size: {len(tests_dataframe.index)}')
    print(f'Avarage error: {round(error, 2)} (degC)')
    print(f'Model accuracy: {round(accuracy, 2)}%')



Train dataset size: 415754
Tests dataset size: 4797
Avarage error: 0.08 (degC)
Model accuracy: 92.33%
