In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the measurements workbook into a DataFrame.
# NOTE(review): the recorded .info() output lists an 'Unnamed: 0' int64 column,
# which looks like a saved row index — confirm whether it should stay a feature.
smog_4_models = pd.read_excel(io='Smog.xlsx')

In [3]:
# Print column names, non-null counts and dtypes of the loaded frame.
smog_4_models.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8771 entries, 0 to 8770
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Unnamed: 0       8771 non-null   int64         
 1   Data             8771 non-null   datetime64[ns]
 2   Dwutlenek_azotu  8771 non-null   float64       
 3   Tlenki_azotu     8771 non-null   float64       
 4   PM10             8771 non-null   float64       
 5   PM_2_5           8771 non-null   float64       
 6   Benzen           8771 non-null   float64       
 7   Tlenek_wegla     8771 non-null   float64       
 8   Kierunek_wiatru  8771 non-null   float64       
 9   Predkosc_wiatru  8771 non-null   float64       
 10  Temperatura      8771 non-null   float64       
 11  Wilgotnosc       8771 non-null   float64       
 12  Cisnienie        8771 non-null   float64       
dtypes: datetime64[ns](1), float64(11), int64(1)
memory usage: 890.9 KB


In [4]:
# Replace the 'Data' timestamp with month / day / hour features.
# The .dt accessor reads the components straight from the datetime column, so no
# throwaway DatetimeIndex objects are built; the year is never materialised
# because it was dropped immediately anyway.
smog_4_models = (
    smog_4_models
    .assign(
        Miesiac=lambda frame: frame['Data'].dt.month,
        Dzien=lambda frame: frame['Data'].dt.day,
        Godzina=lambda frame: frame['Data'].dt.hour,
    )
    .drop(columns=['Data'])
)

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

In [6]:
# Hold out 20% of the rows, stratified on the month column so that both subsets
# keep the same month proportions. Seeded for a reproducible split.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row numbers, so rows are selected with .iloc;
# .loc only gives the same result while the frame keeps its default 0..n-1 index.
# n_splits=1, so taking the first (only) split is enough.
train_index, test_index = next(split.split(smog_4_models, smog_4_models['Miesiac']))
strat_train_set = smog_4_models.iloc[train_index]
strat_test_set = smog_4_models.iloc[test_index]

In [7]:
# Separate the predictors (everything except PM10) from the PM10 target,
# using the training split only.
X = strat_train_set.drop(columns='PM10')
y = strat_train_set['PM10'].copy()

In [8]:
from sklearn.preprocessing import StandardScaler
# Fit a standardising scaler on the training predictors and keep the scaled copy.
# NOTE(review): the model cells visible in this notebook are fitted on X, not on
# smog_scaled — confirm whether the scaled matrix was meant to be used.
scaler = StandardScaler()
smog_scaled = scaler.fit(X).transform(X)

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Linear-regression baseline fitted on the training predictors and PM10 target.
lin_reg = LinearRegression()
lin_reg.fit(X=X, y=y)

LinearRegression()

In [11]:
# Spot check: predictions vs. true PM10 values for the first five training rows.
some_X = X.head(5)
some_y = y.head(5)
print('Prognozy: ', lin_reg.predict(some_X))
print('Rzeczywiste: ', some_y.tolist())

Prognozy:  [53.00726313 25.65279379 42.7331483  19.41986257 33.21748634]
Rzeczywiste:  [33.6, 21.7, 41.1, 17.8, 23.6]


In [12]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the same rows it was trained on
# (a training error, not a generalisation estimate).
smog_predictions = lin_reg.predict(X)
lin_mse = mean_squared_error(y_true=y, y_pred=smog_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

10.99581566293359

In [13]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the training rows.
# Predict from lin_reg here instead of reusing `smog_predictions`: that name is
# reassigned by the later tree / forest cells, so re-running this cell out of
# order would otherwise score a different model under the `lin_mae` name.
lin_mae = mean_absolute_error(y, lin_reg.predict(X))
lin_mae

7.348680745448556

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Decision-tree regressor with default hyper-parameters; random_state is fixed
# so repeated runs build the same tree.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X=X, y=y)

DecisionTreeRegressor(random_state=42)

In [15]:
# RMSE of the tree on its own training rows. The recorded value is ~1e-15,
# i.e. the tree reproduces the training targets, so this number says nothing
# about generalisation; the cross-validation cell gives the usable estimate.
smog_predictions = tree_reg.predict(X)
tree_mse = mean_squared_error(y_true=y, y_pred=smog_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

1.2037829569420853e-15

In [16]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree. The scorer returns *negated* MSE
# ("neg_mean_squared_error"), so the sign is flipped before taking the root.
scores = cross_val_score(
    estimator=tree_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(np.negative(scores))

In [17]:
def display_scores(scores):
    """Print the per-fold scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        Per-fold scores, e.g. the RMSE values derived from ``cross_val_score``.
    """
    print("Wyniki:", scores)
    report = (
        ("Średnia:", scores.mean),
        ("Odchylenie standardowe:", scores.std),
    )
    # Each statistic is evaluated right before it is printed.
    for label, statistic in report:
        print(label, statistic())

# Report the cross-validated RMSE of the decision tree (per fold, mean, std).
display_scores(tree_rmse_scores)

Wyniki: [ 9.75972809 10.304247    8.62263747  9.65330922  9.50856561  9.91443514
  9.60093875 10.37016205 10.22739215  9.69023666]
Średnia: 9.765165214158271
Odchylenie standardowe: 0.48051061677795187


In [18]:
# 10-fold cross-validated RMSE of the linear model; the scorer yields negated
# MSE, hence the sign flip before the square root.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

Wyniki: [10.88539972 11.13840773 11.20915309 10.05417988 11.19348228 11.06999044
 10.68909898 10.94386327 12.18227652 10.82309117]
Średnia: 11.018894307136362
Odchylenie standardowe: 0.5038114737099317


In [19]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees, seeded for reproducibility, fitted on the
# training predictors and PM10 target.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X=X, y=y)

RandomForestRegressor(random_state=42)

In [20]:
# RMSE of the random forest on its own training rows (a training error only).
# This cell no longer assigns `forest_rmse_scores`: the previous assignment
# derived it from `lin_scores`, i.e. from the linear model's cross-validation.
# The forest's cross-validated scores come from `forest_scores` in the
# cross-validation cell.
smog_predictions = forest_reg.predict(X)
forest_mse = mean_squared_error(y, smog_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

2.485269877348702

In [21]:
# 10-fold cross-validated RMSE of the random forest; the scorer yields negated
# MSE, hence the sign flip before the square root.
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(np.negative(forest_scores))
display_scores(forest_rmse_scores)

Wyniki: [6.57288765 6.60881089 6.58348562 7.05969271 6.645026   6.66162908
 6.08885641 7.01839977 7.12766555 6.82572857]
Średnia: 6.719218223698904
Odchylenie standardowe: 0.29013943567523687


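Poniższy kod (SVR z jądrem liniowym) może liczyć się długo, dlatego warto uruchomić go w Colabie. Uwaga: scikit-learn wykonuje obliczenia na CPU, więc akcelerator GPU nie przyspieszy tej komórki.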

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Linear-kernel support vector regressor, evaluated on its own training rows.
# SVMs are not scale-invariant, so the predictors are standardised inside a
# pipeline: the scaler is fitted together with the model and applied
# automatically at predict time. Fitting on the raw columns (presumably on very
# different numeric ranges — verify against the data) also tends to make the
# solver converge much more slowly.
svm_reg = make_pipeline(StandardScaler(), SVR(kernel="linear"))
svm_reg.fit(X, y)
smog_predictions = svm_reg.predict(X)
svm_mse = mean_squared_error(y, smog_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse