In [1]:
import pandas as pd, pickle,numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

In [32]:

df = pd.read_csv('severity-classified-scaled.csv')

# Create a new column 'bz_gsm_h' with the bz_gsm value from the next row
df['bz_gsm_h'] = df['bz_gsm'].shift(-1)

# Fill the last row and any missing values in 'bz_gsm_h' with the mean of 'bz_gsm'
df['bz_gsm_h'].fillna(df['bz_gsm'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bz_gsm_h'].fillna(df['bz_gsm'].mean(), inplace=True)


In [33]:
# Using only specified columns for training
columns_to_use = ['month', 'day', 'hour', 'bx_gsm', 'by_gsm', 'bz_gsm', 'bt', 'speed', 'density', 'temp']
X = df[columns_to_use]
y = df['bz_gsm_h']

In [34]:
model = LinearRegression()

mse_scorer = make_scorer(mean_squared_error)
mse_scores = cross_val_score(model, X, y, cv=5, scoring=mse_scorer)
print(f'5-Fold Cross-Validation MSE Scores: {mse_scores}')
print(f'Mean MSE: {mse_scores.mean()}')

5-Fold Cross-Validation MSE Scores: [0.00244777 0.0021526  0.00195366 0.00185627 0.00150665]
Mean MSE: 0.001983393429760459


In [35]:
model.fit(X,y)

# Save the model, scaler, and imputer
with open('regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

## Let's try inference the model

In [44]:
# Load the scaler, imputer, and model
with open('./minmax_scaler.pkl', 'rb') as f:
    loaded_scaler = pickle.load(f)

with open('./simple_imputer.pkl', 'rb') as f:
    loaded_imputer = pickle.load(f)

with open('./regression_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# New data point
X_new_raw = np.array([[10, 11, 8, -3.04, 1.73, 2.49, 8.05, 389.8, 16.981, 136272.0, 57307, 84.101, 89.864, 14, 135, 57307, 136]])

# Define column names
column_names = [
    'month', 'day', 'hour', 'bx_gsm', 'by_gsm', 'bz_gsm', 'bt', 'speed', 'density', 'temp',
    'intensity', 'declination', 'inclination', 'north', 'east', 'vertical', 'horizontal'
]

# Create DataFrame for the new data point
X_new_df = pd.DataFrame(X_new_raw, columns=column_names)

# Apply imputer and scaler transformations with the correct format
X_new_imputed = pd.DataFrame(loaded_imputer.transform(X_new_df), columns=column_names)
X_new_scaled = pd.DataFrame(loaded_scaler.transform(X_new_imputed), columns=column_names)

# Drop unnecessary columns
X_new_adjusted = X_new_scaled.drop(['intensity', 'declination', 'inclination', 'north', 'east', 'vertical', 'horizontal'], axis=1)

# Predict bz_gsm_h using the loaded model
bz_gsm_h = loaded_model.predict(X_new_adjusted)
print(f'Predicted bz_gsm_h: {bz_gsm_h[0]}')

Predicted bz_gsm_h: 0.14553198346006863


In [45]:
bz_gsm_backup = X_new_scaled['bz_gsm']

# Prepare DataFrame with all features
X_new_scaled['bz_gsm'] = bz_gsm_h

# Inverse transform to get back the original value
X_new_original = loaded_scaler.inverse_transform(X_new_scaled)
bz_gsm_h_original = X_new_original[0, column_names.index('bz_gsm')]
print(f'Actual bz_gsm_h value: {bz_gsm_h_original}')
X_new_original = pd.DataFrame(X_new_original,columns=column_names)

Actual bz_gsm_h value: 2.133498877524606


In [46]:
X_new_scaled['bz_gsm'] = bz_gsm_backup

# Inverse transform to get back the original value
X_new_original = pd.DataFrame(loaded_scaler.inverse_transform(X_new_scaled),columns=column_names)

X_new_original['bz_gsm_h'] = bz_gsm_h_original
print(X_new_original.head())

   month   day  hour  bx_gsm  by_gsm  bz_gsm    bt  speed  density      temp  \
0   10.0  11.0   8.0   -3.04    1.73    2.49  8.05  389.8   16.981  136272.0   

   intensity  declination  inclination  north   east  vertical  horizontal  \
0    57307.0       84.101       89.864   14.0  135.0   57307.0       136.0   

   bz_gsm_h  
0  2.133499  


In [39]:
# Index of 'bz_gsm' in the list of column names
bz_gsm_index = column_names.index('bz_gsm')

# Get the min and max values for bz_gsm
bz_gsm_min = loaded_scaler.data_min_[bz_gsm_index]
bz_gsm_max = loaded_scaler.data_max_[bz_gsm_index]

print(f'Min value for bz_gsm: {bz_gsm_min}')
print(f'Max value for bz_gsm: {bz_gsm_max}')


Min value for bz_gsm: 0.0
Max value for bz_gsm: 14.66
