In [1]:
# Insert the library
import boto3
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt
import pickle as pkl

# # Initialization boto3
# s3_client = boto3.client('s3')

# # Set the variable bucket and file
# bucket_name = 'yourfirehosebucket'
# bucket_models = 'yourmodelsbucket'
# file_key = 'yourdatajson'

# # Read file JSON from S3 Bucket
# response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
# content = response['Body']
# json_data = json.loads(content.read())

# Convert JSON to Data Frame
df = pd.read_json("./data.json")
print(df.head())

# Convert timestamp coloumn to Datetime Format
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Convert coloumn timestamp to integer format(number of seconds since epoch)
df['timestamp'] = df['timestamp'].astype('int64') // 10**9

# Define features all data and the target is fire_intensity
features = ['timestamp', 'temperature', 'humidity', 'fire_intensity', 'gas_concentration', 'wind_speed', 'distance']
target = 'fire_intensity'

# Delete coloumn target from features
X = df[features].drop(columns=[target])
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction test data
y_pred = model.predict(X_test)

# Model Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Visualization prediction and actual result
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.xlabel('Actual Fire Intensity')
plt.ylabel('Predicted Fire Intensity')
plt.title('Actual vs Predicted Fire Intensity')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.show()

# Save model to pkl local file
with open("model.pkl", "wb") as file:
  pkl.dump(model, file)

# Upload model to S3 Model



                   timestamp  temperature  humidity  fire_intensity  \
0 2024-12-31 14:20:07.962379        25.25     49.92           33.35   
1 2024-12-31 14:20:17.963108        24.58     51.18           91.16   
2 2024-12-31 14:20:27.965962        20.17     45.34           61.96   
3 2024-12-31 14:20:37.967491        25.61     44.25           31.03   
4 2024-12-31 14:20:47.969466        26.92     48.97           21.80   

   gas_concentration  distance  wind_speed  
0              23.65    480.46         NaN  
1               9.98    193.49         NaN  
2             292.84     33.01         NaN  
3             444.77     45.70         NaN  
4             333.97     21.02         NaN  


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Insert the library
import boto3
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt
import pickle as pkl

# Initialization boto3
s3_client = boto3.client('s3')

# Set the variable bucket and file
# bucket_name = 'lks-bucketfirehose-jawatimur'
# bucket_models = 'lks-bucketmodel-jawatimur'
# file_key = 'iot-data/json/data.json'

# # Read file JSON from S3 Bucket
# response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
# content = response['Body']
# json_data = json.loads(content.read())

# Convert JSON to Data Frame
df = pd.read_json("s3://lks-bucketfirehose-jawatimur/iot-data/json/data.json")
print(df.head())

# Convert timestamp coloumn to Datetime Format
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Convert coloumn timestamp to integer format(number of seconds since epoch)
df['timestamp'] = df['timestamp'].astype('int64') // 10**9

# Define features all data and the target is fire_intensity
features = ['timestamp', 'temperature', 'humidity', 'fire_intensity', 'gas_concentration', 'wind_speed', 'distance']
target = 'fire_intensity'

# Delete coloumn target from features
X = df[features].drop(columns=[target])
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Prediction test data
y_pred = model.predict(X_test)

# Model Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Visualization prediction and actual result
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7)
plt.xlabel('Actual Fire Intensity')
plt.ylabel('Predicted Fire Intensity')
plt.title('Actual vs Predicted Fire Intensity')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)
plt.show()

# Save model to pkl local file
with open("model.pkl", "wb") as file:
  pkl.dump(model, file)

# Upload model to S3 ModelR
s3_client.upload_file("model.pkl", "lks-bucketmodel-jawatimur", "model.pkl")