In [None]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")

In [2]:
# Install required packages (run this if libraries are missing)
import sys
import subprocess

# List of required packages
packages = ['matplotlib', 'seaborn', 'scikit-learn', 'pandas', 'numpy', 'joblib']

for package in packages:
    try:
        __import__(package)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

print("All packages installed successfully!")

Installing matplotlib...
Installing seaborn...
Installing scikit-learn...
All packages installed successfully!


In [3]:
# Import core libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# Try importing visualization libraries (optional)
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    VISUALIZATION_AVAILABLE = True
    print("All libraries imported successfully (with visualization support)!")
except ImportError:
    VISUALIZATION_AVAILABLE = False
    print("All libraries imported successfully (visualization disabled - install matplotlib and seaborn for plots)")

All libraries imported successfully (with visualization support)!


In [6]:
import pandas as pd
import os

# Specify the file path (modify this path as needed)
file_path = r"D:\Major ML Models\Datasets\tempandhumidity.csv"  # Change this to your file location
# Example paths:
# file_path = "C:/Users/YourName/Desktop/tempandhumidity.csv"  # Windows
# file_path = "/home/username/data/tempandhumidity.csv"        # Linux
# file_path = "/Users/username/Documents/tempandhumidity.csv"  # Mac

# Check if file exists
if os.path.exists(file_path):
    print(f"‚úì File found at: {file_path}")
    
    # Load the dataset
    data = pd.read_csv(file_path, header=None, names=["id", "Humidity", "Temperature"])
    
    print("‚úì Dataset loaded successfully!")
else:
    print(f"‚úó Error: File not found at {file_path}")
    print("Please check the file path and try again.")

‚úì File found at: D:\Major ML Models\Datasets\tempandhumidity.csv
‚úì Dataset loaded successfully!


In [7]:
# Check if data is loaded
if 'data' in locals():
    print("="*60)
    print("DATASET VERIFICATION")
    print("="*60)
    
    # 1. Dataset shape
    print(f"\n1. Dataset Shape: {data.shape}")
    print(f"   - Total Rows: {data.shape[0]}")
    print(f"   - Total Columns: {data.shape[1]}")
    
    # 2. Column names
    print(f"\n2. Column Names: {list(data.columns)}")
    
    # 3. Data types
    print(f"\n3. Data Types:")
    print(data.dtypes)
    
    # 4. First 10 rows
    print(f"\n4. First 10 Rows:")
    print(data.head(10))
    
    # 5. Last 10 rows
    print(f"\n5. Last 10 Rows:")
    print(data.tail(10))
    
    # 6. Random sample
    print(f"\n6. Random Sample (5 rows):")
    print(data.sample(5))
    
    # 7. Basic statistics
    print(f"\n7. Basic Statistics:")
    print(data.describe())
    
    # 8. Check for null values
    print(f"\n8. Null Values:")
    null_counts = data.isnull().sum()
    print(null_counts)
    if null_counts.sum() == 0:
        print("   ‚úì No null values found")
    
    # 9. Check for zero values (potential sensor errors)
    zero_rows = data[(data["Humidity"] == 0) & (data["Temperature"] == 0)].shape[0]
    print(f"\n9. Zero Value Analysis:")
    print(f"   - Rows with both Humidity=0 and Temperature=0: {zero_rows}")
    print(f"   - Valid data rows: {data.shape[0] - zero_rows}")
    
    # 10. Memory usage
    print(f"\n10. Memory Usage:")
    print(f"    {data.memory_usage(deep=True).sum() / 1024:.2f} KB")
    
    print("\n" + "="*60)
    print("‚úì DATA VERIFICATION COMPLETE")
    print("="*60)
else:
    print("‚úó Error: Dataset not loaded. Please run Cell 1 first.")

DATASET VERIFICATION

1. Dataset Shape: (1741, 3)
   - Total Rows: 1741
   - Total Columns: 3

2. Column Names: ['id', 'Humidity', 'Temperature']

3. Data Types:
id             object
Humidity       object
Temperature    object
dtype: object

4. First 10 Rows:
   id  Humidity  Temperature
0  id  Humidity  Temperature
1   1         0            0
2   2         0            0
3   3         0            0
4   4         0            0
5   5         0            0
6   6         0            0
7   7         0            0
8   8         0            0
9   9         0            0

5. Last 10 Rows:
        id Humidity Temperature
1731  1731     65.8        26.9
1732  1732     65.8        26.9
1733  1733     65.8        26.8
1734  1734     65.8        26.8
1735  1735     65.8        26.8
1736  1736     65.8        26.9
1737  1737     65.7        26.8
1738  1738     65.7        26.8
1739  1739     65.7        26.8
1740  1740     65.7        26.9

6. Random Sample (5 rows):
        id Humidity Te

In [8]:
# Remove rows where both Humidity and Temperature are zero (invalid sensor readings)
data_clean = data[(data["Humidity"] != 0) & (data["Temperature"] != 0)].copy()

# Drop the ID column (not needed for modeling)
data_clean = data_clean.drop(columns=["id"])

# Reset index after cleaning
data_clean = data_clean.reset_index(drop=True)

print("DATA CLEANING SUMMARY")
print("="*60)
print(f"Original Dataset Size: {data.shape[0]} rows")
print(f"Cleaned Dataset Size: {data_clean.shape[0]} rows")
print(f"Rows Removed: {data.shape[0] - data_clean.shape[0]}")
print(f"Data Retention Rate: {(data_clean.shape[0]/data.shape[0])*100:.2f}%")

print("\n‚úì Cleaned Data Preview:")
print(data_clean.head(10))

print("\n‚úì Cleaned Data Statistics:")
print(data_clean.describe())

DATA CLEANING SUMMARY
Original Dataset Size: 1741 rows
Cleaned Dataset Size: 1741 rows
Rows Removed: 0
Data Retention Rate: 100.00%

‚úì Cleaned Data Preview:
   Humidity  Temperature
0  Humidity  Temperature
1         0            0
2         0            0
3         0            0
4         0            0
5         0            0
6         0            0
7         0            0
8         0            0
9         0            0

‚úì Cleaned Data Statistics:
       Humidity Temperature
count      1741        1741
unique      239          73
top           0           0
freq        721         721


In [10]:
# Convert columns to numeric, coercing errors to NaN
data_clean["Humidity"] = pd.to_numeric(data_clean["Humidity"], errors='coerce')
data_clean["Temperature"] = pd.to_numeric(data_clean["Temperature"], errors='coerce')

# Drop rows with NaN values after conversion
data_clean = data_clean.dropna(subset=["Humidity", "Temperature"]).reset_index(drop=True)

# Now proceed with feature split
X = data_clean[["Humidity"]].copy()
y = data_clean["Temperature"].copy()

print("FEATURE PREPARATION")
print("="*60)

print(f"\nFeatures (X) - Humidity:")
print(f"  Shape: {X.shape}")
print(f"  Type: {type(X)}")
print(f"  Column names: {list(X.columns)}")
print(f"  Range: {X['Humidity'].min():.2f} to {X['Humidity'].max():.2f}")
print(f"  First 5 values:\n{X.head()}")

print(f"\nTarget (y) - Temperature:")
print(f"  Shape: {y.shape}")
print(f"  Type: {type(y)}")
print(f"  Range: {y.min():.2f} to {y.max():.2f}")
print(f"  First 5 values:\n{y.head()}")

print("\n" + "="*60)
print("‚úì Data ready for train-test split!")

FEATURE PREPARATION

Features (X) - Humidity:
  Shape: (1740, 1)
  Type: <class 'pandas.core.frame.DataFrame'>
  Column names: ['Humidity']
  Range: 0.00 to 89.50
  First 5 values:
   Humidity
0       0.0
1       0.0
2       0.0
3       0.0
4       0.0

Target (y) - Temperature:
  Shape: (1740,)
  Type: <class 'pandas.core.series.Series'>
  Range: 0.00 to 31.80
  First 5 values:
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Temperature, dtype: float64

‚úì Data ready for train-test split!


In [11]:
data_clean = data[(data["Humidity"] != 0) & (data["Temperature"] != 0)].copy()

In [13]:
import pandas as pd
import numpy as np

print("DATA CLEANING - REMOVING ZERO VALUES")
print("="*70)

# Convert columns to numeric type
data_clean["Humidity"] = pd.to_numeric(data_clean["Humidity"], errors='coerce')
data_clean["Temperature"] = pd.to_numeric(data_clean["Temperature"], errors='coerce')

print(f"\n1. After numeric conversion:")
print(f"   Dataset shape: {data_clean.shape}")

# Remove rows where BOTH Humidity AND Temperature are 0
initial_rows = data_clean.shape[0]
data_clean = data_clean[(data_clean["Humidity"] != 0) & (data_clean["Temperature"] != 0)]

print(f"\n2. After removing rows with both Humidity=0 and Temperature=0:")
print(f"   Rows removed: {initial_rows - data_clean.shape[0]}")
print(f"   Dataset shape: {data_clean.shape}")

# Remove rows with any NaN values
initial_rows = data_clean.shape[0]
data_clean = data_clean.dropna(subset=["Humidity", "Temperature"])

print(f"\n3. After removing NaN values:")
print(f"   Rows removed: {initial_rows - data_clean.shape[0]}")
print(f"   Dataset shape: {data_clean.shape}")

# Reset index
data_clean = data_clean.reset_index(drop=True)

print(f"\n" + "="*70)
print("FINAL DATA STATISTICS")
print("="*70)

print(f"\nDataset Shape: {data_clean.shape[0]} rows √ó {data_clean.shape[1]} columns")
print(f"\nHumidity Statistics:")
print(f"  Min:    {data_clean['Humidity'].min():.2f}%")
print(f"  Max:    {data_clean['Humidity'].max():.2f}%")
print(f"  Mean:   {data_clean['Humidity'].mean():.2f}%")
print(f"  Std:    {data_clean['Humidity'].std():.2f}%")
print(f"  Median: {data_clean['Humidity'].median():.2f}%")

print(f"\nTemperature Statistics:")
print(f"  Min:    {data_clean['Temperature'].min():.2f}¬∞C")
print(f"  Max:    {data_clean['Temperature'].max():.2f}¬∞C")
print(f"  Mean:   {data_clean['Temperature'].mean():.2f}¬∞C")
print(f"  Std:    {data_clean['Temperature'].std():.2f}¬∞C")
print(f"  Median: {data_clean['Temperature'].median():.2f}¬∞C")

print(f"\n" + "="*70)
print("FINAL DATA PREVIEW")
print("="*70)

print(f"\nFirst 15 rows:")
print(data_clean.head(15))

print(f"\nLast 15 rows:")
print(data_clean.tail(15))

print(f"\nRandom sample (10 rows):")
print(data_clean.sample(min(10, len(data_clean))))

print(f"\n" + "="*70)
print("FEATURE AND TARGET PREPARATION")
print("="*70)

# Prepare features and target
X = data_clean[["Humidity"]].copy()
y = data_clean["Temperature"].copy()

print(f"\nFeatures (X) - Humidity:")
print(f"  Shape: {X.shape}")
print(f"  Data type: {X['Humidity'].dtype}")
print(f"  Range: {X['Humidity'].min():.2f}% to {X['Humidity'].max():.2f}%")

print(f"\nTarget (y) - Temperature:")
print(f"  Shape: {y.shape}")
print(f"  Data type: {y.dtype}")
print(f"  Range: {y.min():.2f}¬∞C to {y.max():.2f}¬∞C")

print(f"\n" + "="*70)
print("‚úì DATA CLEANING COMPLETE - READY FOR TRAIN-TEST SPLIT!")
print("="*70)

DATA CLEANING - REMOVING ZERO VALUES

1. After numeric conversion:
   Dataset shape: (1741, 3)

2. After removing rows with both Humidity=0 and Temperature=0:
   Rows removed: 721
   Dataset shape: (1020, 3)

3. After removing NaN values:
   Rows removed: 1
   Dataset shape: (1019, 3)

FINAL DATA STATISTICS

Dataset Shape: 1019 rows √ó 3 columns

Humidity Statistics:
  Min:    5.70%
  Max:    89.50%
  Mean:   70.99%
  Std:    9.78%
  Median: 69.60%

Temperature Statistics:
  Min:    1.00¬∞C
  Max:    31.80¬∞C
  Mean:   27.17¬∞C
  Std:    3.41¬∞C
  Median: 27.00¬∞C

FINAL DATA PREVIEW

First 15 rows:
    id  Humidity  Temperature
0   10      82.9         25.2
1   11      85.1         25.3
2   12      85.8         25.4
3   13      85.8         25.5
4   14      86.0         25.6
5   15      85.8         25.7
6   16      85.9         25.8
7   17      85.9         25.8
8   18      85.7         25.8
9   19      85.7         25.9
10  20      85.7         25.9
11  21      85.7         25.9
12 

In [14]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      # 20% for testing
    random_state=42,    # For reproducibility
    shuffle=True        # Shuffle before splitting
)

print("TRAIN-TEST SPLIT")
print("="*60)
print(f"Training Set:")
print(f"  X_train shape: {X_train.shape} ({(X_train.shape[0]/X.shape[0])*100:.1f}%)")
print(f"  y_train shape: {y_train.shape}")

print(f"\nTesting Set:")
print(f"  X_test shape: {X_test.shape} ({(X_test.shape[0]/X.shape[0])*100:.1f}%)")
print(f"  y_test shape: {y_test.shape}")

print(f"\nTotal samples: {X.shape[0]}")
print("‚úì Data split successfully!")

TRAIN-TEST SPLIT
Training Set:
  X_train shape: (815, 1) (80.0%)
  y_train shape: (815,)

Testing Set:
  X_test shape: (204, 1) (20.0%)
  y_test shape: (204,)

Total samples: 1019
‚úì Data split successfully!


In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Initialize models with optimized hyperparameters for high accuracy
models = {
    "Linear Regression": LinearRegression(),
    
    "Random Forest": RandomForestRegressor(
        n_estimators=200,        # Number of trees
        max_depth=10,            # Maximum tree depth
        min_samples_split=5,     # Minimum samples to split
        min_samples_leaf=2,      # Minimum samples per leaf
        random_state=42,
        n_jobs=-1                # Use all CPU cores
    ),
    
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200,        # Number of boosting stages
        learning_rate=0.1,       # Learning rate
        max_depth=5,             # Maximum tree depth
        min_samples_split=5,
        random_state=42
    )
}

# Train and evaluate each model
results = {}

print("MODEL TRAINING AND EVALUATION")
print("="*60)

for name, model in models.items():
    print(f"\nüîÑ Training {name}...")
    
    # Train model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Calculate performance metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    # Calculate accuracy percentage (custom metric for regression)
    # Accuracy = 100 - (mean absolute percentage error)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    accuracy = 100 - mape
    
    # Store results
    results[name] = {
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "Accuracy": accuracy,
        "Model": model
    }
    
    print(f"‚úì {name} trained successfully!")
    print(f"  MAE:      {mae:.4f}¬∞C (Lower is better)")
    print(f"  RMSE:     {rmse:.4f}¬∞C (Lower is better)")
    print(f"  R¬≤ Score: {r2:.4f} (Higher is better, max=1.0)")
    print(f"  Accuracy: {accuracy:.2f}%")

print("\n" + "="*60)
print("‚úì All models trained successfully!")

MODEL TRAINING AND EVALUATION

üîÑ Training Linear Regression...
‚úì Linear Regression trained successfully!
  MAE:      1.8933¬∞C (Lower is better)
  RMSE:     3.3050¬∞C (Lower is better)
  R¬≤ Score: 0.2529 (Higher is better, max=1.0)
  Accuracy: 72.43%

üîÑ Training Random Forest...
‚úì Random Forest trained successfully!
  MAE:      0.5808¬∞C (Lower is better)
  RMSE:     0.8521¬∞C (Lower is better)
  R¬≤ Score: 0.9503 (Higher is better, max=1.0)
  Accuracy: 97.43%

üîÑ Training Gradient Boosting...
‚úì Gradient Boosting trained successfully!
  MAE:      0.5945¬∞C (Lower is better)
  RMSE:     0.8802¬∞C (Lower is better)
  R¬≤ Score: 0.9470 (Higher is better, max=1.0)
  Accuracy: 97.36%

‚úì All models trained successfully!


In [None]:
# Find the model with the highest R¬≤ score
best_model_name = max(results, key=lambda x: results[x]["R2"])
best_model = results[best_model_name]["Model"]

print("üèÜ BEST MODEL SELECTED")

print(f"\nModel: {best_model_name}")
print(f"\nPerformance Metrics:")
print(f"  R¬≤ Score: {results[best_model_name]['R2']:.4f}")
print(f"  MAE:      {results[best_model_name]['MAE']:.4f}¬∞C")
print(f"  RMSE:     {results[best_model_name]['RMSE']:.4f}¬∞C")
print(f"  Accuracy: {results[best_model_name]['Accuracy']:.2f}%")

print("\n" + "="*60)
print("‚úì Best model ready for deployment!")


üèÜ BEST MODEL SELECTED

Model: Random Forest

Performance Metrics:
  R¬≤ Score: 0.9503
  MAE:      0.5808¬∞C
  RMSE:     0.8521¬∞C
  Accuracy: 97.43%

‚úì Best model ready for deployment!


In [None]:
# Detailed predictions analysis
y_pred_best = best_model.predict(X_test)

# Calculate prediction errors
errors = y_test - y_pred_best
abs_errors = np.abs(errors)

print("PREDICTION ANALYSIS")

print("\nError Statistics:")
print(f"  Mean Error: {errors.mean():.4f}¬∞C")
print(f"  Std Error:  {errors.std():.4f}¬∞C")
print(f"  Max Error:  {abs_errors.max():.4f}¬∞C")
print(f"  Min Error:  {abs_errors.min():.4f}¬∞C")

# Show sample predictions
print("\nSample Predictions (first 10):")
print(f"{'Actual':<12} {'Predicted':<12} {'Error':<12}")

for i in range(min(10, len(y_test))):
    actual = y_test.iloc[i]
    predicted = y_pred_best[i]
    error = actual - predicted
    print(f"{actual:<12.2f} {predicted:<12.2f} {error:<12.2f}")

print("\n‚úì Prediction analysis complete!")

In [None]:
import joblib

# Save the best model as joblib file (recommended for Airflow)
model_filename = "temperature_model.joblib"
joblib.dump(best_model, model_filename)

print(f"‚úì Model saved as: {model_filename}")
print(f"‚úì Model type: {best_model_name}")
print(f"‚úì File size: {os.path.getsize(model_filename) / 1024:.2f} KB")
print(f"‚úì Location: {os.path.abspath(model_filename)}")

In [None]:
# Load the saved model to verify it works
loaded_model = joblib.load("temperature_model.joblib")

# Test predictions with sample humidity values
test_samples = [
    [75.5],   # Sample 1
    [80.0],   # Sample 2
    [85.2],   # Sample 3
    [70.3],   # Sample 4
    [90.0]    # Sample 5
]

print("MODEL TESTING")
print("="*60)
print("‚úì Model loaded successfully!")
print("\nTest Predictions:")
print(f"{'Humidity (%)':<15} {'Predicted Temp (¬∞C)':<20}")
print("-" * 35)

for sample in test_samples:
    predicted_temp = loaded_model.predict([sample])
    print(f"{sample[0]:<15.1f} {predicted_temp[0]:<20.2f}")



MODEL TESTING
‚úì Model loaded successfully!

Test Predictions:
Humidity (%)    Predicted Temp (¬∞C) 
-----------------------------------
75.5            27.16               
80.0            26.73               
85.2            25.92               
70.3            26.58               
90.0            25.07               

‚úì Model is working correctly!
‚úì Ready for production use in Apache Airflow!


In [None]:
# Load the saved model
loaded_model = joblib.load("temperature_model.joblib")

print("MODEL PREDICTION TESTING")
print("="*70)

# Test with sample humidity values
test_samples = [
    [70.0],   # Low humidity
    [75.5],   # Medium humidity
    [80.0],   # Higher humidity
    [85.2],   # High humidity
    [90.0]    # Very high humidity
]

print("\nTest Predictions with different humidity values:\n")
print(f"{'Humidity (%)':<20} {'Predicted Temp (¬∞C)':<25}")
print("-" * 45)

for sample in test_samples:
    predicted_temp = loaded_model.predict([sample])
    print(f"{sample[0]:<20.1f} {predicted_temp[0]:<25.2f}")

print(f"\n‚úì Model predictions working correctly!")

In [7]:
import pandas as pd

# Update this with the correct file path if needed
file_path = r"D:\Major ML Models\Datasets\tempandhumidity.csv"  # Example path

# Load data
data = pd.read_csv(file_path)

print("Data loaded successfully!")
print(f"Shape: {data.shape}")
print("First few rows:")
print(data.head())

Data loaded successfully!
Shape: (1743, 8)
First few rows:
    id  Humidity  Temperature  Unnamed: 3  Unnamed: 4  Unnamed: 5  Unnamed: 6  \
0  1.0       0.0          0.0         NaN         NaN         NaN         NaN   
1  2.0       0.0          0.0         NaN         NaN         NaN         NaN   
2  3.0       0.0          0.0         NaN         NaN         NaN         NaN   
3  4.0       0.0          0.0         NaN         NaN         NaN         NaN   
4  5.0       0.0          0.0         NaN         NaN         NaN         NaN   

  Unnamed: 7  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  


In [8]:
print("DATA CLEANING AND FEATURE PREPARATION")
print("="*70)

# Remove rows where either Temperature or Humidity is zero
data_reg = data[(data["Temperature"] != 0) & (data["Humidity"] != 0)].copy()

# Feature and Target
X = data_reg[["Temperature"]]  # Input feature(s)
y = data_reg["Humidity"]       # Target variable

print(f"Cleaned dataset shape for regression: {data_reg.shape}")
print(f"Feature shape: {X.shape}, Target shape: {y.shape}")

print("\nSample of cleaned dataset:")
print(data_reg.head())

print("\n‚úì Data cleaning and feature preparation complete!")

DATA CLEANING AND FEATURE PREPARATION
Cleaned dataset shape for regression: (1022, 8)
Feature shape: (1022, 1), Target shape: (1022,)

Sample of cleaned dataset:
      id  Humidity  Temperature  Unnamed: 3  Unnamed: 4  Unnamed: 5  \
9   10.0      82.9         25.2         NaN         NaN         NaN   
10  11.0      85.1         25.3         NaN         NaN         NaN   
11  12.0      85.8         25.4         NaN         NaN         NaN   
12  13.0      85.8         25.5         NaN         NaN         NaN   
13  14.0      86.0         25.6         NaN         NaN         NaN   

    Unnamed: 6 Unnamed: 7  
9          NaN        NaN  
10         NaN        NaN  
11         NaN        NaN  
12         NaN        NaN  
13         NaN        NaN  

‚úì Data cleaning and feature preparation complete!


In [9]:
# Keep only 'Humidity' and 'Temperature' columns
data_clean = data.loc[:, ['Humidity', 'Temperature']]

# Drop rows with missing or null values if any
data_clean = data_clean.dropna().reset_index(drop=True)

print("Cleaned Data Sample:")
print(data_clean.head())

print(f"\nData Shape: {data_clean.shape}")

Cleaned Data Sample:
   Humidity  Temperature
0       0.0          0.0
1       0.0          0.0
2       0.0          0.0
3       0.0          0.0
4       0.0          0.0

Data Shape: (1740, 2)


In [10]:
# Remove rows where either Humidity or Temperature is zero
data_no_zeros = data_clean[(data_clean['Humidity'] != 0) & (data_clean['Temperature'] != 0)].copy()

# Reset index after filtering
data_no_zeros = data_no_zeros.reset_index(drop=True)

print("Data after removing zeros:")
print(data_no_zeros.head())

print(f"\nData shape after removing zeros: {data_no_zeros.shape}")

Data after removing zeros:
   Humidity  Temperature
0      82.9         25.2
1      85.1         25.3
2      85.8         25.4
3      85.8         25.5
4      86.0         25.6

Data shape after removing zeros: (1019, 2)


In [11]:
from sklearn.model_selection import train_test_split

# Features (Temperature) and Target (Humidity)
X = data_no_zeros[["Temperature"]]
y = data_no_zeros["Humidity"]

# Split data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

print("Train-Test Split Summary:")
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

print("\nSample training features:")
print(X_train.head())

print("\nSample training targets:")
print(y_train.head())

print("\n‚úì Train-test split complete and ready for model training!")

Train-Test Split Summary:
Training samples: 815
Testing samples: 204

Sample training features:
     Temperature
837         26.3
137         26.8
694         30.3
667         29.5
926         26.8

Sample training targets:
837    69.6
137    81.6
694    61.8
667    64.0
926    66.3
Name: Humidity, dtype: float64

‚úì Train-test split complete and ready for model training!


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

results = {}

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    # Predict
    y_pred = model.predict(X_test)
    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Store results
    results[name] = {"mae": mae, "rmse": rmse, "r2": r2}
    print(f"\n{name} Model Performance:")
    print(f"  MAE:  {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R¬≤:   {r2:.4f}")


Linear Regression Model Performance:
  MAE:  6.8994
  RMSE: 9.2086
  R¬≤:   0.2550

Random Forest Model Performance:
  MAE:  3.1732
  RMSE: 4.7831
  R¬≤:   0.7990

Gradient Boosting Model Performance:
  MAE:  3.1860
  RMSE: 4.8032
  R¬≤:   0.7973


In [15]:
import joblib

# Save the best model
best_model = models["Random Forest"]
joblib.dump(best_model, "humidity_from_temperature_model.joblib")

print("Random Forest model saved as 'humidity_from_temperature_model.joblib'")

Random Forest model saved as 'humidity_from_temperature_model.joblib'


In [16]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the saved model
loaded_model = joblib.load("humidity_from_temperature_model.joblib")

# Use your X_test data (Temperature) to do prediction
y_pred = loaded_model.predict(X_test)

# Evaluate predictions
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Prediction Performance on Test Set:")
print(f"  MAE:  {mae:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  R¬≤:   {r2:.4f}")

# Sample predictions vs actual
comparison_df = pd.DataFrame({
    'Temperature (Input)': X_test['Temperature'],
    'Actual Humidity': y_test,
    'Predicted Humidity': y_pred
})

print("\nSample predictions:")
print(comparison_df.head(15))

Prediction Performance on Test Set:
  MAE:  3.1732
  RMSE: 4.7831
  R¬≤:   0.7990

Sample predictions:
     Temperature (Input)  Actual Humidity  Predicted Humidity
522                 27.5             72.1           71.634689
453                 27.9             73.2           72.339525
439                 27.3             72.4           72.271367
31                  26.4             84.7           75.324898
615                 27.6             73.0           71.635550
584                  1.9              5.7           25.379000
442                 27.3             72.5           72.271367
731                 25.6             73.1           87.204997
76                  26.1             81.9           82.203181
493                 27.7             71.4           71.859137
717                 30.1             61.0           63.947436
70                  27.0             81.6           70.710797
879                 26.8             66.7           68.727518
101                 26.7     

In [18]:
import os
import joblib

print("HUMIDITY MODEL VERIFICATION")
print("="*70)

# Check if humidity model file exists
humidity_model_filename = "humidity_from_temperature_model.joblib"

if os.path.exists(humidity_model_filename):
    file_size = os.path.getsize(humidity_model_filename)
    print(f"‚úì Model file found: {humidity_model_filename}")
    print(f"  ‚îú‚îÄ File size: {file_size / 1024:.2f} KB")
    print(f"  ‚îú‚îÄ Location: {os.path.abspath(humidity_model_filename)}")
    print(f"  ‚îî‚îÄ Status: Ready for deployment")
else:
    print(f"‚úó Model file not found: {humidity_model_filename}")

print(f"\n" + "="*70)
print("‚úì Humidity model saved successfully!")
print("="*70)

HUMIDITY MODEL VERIFICATION
‚úì Model file found: humidity_from_temperature_model.joblib
  ‚îú‚îÄ File size: 860.72 KB
  ‚îú‚îÄ Location: C:\Users\mbhar\humidity_from_temperature_model.joblib
  ‚îî‚îÄ Status: Ready for deployment

‚úì Humidity model saved successfully!
