In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the CSV file
data = pd.read_csv('C:/Users/ploke/Music/Weather_Data.csv')

# Strip any leading/trailing whitespace characters from column names
data.columns = data.columns.str.strip()

# Print column names to verify
print(data.columns)

# Check if the required columns exist
required_columns = ['MaxT', 'MinT', 'WindSpeed', 'Humidity']
if all(column in data.columns for column in required_columns):
    # Fill missing values (forward fill)
    data.ffill(inplace=True)
    
    # Calculate average temperature
    data['AvgT'] = (data['MaxT'] + data['MinT']) / 2
    
    # Feature engineering (example: compute a heat index-like feature)
    data['heat_index'] = 0.5 * (data['AvgT'] + 61.0 + ((data['AvgT'] - 68.0) * 1.2) + (data['Humidity'] * 0.094))
    
    # Normalize the data
    scaler = MinMaxScaler()
    data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']] = scaler.fit_transform(data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']])
    
    print(data.head())
else:
    print("One or more required columns ('MaxT', 'MinT', 'WindSpeed', 'Humidity') are missing in the dataset.")


Index(['Date', 'MaxT', 'MinT', 'WindSpeed', 'Humidity', 'Precipitation'], dtype='object')
                  Date      MaxT      MinT  WindSpeed  Humidity  \
0  2006-01-08 00:00:00  0.291188  0.397849   0.148649  0.614551   
1  2006-01-09 00:00:00  0.314176  0.354839   0.148649  0.571207   
2  2006-01-10 00:00:00  0.306513  0.360215   0.162162  0.537152   
3  2006-01-11 00:00:00  0.302682  0.344086   0.135135  0.595975   
4  2006-01-12 00:00:00  0.295019  0.327957   0.216216  0.603715   

   Precipitation      AvgT  heat_index  
0            0.0  0.269129    0.285281  
1            0.0  0.263852    0.272804  
2            0.0  0.261214    0.264623  
3            0.0  0.250660    0.262485  
4            0.0  0.237467    0.249496  


In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the CSV file
data = pd.read_csv('C:/Users/ploke/Music/Weather_Data.csv')

# Strip any leading/trailing whitespace characters from column names
data.columns = data.columns.str.strip()

# Print column names to verify
print(data.columns)

# Check if the required columns exist
required_columns = ['MaxT', 'MinT', 'WindSpeed', 'Humidity']
if all(column in data.columns for column in required_columns):
    # Fill missing values (forward fill)
    data.ffill(inplace=True)
    
    # Calculate average temperature
    data['AvgT'] = (data['MaxT'] + data['MinT']) / 2
    
    # Feature engineering (example: compute a heat index-like feature)
    data['heat_index'] = 0.5 * (data['AvgT'] + 61.0 + ((data['AvgT'] - 68.0) * 1.2) + (data['Humidity'] * 0.094))
    
    # Normalize the data
    scaler = MinMaxScaler()
    data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']] = scaler.fit_transform(data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']])
    
    # Define the threshold for a heat wave (example: AvgT > 30°C)
    threshold = 30
    
    # Define the target variable
    data['heat_wave'] = (data['AvgT'] > threshold).astype(int)
    
    # Split the data into features and target
    X = data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']]
    y = data['heat_wave']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    
    # Evaluate the model
    accuracy = clf.score(X_test, y_test)
    print(f"Model Accuracy: {accuracy:.2f}")
else:
    print("One or more required columns ('MaxT', 'MinT', 'WindSpeed', 'Humidity') are missing in the dataset.")


Index(['Date', 'MaxT', 'MinT', 'WindSpeed', 'Humidity', 'Precipitation'], dtype='object')
Model Accuracy: 1.00


In [12]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the CSV file
data = pd.read_csv('C:/Users/ploke/Music/Weather_Data.csv')

# Strip any leading/trailing whitespace characters from column names
data.columns = data.columns.str.strip()

# Print column names to verify
print(data.columns)

# Check if the required columns exist
required_columns = ['MaxT', 'MinT', 'WindSpeed', 'Humidity']
if all(column in data.columns for column in required_columns):
    # Fill missing values (forward fill)
    data.ffill(inplace=True)
    
    # Calculate average temperature
    data['AvgT'] = (data['MaxT'] + data['MinT']) / 2
    
    # Feature engineering (example: compute a heat index-like feature)
    data['heat_index'] = 0.5 * (data['AvgT'] + 61.0 + ((data['AvgT'] - 68.0) * 1.2) + (data['Humidity'] * 0.094))
    
    # Normalize the data
    scaler = MinMaxScaler()
    data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']] = scaler.fit_transform(data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']])
    
    # Define the threshold for a heat wave (example: AvgT > 30°C)
    threshold = 30
    
    # Define the target variable
    data['heat_wave'] = (data['AvgT'] > threshold).astype(int)
    
    # Check the distribution of the target variable
    print(data['heat_wave'].value_counts())
    
    # Split the data into features and target
    X = data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']]
    y = data['heat_wave']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Check the distribution of the predictions
    print(pd.Series(y_pred).value_counts())
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

else:
    print("One or more required columns ('MaxT', 'MinT', 'WindSpeed', 'Humidity') are missing in the dataset.")


Index(['Date', 'MaxT', 'MinT', 'WindSpeed', 'Humidity', 'Precipitation'], dtype='object')
heat_wave
0    6236
Name: count, dtype: int64
0    1248
Name: count, dtype: int64
Accuracy: 1.0
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


In [8]:
# Define a new, lower threshold for a heat wave
threshold = 15  # Try a further lower threshold

# Apply the new threshold to create the 'heat_wave' target variable
data['heat_wave'] = (data['AvgT'] > threshold).astype(int)

# Check the distribution of the target variable
print(data['heat_wave'].value_counts())


heat_wave
0    6236
Name: count, dtype: int64


In [9]:
# Check for instances where 'AvgT' exceeds the threshold
print(data[data['AvgT'] > threshold].shape)  # Should be non-zero
print(data[data['AvgT'] > threshold][['Date', 'MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_wave']].head())


(0, 9)
Empty DataFrame
Columns: [Date, MaxT, MinT, WindSpeed, Humidity, AvgT, heat_wave]
Index: []


In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the CSV file
data = pd.read_csv('C:/Users/ploke/Music/Weather_Data.csv')

# Strip any leading/trailing whitespace characters from column names
data.columns = data.columns.str.strip()

# Print column names to verify
print(data.columns)

# Check if the required columns exist
required_columns = ['MaxT', 'MinT', 'WindSpeed', 'Humidity']
if all(column in data.columns for column in required_columns):
    # Fill missing values (forward fill)
    data.ffill(inplace=True)
    
    # Calculate average temperature
    data['AvgT'] = (data['MaxT'] + data['MinT']) / 2
    
    # Feature engineering (example: compute a heat index-like feature)
    data['heat_index'] = 0.5 * (data['AvgT'] + 61.0 + ((data['AvgT'] - 68.0) * 1.2) + (data['Humidity'] * 0.094))
    
    # Normalize the data
    scaler = MinMaxScaler()
    data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']] = scaler.fit_transform(data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']])
    
    # Explore the distribution of AvgT to choose an appropriate threshold
    print(data['AvgT'].describe())
    
    # Define the threshold for a heat wave (example: AvgT > 0.75, since data is normalized)
    threshold = 0.75
    
    # Define the target variable
    data['heat_wave'] = (data['AvgT'] > threshold).astype(int)
    
    # Check the distribution of the target variable
    print(data['heat_wave'].value_counts())
    
    # Split the data into features and target
    X = data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']]
    y = data['heat_wave']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Check the distribution of the predictions
    print(pd.Series(y_pred).value_counts())
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

else:
    print("One or more required columns ('MaxT', 'MinT', 'WindSpeed', 'Humidity') are missing in the dataset.")


Index(['Date', 'MaxT', 'MinT', 'WindSpeed', 'Humidity', 'Precipitation'], dtype='object')
count    6236.000000
mean        0.448121
std         0.153598
min         0.000000
25%         0.329815
50%         0.448549
75%         0.548813
max         1.000000
Name: AvgT, dtype: float64
heat_wave
0    6055
1     181
Name: count, dtype: int64
0    1215
1      33
Name: count, dtype: int64
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Load the CSV file
data = pd.read_csv('C:/Users/ploke/Music/Weather_Data.csv')

# Strip any leading/trailing whitespace characters from column names
data.columns = data.columns.str.strip()

# Print column names to verify
print(data.columns)

# Check if the required columns exist
required_columns = ['MaxT', 'MinT', 'WindSpeed', 'Humidity']
if all(column in data.columns for column in required_columns):
    # Fill missing values (forward fill)
    data.ffill(inplace=True)
    
    # Calculate average temperature
    data['AvgT'] = (data['MaxT'] + data['MinT']) / 2
    
    # Feature engineering (example: compute a heat index-like feature)
    data['heat_index'] = 0.5 * (data['AvgT'] + 61.0 + ((data['AvgT'] - 68.0) * 1.2) + (data['Humidity'] * 0.094))
    
    # Normalize the data
    scaler = MinMaxScaler()
    data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']] = scaler.fit_transform(data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']])
    
    # Save the scaler
    joblib.dump(scaler, 'scaler.pkl')
    print("Scaler saved as 'scaler.pkl'")
    
    # Explore the distribution of AvgT to choose an appropriate threshold
    print(data['AvgT'].describe())
    
    # Define the threshold for a heat wave (example: AvgT > 0.75, since data is normalized)
    threshold = 0.75
    
    # Define the target variable
    data['heat_wave'] = (data['AvgT'] > threshold).astype(int)
    
    # Check the distribution of the target variable
    print(data['heat_wave'].value_counts())
    
    # Split the data into features and target
    X = data[['MaxT', 'MinT', 'WindSpeed', 'Humidity', 'AvgT', 'heat_index']]
    y = data['heat_wave']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a Random Forest Classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Check the distribution of the predictions
    print(pd.Series(y_pred).value_counts())
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    
    # Save the trained model
    joblib.dump(model, 'heat_wave_model.pkl')
    print("Model saved as 'heat_wave_model.pkl'")
else:
    print("One or more required columns ('MaxT', 'MinT', 'WindSpeed', 'Humidity') are missing in the dataset.")


Index(['MaxT', 'MinT', 'WindSpeed', 'Humidity'], dtype='object')
Scaler saved as 'scaler.pkl'
count    6236.000000
mean        0.448121
std         0.153598
min         0.000000
25%         0.329815
50%         0.448549
75%         0.548813
max         1.000000
Name: AvgT, dtype: float64
heat_wave
0    6055
1     181
Name: count, dtype: int64
0    1215
1      33
Name: count, dtype: int64
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Model saved as 'heat_wave_model.pkl'
