In [24]:
# Step 1: Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Load and Preprocess Dataset
#training_file = "D:/Data Science and Data Engineering/Semester 1/Period 2/Statistical Machine Learning/SML_Project/training_data_fall2024.csv"
training_file = "training_data_fall2024.csv"
data = pd.read_csv(training_file)

# Encode the target variable
label_encoder = LabelEncoder()
data['increase_stock'] = label_encoder.fit_transform(data['increase_stock'])

# Step 3: Split Data into Features and Target
X = data.drop(columns=['increase_stock'])
y = data['increase_stock']

# Step 4: Split Dataset into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Step 5: Train Random Forest Model with Class Weights
rf_classifier = RandomForestClassifier(
    class_weight='balanced',  
    random_state=0
)

# Train the model
rf_classifier.fit(X_train, y_train)

# Step 6: Evaluate Model Predictions
y_pred = rf_classifier.predict(X_test)

# Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=['Decrease', 'Increase'])
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)

# Step 7: Feature Importance
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_classifier.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("Feature Importances:\n", feature_importances)

# Step 8: Hyperparameter Tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=0
)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Score:", random_search.best_score_)

# Step 9: Cross-Validation with New Features
X['temp_dew_diff'] = X['temp'] - X['dew']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

predefined_rf = RandomForestClassifier(
    n_estimators=random_search.best_params_['n_estimators'],
    max_depth=random_search.best_params_['max_depth'],
    min_samples_split=random_search.best_params_['min_samples_split'],
    min_samples_leaf=random_search.best_params_['min_samples_leaf'],
    random_state=0
)

cv_scores = cross_val_score(predefined_rf, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Mean Accuracy:", np.mean(cv_scores))
print("Cross-Validation Std Dev:", np.std(cv_scores))

Accuracy: 0.86875
Classification Report:
               precision    recall  f1-score   support

    Decrease       0.69      0.50      0.58        58
    Increase       0.90      0.95      0.92       262

    accuracy                           0.87       320
   macro avg       0.79      0.73      0.75       320
weighted avg       0.86      0.87      0.86       320

Confusion Matrix:
 [[ 29  29]
 [ 13 249]]
Feature Importances:
         Feature  Importance
0   hour_of_day    0.278694
6          temp    0.166795
8      humidity    0.149760
7           dew    0.085329
12    windspeed    0.077089
13   cloudcover    0.068939
2         month    0.047427
1   day_of_week    0.040369
5    summertime    0.030912
14   visibility    0.027654
9        precip    0.011455
4       weekday    0.010688
3       holiday    0.003506
11    snowdepth    0.001383
10         snow    0.000000
Best Parameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None}
Best Cross-Vali

In [25]:
# Step 10: Model Predictions
import os
import numpy as np
import pandas as pd

# Load and preprocess test_data
#test_file_path = "D:/Data Science and Data Engineering/Semester 1/Period 2/Statistical Machine Learning/SML_Project/test_data_fall2024.csv"
test_file = "test_data_fall2024.csv"
test_data = pd.read_csv(test_file)

# Debug: Print the shape and columns of test_data
print("Initial test_data shape:", test_data.shape)
print("Initial test_data columns:", test_data.columns)

# Add missing columns to test_data and align with X_train
for col in X_train.columns:
    if col not in test_data.columns:
        test_data[col] = 0  

# Align test_data columns with X_train
test_data = test_data[X_train.columns]

# Debug: Verify the alignment of test_data
print("Aligned test_data shape:", test_data.shape)
print("Aligned test_data columns:", test_data.columns)

# Train the model if not already trained
assert hasattr(predefined_rf, 'fit'), "predefined_rf is not initialized!"
predefined_rf.fit(X_train, y_train)

# Generate predictions using the aligned test_data
final_predictions = predefined_rf.predict(test_data[:400])
print("Generated predictions:", final_predictions[:10])  

# Compare predictions with the provided sequence
provided_sequence = [...]  
mismatches = [
    (index, provided, predicted)
    for index, (provided, predicted) in enumerate(zip(provided_sequence, final_predictions))
    if provided != predicted
]
print(f"Number of mismatches: {len(mismatches)}")
if len(mismatches) > 0:
    print("First 10 mismatches:", mismatches[:10])

# Save predictions as a single row of comma-separated values
output_path = "predictions.csv"
#output_path = "D:/Data Science and Data Engineering/Semester 1/Period 2/Statistical Machine Learning/SML_Project/predictions.csv"
with open(output_path, 'w') as f:
    f.write(','.join(map(str, final_predictions)))
print(f"Predictions saved to '{output_path}'")

Initial test_data shape: (400, 15)
Initial test_data columns: Index(['hour_of_day', 'day_of_week', 'month', 'holiday', 'weekday',
       'summertime', 'temp', 'dew', 'humidity', 'precip', 'snow', 'snowdepth',
       'windspeed', 'cloudcover', 'visibility'],
      dtype='object')
Aligned test_data shape: (400, 16)
Aligned test_data columns: Index(['hour_of_day', 'day_of_week', 'month', 'holiday', 'weekday',
       'summertime', 'temp', 'dew', 'humidity', 'precip', 'snow', 'snowdepth',
       'windspeed', 'cloudcover', 'visibility', 'temp_dew_diff'],
      dtype='object')
Generated predictions: [1 1 1 1 1 1 1 1 1 1]
Number of mismatches: 1
First 10 mismatches: [(0, Ellipsis, 1)]
Predictions saved to 'predictions.csv'
