Preparation

In [2]:
# NOTE(review): looks like a leftover scratch cell used to check that the kernel runs; confirm and delete.
print("test")

test


In [None]:
# Libraries are imported in the order in which they are used in the workflow.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split


from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.model_selection import cross_val_score

In [None]:
# Read the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
DATA_FILE = "A1_Data.xlsx"
df = pd.read_excel(DATA_FILE)

In [None]:
# Dimensions of the loaded DataFrame as (number of rows, number of columns)
df.shape

In [None]:
# Inspect the dtype pandas inferred for every column.
# The result is kept in `column_types` so it can be referred to again later.
column_types = df.dtypes
print(column_types)

In [None]:
def identify_data(df):
    """Print data-quality summaries for ``df`` and show a boxplot of it.

    Printed, in this order:
      1. the number of missing values per column,
      2. the number of duplicated rows,
      3. the number of unique values per column,
      4. the ``describe()`` summary statistics.

    Afterwards a 16x6 inch boxplot is drawn via ``df.boxplot()`` with the
    x-axis labels rotated by 45 degrees.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect. It is only read, never modified.

    Returns
    -------
    None
    """
    # (heading, computation) pairs; each is evaluated and printed in turn
    summaries = (
        ('Missing values in the DataFrame:\n',
         lambda frame: frame.isnull().sum()),
        ('Number of duplicated rows in the DataFrame:\n',
         lambda frame: frame.duplicated().sum()),
        ('Number of unique values in each column of the DataFrame:\n',
         lambda frame: frame.nunique()),
    )
    for heading, compute in summaries:
        print(heading, compute(df), '\n')

    # The min/max rows of describe() are where out-of-bound values show up
    print(df.describe())

    # Wide figure so that the per-column boxes do not overlap
    plt.figure(figsize=(16, 6))
    df.boxplot()

    # Rotated tick labels keep long column names readable
    plt.xticks(rotation=45)
    plt.show()


identify_data(df)

In [None]:
# Parse the dates and derive a month key used only for grouping.
# The key is a separate Series, so no temporary column has to be added to
# (and later dropped from) `df`.
df['Date'] = pd.to_datetime(df['Date'])
month_key = df['Date'].dt.month.rename('Month')

# Columns for imputation
numerical_columns_to_impute = ['Minimum_Temperature', 'Maximum_Temperature', 'Wind_Speed_AM', 'Wind_Speed_PM',
                               'Rainfall_Amount', 'Max_WindGust_Speed']
categorical_columns_to_impute = ['Wind_Direction_AM', 'Wind_Direction_PM', 'Max_WindGust_Direction', 'Climate', 'Season',
                                 'Significant_Rainfall']


def _most_frequent(values):
    """Return the most frequent non-missing value of ``values``.

    Ties are resolved by taking the first entry of ``Series.mode()``.
    Returns NaN when ``values`` holds no non-missing entry at all.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Impute numerical columns with the median of the same calendar month.
# Only missing entries are touched. A month whose values are all missing has
# a NaN median and is left as-is instead of raising, which is what happened
# when each group was passed through SimpleImputer.
for col in numerical_columns_to_impute:
    monthly_median = df.groupby(month_key)[col].transform('median')
    df[col] = df[col].fillna(monthly_median)

# Impute categorical columns with the most frequent value of the same month.
# The per-month mode is computed once and mapped back through the month key,
# so rows without a valid date keep whatever value they already had.
for col in categorical_columns_to_impute:
    monthly_mode = df.groupby(month_key)[col].agg(_most_frequent)
    df[col] = df[col].fillna(month_key.map(monthly_mode))

Further Data Preparation for Modelling

In [None]:
# Categorical features that are expanded into indicator (dummy) columns
categorical_columns = [
    'City_Name',
    'City_State',
    'Climate',
    'Season',
    'Significant_Rainfall',
    'Wind_Direction_AM',
    'Wind_Direction_PM',
    'Max_WindGust_Direction',
]

# drop_first=True omits the first level of every feature, so each feature
# with k levels contributes k-1 indicator columns
df_encoded = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

# Preview of the encoded frame
print(df_encoded.head())


In [None]:

# This cell was a verbatim copy of the one-hot encoding cell directly above
# and recomputed `df_encoded` a second time. The encoding is done once there;
# here its result is only verified.
not_encoded = [col for col in categorical_columns if col in df_encoded.columns]
assert not not_encoded, f"Columns that were not one-hot encoded: {not_encoded}"

# Print the first few rows of the encoded DataFrame
print(df_encoded.head())


In [None]:
# 'Burn_Tomorrow' is the target; every other column of the encoded frame is a feature
y = df_encoded['Burn_Tomorrow']
X = df_encoded.drop(columns=['Burn_Tomorrow'])


Class Imbalance Handling

In [None]:
# Import the SMOTE class
from imblearn.over_sampling import SMOTE

# Exclude the 'Date' column from X: it is not used as a model feature
X_features = X.drop('Date', axis=1)

# Step 1: Hold out the test set FIRST.
# Scaling and oversampling are fitted on the training portion only. Fitting
# them on the full data would leak information into the test set (statistics
# of test rows in the scaler, synthetic neighbours of training rows in the
# test set) and make the evaluation overly optimistic.
# stratify=y keeps the class proportions identical in both portions.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_features, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: Feature Scaling - fit on the training data, then apply to the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that code referring to `X_scaled` continues to work; it now holds
# all rows transformed with the scaler fitted on the training data.
X_scaled = scaler.transform(X_features)

# Step 3: Apply SMOTE to the training data only, so the test set consists
# exclusively of real observations in their original class balance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

# Print the shape of the resampled data
print("Shape of X_resampled:", X_resampled.shape)
print("Shape of y_resampled:", y_resampled.shape)

# Print the shape of train and test sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Feature Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training data to obtain feature importances.
# The seed is fixed so that the importance ranking, and therefore the set of
# selected features, is identical on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Get feature importances
feature_importances = model.feature_importances_

# Rank feature indices by importance, most important first.
# sorted() is stable, so equally important features keep their column order.
feature_ranking = sorted(range(len(feature_importances)), key=lambda i: feature_importances[i], reverse=True)

# Select top N features (e.g., top 10)
top_n_features = feature_ranking[:10]

# Create a new dataset with the selected features.
# X_train / X_test are indexed positionally here, i.e. they are expected to
# be NumPy arrays whose columns line up with `feature_importances`.
X_train_selected = X_train[:, top_n_features]
X_test_selected = X_test[:, top_n_features]

# Print the shape of train_selected and test_selected sets
print("Shape of X_train_selected:", X_train_selected.shape)
print("Shape of X_test_selected:", X_test_selected.shape)



In [None]:
# Feature names in the same column order as the matrix the model was trained on.
# 'Date' was dropped from X before scaling, so it has to be dropped here as
# well; otherwise the positional indices in `top_n_features` point at the
# wrong column names.
feature_names = X.drop('Date', axis=1).columns.tolist()

# Guard against the names and the training matrix drifting apart
assert len(feature_names) == X_train.shape[1], (
    "feature_names does not match the number of columns in X_train"
)

# Get the names of the top selected features
selected_feature_names = [feature_names[i] for i in top_n_features]

# Print the names of the selected features
print('selected_feature_names:\n', selected_feature_names)


In [None]:
# Training and Evaluation

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Model: a seeded decision tree fitted on the selected features ---
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(X_train_selected, y_train)

# --- 5-fold cross-validation on the training data ---
# cross_val_score fits clones of the estimator, so the fit above is untouched.
# NOTE(review): if y_train contains oversampled rows, these fold scores are
# computed on resampled data - confirm that this is intended.
scores = cross_val_score(estimator=decision_tree_classifier, X=X_train_selected, y=y_train, cv=5)
mean_score, std_score = scores.mean(), scores.std()

print("Cross-validation scores:", scores)
print("Mean accuracy:", mean_score)
print("Standard deviation:", std_score)

# --- Hold-out evaluation on the test split ---
y_pred_dt = decision_tree_classifier.predict(X_test_selected)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
classification_report_str = classification_report(y_true=y_test, y_pred=y_pred_dt)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report_str)


In [None]:
# Confusion matrix of the decision tree on the test split
# (rows: true classes, columns: predicted classes)
confusion_matrix_dt = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)
print("Confusion Matrix:", confusion_matrix_dt, sep="\n")

In [1]:
# NOTE(review): looks like a leftover scratch cell used to check that the kernel runs; confirm and delete.
print("test")

test
