# Forest Fires Prediction

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestCentroid
from sklearn import svm
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

## Imports from PyTorch

In [None]:
# Refer to README.txt for installation steps
import torch
import torch.nn as nn

## Reading train and test data

In [None]:
# Load the train and test splits; the relative paths are resolved against the current working directory
train=pd.read_csv("algerian_fires_train.csv")
test=pd.read_csv("algerian_fires_test.csv")

## Looking at the distribution of dataset

In [None]:
def class_distribution(df):
    '''
    Function to show the class balance of a data frame as a pie chart

    Parameters:
    df: input data frame; its last column is read as the class label
        (rows equal to 1 are counted as fire, rows equal to 0 as no fire)

    Return:
    None, the pie chart is displayed with plt.show()
    '''
    target = df.iloc[:, -1]
    fire_count = len(df[target == 1])
    no_fire_count = len(df[target == 0])

    plt.figure(figsize=(5, 5))
    plt.pie(
        [fire_count, no_fire_count],
        labels=['Class Fire', 'Class No Fire'],
        colors=sns.color_palette('Set2'),
        autopct='%0.0f%%',
    )
    plt.show()

In [None]:
# Plot the fire / no-fire balance of the train split
class_distribution(train)

## Data Preprocessing

In [None]:
def create_features(df):
    '''
    Function to create features from the date column of the data frame

    The input frame is left untouched; a new frame is returned in which
    'Date' is parsed (day/month/year), a numeric 'Month' column is added and
    the weekday name is one-hot encoded into seven columns. The 'Classes'
    column is moved to the last position.

    Parameters:
    df: input data frame for which extra features need to be created
        (must contain the columns 'Date' and 'Classes')

    Return:
    df: new data frame with the extra feature columns
    '''
    # Alphabetical order, i.e. the order get_dummies produces for plain strings
    day_names = ['Friday', 'Monday', 'Saturday', 'Sunday',
                 'Thursday', 'Tuesday', 'Wednesday']

    df = df.copy()  # work on a copy so the caller's frame is not modified
    df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%Y")  # convert to pandas datetime format
    df['Month'] = df['Date'].dt.month  # extract month number

    # Declaring all seven weekdays as categories makes get_dummies emit every
    # weekday column even when a weekday never occurs in this frame, so train
    # and test always end up with the same set of columns.
    df['Day'] = pd.Categorical(df['Date'].dt.day_name(), categories=day_names)
    df = pd.get_dummies(df, columns=['Day'], prefix="", prefix_sep="")  # one hot encoding for the days

    df['Classes'] = df.pop('Classes')  # re-append the label so it is the last column
    return df

In [None]:
# Add the Month and one-hot weekday columns to the train split
train=create_features(train)

In [None]:
# Add the Month and one-hot weekday columns to the test split
test=create_features(test)

In [None]:
def moving_avg(df):
    '''
    Function to create a column with moving average of temperature from the temperature column of the data frame

    The average is taken over a window of 10 rows. Rows that do not yet have
    a full window fall back to their own temperature. The new column is
    placed at position 2 of a deep copy; the input frame is not modified.

    Parameters:
    df: input data frame (must contain a 'Temperature' column; the loop below
        addresses rows by the labels 0..len(df)-1, so a default integer index
        is assumed)

    Return:
    df2: copy of the data frame with the extra 'Rolling_Temp' column
    '''
    df2 = df.copy(deep=True)
    temperature = df2['Temperature']

    # Assign the filled series back instead of calling fillna(inplace=True) on
    # an attribute-accessed column: that chained in-place form is deprecated
    # and does not update the frame when pandas Copy-on-Write is active.
    rolling = temperature.rolling(10).mean()
    df2['Rolling_Temp'] = rolling.fillna(temperature.iloc[0:10])

    # From row 8 onwards every second row takes the value of the row after it.
    # NOTE(review): presumably rows come in pairs sharing a date - confirm.
    for i in range(8, len(df2) - 1, 2):
        df2.loc[i, 'Rolling_Temp'] = df2.loc[i + 1, 'Rolling_Temp']

    # Move the new column from the end to position 2
    df2.insert(2, "Rolling_Temp", df2.pop('Rolling_Temp'))
    return df2

In [None]:
# Copy of the train split with the Rolling_Temp column added
train_mov=moving_avg(train)

In [None]:
# Copy of the test split with the Rolling_Temp column added
test_mov=moving_avg(test)

## Feature Engineering and Dimensionality Adjustment

### Pearson correlation coefficient

In [None]:
def pearson(df):
    '''
    Function to draw an annotated heatmap of the pairwise Pearson correlation between the columns

    Parameters:
    df: input dataframe whose columns are correlated with each other

    Return:
    None, the heatmap is displayed with plt.show()
    '''
    plt.figure(figsize=(14, 14))
    correlations = df.corr(method='pearson')
    sns.heatmap(correlations, annot=True, cmap="YlGnBu", ax=plt.axes())
    plt.show()

In [None]:
# iloc[:,1:-1] skips the first and the last column before computing the correlations
pearson(train_mov.iloc[:,1:-1])# generate the heatmap for all the features except the dates

### PCA

In [None]:
def pca_analysis(df):
    '''
    Function to plot the cumulative explained variance ratio of a PCA fitted on the data

    Parameters:
    df: input data frame the PCA is fitted on

    Return:
    None, a line plot of cumulative explained variance ratio against the
    number of principal components is drawn on a new figure
    '''
    fitted = PCA().fit(df)
    component_count = np.arange(1, fitted.n_components_ + 1)
    cumulative_ratio = fitted.explained_variance_ratio_.cumsum()

    plt.figure(figsize=(6, 6))
    plt.plot(component_count, cumulative_ratio, 'o-', linewidth=2, color='green')
    plt.xlabel('Features (principal components)')
    plt.ylabel('Total Variance')

In [None]:
# Run the PCA variance analysis on every column except the first and the last
pca_analysis(train_mov.iloc[:,1:-1])

In [None]:
def drop_columns(df):
    '''
    Function to remove the columns discarded after the dimensionality reduction analysis

    The columns are removed in place, so the frame passed in is modified.

    Parameters:
    df: input data frame (must contain every column listed below)

    Return:
    the same data frame object, without the listed columns
    '''
    weekday_columns = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
    discarded = ['Date', 'RH', 'Rain', 'Ws'] + weekday_columns
    df.drop(columns=discarded, inplace=True)
    return df

In [None]:
# Remove the discarded columns from the train frame (drop_columns also modifies it in place)
train_mov=drop_columns(train_mov)

In [None]:
# Remove the same columns from the test frame
test_mov=drop_columns(test_mov)

### Standardization

In [None]:
def standardize(train_data,test_data):
    '''
    Function to standardize train and test data with a StandardScaler fitted on the train data only

    Parameters:
    train_data: data the scaler is fitted on and that is then transformed
    test_data: data transformed with the scaler fitted on train_data

    Return:
    transformed train data, transformed test data
    '''
    fitted_scaler = StandardScaler().fit(train_data)
    return fitted_scaler.transform(train_data), fitted_scaler.transform(test_data)

In [None]:
# Preview the first rows of the train frame before standardization
train_mov.head()

In [None]:
# Standardize all columns except the last two and write the result back into the same column slice.
# NOTE(review): the last two columns are presumably Month and Classes - confirm that Month is meant to stay unscaled.
train_mov.iloc[:,0:-2],test_mov.iloc[:,0:-2]=standardize(train_mov.iloc[:,0:-2],test_mov.iloc[:,0:-2])

#### Eliminating the last 8 points from the train data to avoid past data influencing future data

In [None]:
# .loc slicing is label based and includes the end label, so this keeps labels 0 .. n-9,
# i.e. it drops the last 8 rows (assumes the default 0..n-1 integer index)
train_mov = train_mov.loc[0:train_mov.shape[0]-9,:] 

## Plot for distribution of the features

In [None]:
def plot_feature_distribution(df):
    '''
    Function to draw the distribution of each feature on a 2 x 4 grid of subplots

    Temperature, Rolling_Temp, FFMC, DMC, DC, ISI and BUI are drawn as density
    histograms with a KDE curve; Month is drawn as a plain count histogram in
    the last cell. The x axis of the ISI plot is limited to (-2, 3.5).

    Parameters:
    df: input data frame containing the eight feature columns named above

    Return:
    None, the figure is displayed with plt.show()
    '''
    _, axes = plt.subplots(2, 4, figsize=(22, 12))

    density_style = dict(color='purple', kde=True, stat="density", linewidth=0)
    density_features = ['Temperature', 'Rolling_Temp', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI']

    # axes.flat walks the grid row by row, so the features fill the first seven cells in order
    for feature, cell in zip(density_features, axes.flat):
        drawn = sns.histplot(df[feature], ax=cell, **density_style)
        if feature == 'ISI':
            drawn.set_xlim(-2, 3.5)

    sns.histplot(df['Month'], ax=axes[1, 3], color='purple', linewidth=0)
    plt.show()

In [None]:
# Plot the feature distributions of the prepared train frame
plot_feature_distribution(train_mov)


#### Convert train and test to numpy arrays

In [None]:
# Array versions of both frames; the model functions below read the last column as the class label
train_mov_np=train_mov.to_numpy()
test_mov_np=test_mov.to_numpy()

In [None]:
def performance_measure(output, target):
    '''Function to compute the accuracy and F1 score of a set of predictions

    Parameters:
    output: Predicted class labels
    target: Actual class labels

    Return
    Tuple (accuracy, F1 score), both computed with target as the true labels
    '''

    return accuracy_score(target, output), f1_score(target, output)

In [None]:
def plot_confusion_matrix(target, output):
    '''
    Function to draw the confusion matrix of a set of predictions as an annotated heatmap

    Parameters:
    target: Actual class labels (rows of the matrix)
    output: Predicted class labels (columns of the matrix)

    Return:
    None, the heatmap is displayed with plt.show()
    '''
    heatmap_axes = sns.heatmap(confusion_matrix(target, output), annot=True, cmap="PiYG")
    heatmap_axes.set(ylabel="True Label", xlabel="Predicted Label")
    plt.show()

In [None]:
def round_off_values(x):
    '''Function to round a value to three decimal places with the built-in round'''
    decimals = 3
    return round(x, decimals)

In [None]:
def print_acc_val(train_acc,train_f1,val_acc,val_f1):
    '''
    Function to print the mean of each cross validation metric, rounded with round_off_values

    Parameters:
    train_acc: train accuracies, one per partition
    train_f1: train F1 scores, one per partition
    val_acc: validation accuracies, one per partition
    val_f1: validation F1 scores, one per partition

    Return:
    None
    '''
    report = (
        ("Cross validation train accuracy is", train_acc),
        ("Cross validation train F1 is", train_f1),
        ("Cross validation validation accuracy is", val_acc),
        ("Cross validation validation F1 is", val_f1),
    )
    for label, scores in report:
        print(label, round_off_values(sum(scores)/len(scores)))

## Trivial Model

In [None]:
def trivial_model_train(dataset, total_batches):
    '''
    Function to train the trivial model on the train dataset

    The trivial model ignores the features and draws each predicted label at
    random according to the class frequencies of the training labels.

    Parameters:
    dataset: Dataset to train on (2-D array, class label in the last column)
    total_batches: Number of equal-sized batches; the last one is held out for validation

    Return:
    p1: Fraction of training labels equal to 0 (class S1)
    p2: Fraction of training labels equal to 1 (class S2)
    '''
    c = total_batches-1
    size = int(len(dataset)/total_batches)

    # Hold out batch number total_batches-1 as validation data, train on the rest
    val_data = dataset[c*size:(c+1)*size,:]
    train_data = np.delete(dataset,slice(c*size,(c+1)*size,1),axis=0)
    train_data = train_data[0:len(train_data)-8] #deleting past data that could influence performance on val data

    y_train = train_data[:,-1]
    y_val = val_data[:,-1]

    N1 = (y_train == 0).sum() #total number of points belonging to class S1
    N2 = (y_train == 1).sum() #total number of points belonging to class S2
    N = N1+N2
    p1 = N1/N #probability of point belonging to class S1 (label 0)
    p2 = N2/N #probability of point belonging to class S2 (label 1)

    # binomial(1, p) returns 1 with probability p, so the probability of
    # label 1 (p2) has to be passed; passing p1 would swap the class priors.
    y_pred = np.random.binomial(1, p2, len(y_train))
    train_accuracy, train_f1 = performance_measure(y_pred, y_train)

    print("Train accuracy is", round_off_values(train_accuracy))
    print("Train F1 score is", round_off_values(train_f1))

    y_pred = np.random.binomial(1, p2, len(y_val)) #same random labelling for the validation dataset
    val_accuracy, val_f1 = performance_measure(y_pred, y_val)

    print("Validation accuracy is", round_off_values(val_accuracy))
    print("Validation F1 score is",round_off_values( val_f1))
    return p1,p2

In [None]:
def trivial_model_test(p1, y):
    '''
    Function to test the trivial model on the test dataset

    Parameters:
    p1: Probability of class S1 (label 0), as returned by trivial_model_train
    y: Target class labels in the test dataset

    Return:
    none (accuracy and F1 score are printed, the confusion matrix is plotted)
    '''

    y = y.astype(int)
    len_test_data = y.shape[0]

    # binomial(1, p) returns 1 with probability p. p1 is the probability of
    # label 0, so label 1 has to be drawn with probability 1 - p1; passing p1
    # directly would swap the class priors.
    y_pred = np.random.binomial(1, 1 - p1, len_test_data)

    test_accuracy, test_f1 = performance_measure(y_pred, y)
    print("Test accuracy is", round_off_values(test_accuracy))
    print("Test F1 score is", round_off_values(test_f1))
    plot_confusion_matrix(y, y_pred)

In [None]:
# Run the trivial model: estimate the class probabilities with 6 batches (the last one is held out),
# then score random predictions against the label column of the test array
p1, p2 = trivial_model_train(train_mov_np, 6)
trivial_model_test(p1, test_mov_np[:,-1])

## Baseline model

In [None]:
def nearest_means_classifier_model_train(dataset, total_batches):
    '''Function to fit the baseline (nearest centroid) model and report train/validation scores

    Parameters:
    dataset: Dataset to train on (2-D array, class label in the last column)
    total_batches: Number of equal-sized batches; the last one is held out for validation

    Return:
    baseline_model: NearestCentroid model fitted on the training part
    '''

    last_batch = total_batches - 1
    batch_len = int(len(dataset) / total_batches)
    held_out = slice(last_batch * batch_len, (last_batch + 1) * batch_len, 1)

    # The last batch is the validation data; the remaining rows, minus their final 8, are the train data
    val_data = dataset[held_out, :]
    train_data = np.delete(dataset, held_out, axis=0)
    train_data = train_data[:len(train_data) - 8]

    X_train, y_train = train_data[:, 0:-1], train_data[:, -1]
    X_val, y_val = val_data[:, 0:-1], val_data[:, -1]

    baseline_model = NearestCentroid()
    baseline_model.fit(X_train, y_train)

    # Score on the rows the model was fitted on
    train_accuracy, train_f1 = performance_measure(baseline_model.predict(X_train), y_train)
    print("Train accuracy is", round_off_values(train_accuracy))
    print("Train F1 score is", round_off_values(train_f1))

    # Score on the held-out batch
    val_accuracy, val_f1 = performance_measure(baseline_model.predict(X_val), y_val)
    print("Validation accuracy is", round_off_values(val_accuracy))
    print("Validation F1 score is", round_off_values(val_f1))

    return baseline_model

In [None]:
def nearest_means_classifier_model_test(baseline_model, test_data):
    '''Function to evaluate the fitted baseline model on the test dataset

    Parameters:
    baseline_model: Model fitted onto the train data
    test_data: Dataset to test on (2-D array, class label in the last column)

    Return:
    None (accuracy and F1 score are printed, the confusion matrix is plotted)
    '''

    features = test_data[:, 0:-1]
    labels = test_data[:, -1].astype(int)

    predictions = baseline_model.predict(features)

    test_accuracy, test_f1 = performance_measure(predictions, labels)
    print("Test Accuracy is", round_off_values(test_accuracy))
    print("Test F1 score is", round_off_values(test_f1))
    print('Confusion Matrix for Test Set')
    plot_confusion_matrix(labels, predictions)

In [None]:
# Run the baseline model: fit the nearest centroid classifier with 6 batches (the last one is held out),
# then evaluate the fitted model on the test array
baseline_train_model = nearest_means_classifier_model_train(train_mov_np, 6)
nearest_means_classifier_model_test(baseline_train_model, test_mov_np)

## Cross Validation

In [None]:
def cross_validation(c, dataset, batch_size, flag, weights):
    '''
    Function to split the train dataset into a train and a validation partition for one cross validation round

    Rows c*size .. (c+1)*size-1 (size = int(batch_size)) form the validation
    partition. 8 rows are removed next to the boundary between the two
    partitions; which side is trimmed depends on the branch taken below.

    Parameters:
    c: Index of the batch used as validation data
    dataset: Entire train dataset (2-D array)
    batch_size: Number of rows per batch (truncated to an integer)
    flag : True to also partition the sample weights, False otherwise
    weights: Per-row sample weights, only read when flag is True

    Return:
    train_data: Train data partition
    val_data: Val data partition
    weight_data: Weights of the train partition (only returned when flag is not False)
    '''

    n_rows = len(dataset)
    width = int(batch_size)
    lo, hi = c * width, (c + 1) * width
    fold = slice(lo, hi, 1)
    with_weights = flag == True

    val_data = dataset[lo:hi, :]  # candidate validation rows

    # NOTE(review): `c == width - 1` compares the batch index with the batch
    # size; it looks like it was meant to detect the last batch - confirm.
    if c == 0 or c == width - 1:
        train_data = np.delete(dataset, fold, axis=0)  # everything except the validation rows
        if with_weights:
            weight_data = np.delete(weights, fold, axis=0)
        if c == 0:
            # first batch: drop the last 8 validation rows
            val_data = val_data[:len(val_data) - 8]
        else:
            # drop the last 8 train rows (and their weights)
            train_data = train_data[:len(train_data) - 8]
            if with_weights:
                weight_data = weight_data[:len(weight_data) - 8]
    else:
        # drop the last 8 validation rows and the 8 train rows just before the validation block
        val_data = val_data[:len(val_data) - 8]
        train_data = np.concatenate((dataset[:lo - 8, :], dataset[hi:n_rows, :]), axis=0)
        if with_weights:
            weight_data = np.concatenate((weights[:lo - 8], weights[hi:n_rows]), axis=0)

    if flag == False:
        return train_data, val_data
    return train_data, val_data, weight_data

In [None]:
def model_test(model, test_data):
    '''
    Function to evaluate a fitted ML model on the test dataset

    Parameters:
    model: Model fitted onto the train data (must provide predict)
    test_data: Dataset to test on (2-D array, class label in the last column)

    Return:
    None (accuracy and F1 score are printed, the confusion matrix is plotted)
    '''
    features = test_data[:, 0:-1]
    labels = test_data[:, -1].astype(int)

    predictions = model.predict(features)

    test_accuracy, test_f1 = performance_measure(predictions, labels)
    print("Test accuracy is", round_off_values(test_accuracy))
    print("Test f1 score is", round_off_values(test_f1))
    plot_confusion_matrix(labels, predictions)

## SVM

In [None]:
def hyperparameters_opt_svm(X_train, y_train, X_val, y_val,range_C, range_kernel, range_gamma):
    '''
    Function to score every SVM hyperparameter combination on one train/validation partition

    Parameters:
    X_train: Features of train dataset from the given partition
    y_train: Class labels of train dataset from the given partition
    X_val: Features of validation dataset from the given partition
    y_val: Class labels of validation dataset from the given partition
    range_C: List of values of the regularization parameter
    range_kernel: List of values of the kernel parameter
    range_gamma: List of values of the gamma parameter

    Return:
    score_list: Validation F1 score per (C, kernel, gamma) combination
    train_accuracy: Train accuracy averaged over all combinations
    train_f1: Train F1 score averaged over all combinations
    val_accuracy: Validation accuracy averaged over all combinations
    val_f1: Validation F1 score averaged over all combinations
    '''

    X_train, y_train = X_train.astype(float), y_train.astype(float)
    X_val, y_val = X_val.astype(float), y_val.astype(float)

    # One entry per hyperparameter combination
    train_acc, train_f_1, val_acc, val_f_1 = [], [], [], []

    score_list = np.zeros((len(range_C), len(range_kernel), len(range_gamma)))

    # np.ndindex visits the grid with the last index changing fastest, like three nested loops
    for i, j, k in np.ndindex(score_list.shape):
        model = svm.SVC(kernel=range_kernel[j], C=range_C[i], gamma=range_gamma[k], class_weight='balanced')
        model.fit(X_train, y_train)

        fit_accuracy, fit_f1 = performance_measure(model.predict(X_train), y_train)
        train_acc.append(fit_accuracy)
        train_f_1.append(fit_f1)

        held_out_accuracy, held_out_f1 = performance_measure(model.predict(X_val), y_val)
        val_acc.append(held_out_accuracy)
        val_f_1.append(held_out_f1)

        score_list[i][j][k] = held_out_f1  # validation F1 of this combination

    # Mean performance over all combinations for this partition
    return (
        score_list,
        sum(train_acc)/len(train_acc),
        sum(train_f_1)/len(train_f_1),
        sum(val_acc)/len(val_acc),
        sum(val_f_1)/len(val_f_1),
    )

In [None]:
def svm_model_train(dataset, total_batches, weights = 0):
    '''
    Function to select SVM hyperparameters by cross validation and fit the final model

    Parameters:
    dataset: input dataset for training (2-D array, class label in the last column)
    total_batches: Number of batches to determine division of train/val data
    weights: not used by this function

    Return:
    optimal_model: SVM Model with the selected hyperparameters, fitted on the entire dataset
    '''

    # Per-partition means of the train/val performance measures
    train_acc, train_f1, val_acc, val_f1 = [], [], [], []

    # Range of values of hyperparameters for SVM
    range_C = np.array([0.01, 0.5, 0.1, 1, 2, 5, 10, 50, 100, 500, 1000])
    range_kernel = np.array(['linear', 'poly','rbf', 'sigmoid'])
    range_gamma = np.array([0.001, 0.1, 0.2, 2, 10])
    grid_shape = (len(range_C), len(range_kernel), len(range_gamma))

    score_matrix = np.zeros((total_batches,) + grid_shape)  # F1 per batch and combination
    final_score = np.zeros(grid_shape)  # F1 per combination, averaged over batches

    rows_per_batch = len(dataset)/total_batches
    for batch in range(total_batches):
        train_data, val_data = cross_validation(batch, dataset, rows_per_batch, False, 0)

        batch_scores, t_acc, t_f1, v_acc, v_f1 = hyperparameters_opt_svm(
            train_data[:,0:-1], train_data[:,-1],
            val_data[:,0:-1], val_data[:,-1],
            range_C, range_kernel, range_gamma)
        score_matrix[batch] = batch_scores

        train_acc.append(t_acc)
        train_f1.append(t_f1)
        val_acc.append(v_acc)
        val_f1.append(v_f1)

    # Average each combination over the data partitions
    for cell in np.ndindex(grid_shape):
        final_score[cell] = np.mean(score_matrix[(slice(None),) + cell])

    print_acc_val(train_acc,train_f1,val_acc,val_f1)

    c_pos, kernel_pos, gamma_pos = np.unravel_index(final_score.argmax(), final_score.shape)
    optimal_C = range_C[c_pos]
    optimal_kernel = range_kernel[kernel_pos]
    optimal_gamma = range_gamma[gamma_pos]

    print("optimal_C:",optimal_C)
    print("optimal kernel:",optimal_kernel)
    print("optimal_gamma:",optimal_gamma)

    optimal_model = svm.SVC(kernel=optimal_kernel, C=optimal_C, gamma=optimal_gamma, class_weight = 'balanced')

    # Refit on the entire dataset and report the resulting train scores
    training_dataset = dataset[:,0:-1].astype(float)
    class_label = dataset[:,-1].astype(float)

    optimal_model.fit(training_dataset,class_label)
    final_train_accuracy, final_train_f1 = performance_measure(optimal_model.predict(training_dataset), class_label)
    print("Final Train accuracy is", round_off_values(final_train_accuracy))
    print("Final Train f1 is", round_off_values(final_train_f1))

    return optimal_model

In [None]:
# Run the SVM model: select hyperparameters with 6 cross validation batches, refit on the
# whole train array, then evaluate the fitted model on the test array

optimal_svm_model=svm_model_train(train_mov_np, 6)
model_test(optimal_svm_model,test_mov_np)

## Naive Bayes

In [None]:
def hyperparameters_opt_NB(X_train, y_train, X_val, y_val, range_alpha, weight):
    '''
    Function to score every Naive Bayes hyperparameter value on one train/validation partition

    ComplementNB is fitted on the absolute values of the features, so the
    same transformation is applied to the features passed to predict.

    Parameters:
    X_train: Features of train dataset from the given partition
    y_train: Class labels of train dataset from the given partition
    X_val: Features of validation dataset from the given partition
    y_val: Class labels of validation dataset from the given partition
    range_alpha: List of values of the smoothing parameter alpha
    weight: List of weights for each train sample

    Return:
    score_list: Validation F1 score for each value of alpha
    train_accuracy: Train accuracy averaged over all alpha values
    train_f1: Train F1 score averaged over all alpha values
    val_accuracy: Validation accuracy averaged over all alpha values
    val_f1: Validation F1 score averaged over all alpha values
    '''

    y_train = y_train.astype(float)
    y_val = y_val.astype(float)

    # Transform once, and identically for fit and predict. The original code
    # fitted on |X| but predicted on the raw features, so the model was
    # scored on inputs that differ from what it was trained on.
    X_train_abs = np.absolute(X_train.astype(float))
    X_val_abs = np.absolute(X_val.astype(float))

    score_list = np.zeros(len(range_alpha)) #F1 score for each alpha

    #Lists used to determine train/val performance measures during cross validation
    train_acc = []
    train_f_1 = []
    val_acc = []
    val_f_1 = []

    for i, alpha_val in enumerate(range_alpha):

        model = ComplementNB(alpha = alpha_val)
        model.fit(X_train_abs, y_train, sample_weight=weight)

        train_accuracy, train_f1 = performance_measure(model.predict(X_train_abs), y_train)
        train_acc.append(train_accuracy)
        train_f_1.append(train_f1)

        val_accuracy, val_f1 = performance_measure(model.predict(X_val_abs), y_val)
        val_acc.append(val_accuracy)
        val_f_1.append(val_f1)

        score_list[i] = val_f1 #validation F1 score for this alpha

    #Obtaining the mean performance metrics for the data partition
    train_accuracy = sum(train_acc)/len(train_acc)
    train_f1 = sum(train_f_1)/len(train_f_1)
    val_accuracy = sum(val_acc)/len(val_acc)
    val_f1 = sum(val_f_1)/len(val_f_1)

    return score_list, train_accuracy, train_f1, val_accuracy, val_f1

In [None]:
def NB_model_train(dataset, total_batches):
    '''
    Function to select the Naive Bayes smoothing parameter by cross validation and fit the final model

    Parameters:
    dataset: Dataset to train on (2-D array, class label in the last column)
    total_batches: Number of batches to determine division of train/val data

    Return:
    optimal_model: ComplementNB model with the selected alpha, fitted on the
                   absolute values of the features of the entire dataset
    '''

    #Weights that are inversely proportional to the class size.
    #They are derived from the dataset argument itself rather than from a
    #global array and hard-coded class counts, so the function also works
    #for other datasets. Rows whose label is neither 0 nor 1 keep weight 0.
    labels = dataset[:, -1]
    weights = np.zeros(len(dataset))
    for label in (0, 1):
        members = labels == label
        if members.any(): #skip absent classes to avoid dividing by zero
            weights[members] = 1 / members.sum()

    #Lists used to determine train/val performance measures during cross validation
    train_acc = []
    train_f1 = []
    val_acc = []
    val_f1 = []

    #Range of values of hyperparameters for Naive Bayes
    range_alpha = np.array([0.005, 0.01, 0.1, 0.5, 1.0, 10.0])

    score_matrix = np.zeros((total_batches,len(range_alpha))) #F1 score across hyperparameters and batches
    final_score = np.zeros(len(range_alpha)) #F1 score across hyperparameters, averaged across batches

    n = len(dataset)/total_batches
    for count in range(total_batches):
        train_data, val_data, w = cross_validation(count, dataset, n, True, weights)

        X_train = train_data[:,0:-1]
        y_train = train_data[:,-1]
        X_val = val_data[:,0:-1]
        y_val = val_data[:,-1]

        score_matrix[count], t_acc, t_f1, v_acc, v_f1 = hyperparameters_opt_NB(X_train, y_train, X_val, y_val, range_alpha, w)

        train_acc.append(t_acc)
        train_f1.append(t_f1)
        val_acc.append(v_acc)
        val_f1.append(v_f1)

    for i in range(len(range_alpha)):
        final_score[i] = np.mean(score_matrix[:,i]) #Calculating the average across data partitions

    print_acc_val(train_acc,train_f1,val_acc,val_f1)

    max_index = np.unravel_index(final_score.argmax(), final_score.shape)
    optimal_alpha = range_alpha[max_index[0]]

    print("Optimal Alpha is", optimal_alpha)

    optimal_model = ComplementNB(alpha = optimal_alpha)

    #The model is fitted on absolute feature values, so the train score is
    #computed on the same transformed features.
    #NOTE(review): callers that predict with the returned model should also
    #pass np.absolute(features) - confirm at the call sites.
    training_dataset = np.absolute(dataset[:,0:-1].astype(float))
    class_label = dataset[:,-1].astype(float)

    optimal_model.fit(training_dataset, class_label, sample_weight=weights)
    y_pred = optimal_model.predict(training_dataset) #predictions on the entire train data
    final_train_accuracy, final_train_f1 = performance_measure(y_pred, class_label)
    print("Final Train Accuracy is", round_off_values(final_train_accuracy))
    print("Final Train F1 Score is", round_off_values(final_train_f1))

    return optimal_model

In [None]:
# Run the Naive Bayes model: select alpha with 6 cross validation batches, refit on the
# whole train array, then evaluate the fitted model on the test array

optimal_nb_model = NB_model_train(train_mov_np,6)        
model_test(optimal_nb_model,test_mov_np)

## Logistic Regression

In [None]:
def hyperparameters_opt_log_reg(X_train, y_train, X_val, y_val, range_C, range_solver):
    '''
    Function to score every Logistic Regression hyperparameter combination on one train/validation partition

    Parameters:
    X_train: Features of train dataset from the given partition
    y_train: Class labels of train dataset from the given partition
    X_val: Features of validation dataset from the given partition
    y_val: Class labels of validation dataset from the given partition
    range_C: List of values of the regularization parameter
    range_solver: List of values of algorithm to use in optimization

    Return:
    score_list: Validation F1 score per (C, solver) combination
    train_accuracy: Train accuracy averaged over all combinations
    train_f1: Train F1 score averaged over all combinations
    val_accuracy: Validation accuracy averaged over all combinations
    val_f1: Validation F1 score averaged over all combinations
    '''

    X_train, y_train = X_train.astype(float), y_train.astype(float)
    X_val, y_val = X_val.astype(float), y_val.astype(float)

    # One entry per hyperparameter combination
    train_acc, train_f_1, val_acc, val_f_1 = [], [], [], []

    score_list = np.zeros((len(range_C), len(range_solver)))

    # np.ndindex visits the grid with the solver index changing fastest, like two nested loops
    for i, j in np.ndindex(score_list.shape):
        model = LogisticRegression(C=range_C[i], solver=range_solver[j], max_iter=200, penalty='l2', class_weight='balanced')
        model.fit(X_train, y_train)

        fit_accuracy, fit_f1 = performance_measure(model.predict(X_train), y_train)
        train_acc.append(fit_accuracy)
        train_f_1.append(fit_f1)

        held_out_accuracy, held_out_f1 = performance_measure(model.predict(X_val), y_val)
        val_acc.append(held_out_accuracy)
        val_f_1.append(held_out_f1)

        score_list[i][j] = held_out_f1  # validation F1 of this combination

    # Mean performance over all combinations for this partition
    return (
        score_list,
        sum(train_acc)/len(train_acc),
        sum(train_f_1)/len(train_f_1),
        sum(val_acc)/len(val_acc),
        sum(val_f_1)/len(val_f_1),
    )

In [None]:
def log_reg_model_train(dataset,total_batches):
    '''
    Function to select Logistic Regression hyperparameters by cross validation and fit the final model

    Parameters:
    dataset: Dataset to train on (2-D array, class label in the last column)
    total_batches: Number of batches to determine division of train/val data

    Return:
    optimal_model: Logistic Regression Model with the selected hyperparameters, fitted on the entire dataset
    '''

    # Per-partition means of the train/val performance measures
    train_acc, train_f1, val_acc, val_f1 = [], [], [], []

    # Range of values of hyperparameters for Logistic Regression
    range_C = np.array([100, 10, 1.0, 0.1, 0.01])
    range_solver = np.array(['newton-cg','lbfgs','liblinear'])
    grid_shape = (len(range_C), len(range_solver))

    score_matrix = np.zeros((total_batches,) + grid_shape)  # F1 per batch and combination
    final_score = np.zeros(grid_shape)  # F1 per combination, averaged over batches

    rows_per_batch = len(dataset)/total_batches
    for batch in range(total_batches):
        train_data, val_data = cross_validation(batch, dataset, rows_per_batch, False, 0)

        batch_scores, t_acc, t_f1, v_acc, v_f1 = hyperparameters_opt_log_reg(
            train_data[:,0:-1], train_data[:,-1],
            val_data[:,0:-1], val_data[:,-1],
            range_C, range_solver)
        score_matrix[batch] = batch_scores

        train_acc.append(t_acc)
        train_f1.append(t_f1)
        val_acc.append(v_acc)
        val_f1.append(v_f1)

    # Average each combination over the data partitions
    for i, j in np.ndindex(grid_shape):
        final_score[i][j] = np.mean(score_matrix[:,i,j])

    print_acc_val(train_acc,train_f1,val_acc,val_f1)

    c_pos, solver_pos = np.unravel_index(final_score.argmax(), final_score.shape)
    optimal_C = range_C[c_pos]
    optimal_solver = range_solver[solver_pos]

    print("Optimal value of C is",optimal_C)
    print("Optimal solver is",optimal_solver)

    optimal_model = LogisticRegression(C=optimal_C,solver=optimal_solver, max_iter=200, penalty = 'l2', class_weight = 'balanced')

    # Refit on the entire dataset and report the resulting train scores
    training_dataset = dataset[:,0:-1].astype(float)
    class_label = dataset[:,-1].astype(float)

    optimal_model.fit(training_dataset,class_label )
    final_train_accuracy, final_train_f1 = performance_measure(optimal_model.predict(training_dataset), class_label)
    print("Final Train Accuracy is", round_off_values(final_train_accuracy))
    print("Final Train F1 Score is", round_off_values(final_train_f1))

    return optimal_model

In [None]:
# Run the Logistic Regression model: select hyperparameters with 7 cross validation batches
# (the SVM and Naive Bayes runs above use 6), then evaluate the fitted model on the test array
optimal_log_reg_model = log_reg_model_train(train_mov_np,7)
model_test(optimal_log_reg_model, test_mov_np)

## Decision Tree

In [None]:
def hyperparameters_opt_dec_tree(X_train, y_train, X_val, y_val, range_min_samples_leaf, range_sample_split):
    '''
    Function to grid-search the Decision Tree hyperparameters on a single train/validation partition

    Parameters:
    X_train: Features of train dataset from the given partition
    y_train: Class labels of train dataset from the given partition
    X_val: Features of validation dataset from the given partition
    y_val: Class labels of validation dataset from the given partition
    range_min_samples_leaf: Candidate values for the minimum number of samples at a leaf node
    range_sample_split: Candidate values for the minimum number of samples required to split an internal node

    Return:
    score_list: Validation F1 score for each (min_samples_leaf, min_samples_split) combination
    train_accuracy: Train accuracy averaged over all combinations
    train_f1: Train F1 score averaged over all combinations
    val_accuracy: Validation accuracy averaged over all combinations
    val_f1: Validation F1 score averaged over all combinations
    '''

    # Cast every array to float before handing it to the classifier
    X_train, y_train, X_val, y_val = (
        arr.astype(float) for arr in (X_train, y_train, X_val, y_val)
    )

    def average(values):
        # Plain arithmetic mean of a list of per-combination metrics
        return sum(values)/len(values)

    # One entry per hyperparameter combination, in visiting order
    fit_acc_hist, fit_f1_hist = [], []
    held_acc_hist, held_f1_hist = [], []

    # Rows follow range_min_samples_leaf, columns follow range_sample_split
    score_list = np.zeros((len(range_min_samples_leaf), len(range_sample_split)))

    for row, leaf in enumerate(range_min_samples_leaf):
        for col, sample_split in enumerate(range_sample_split):
            clf = tree.DecisionTreeClassifier(
                criterion='gini',
                class_weight='balanced',
                min_samples_split=sample_split,
                min_samples_leaf=leaf,
            )
            clf.fit(X_train, y_train)  # fit the Decision Tree on the train split

            # Metrics on the data the tree was fitted on
            fit_acc, fit_f1 = performance_measure(clf.predict(X_train), y_train)
            fit_acc_hist.append(fit_acc)
            fit_f1_hist.append(fit_f1)

            # Metrics on the held-out validation split
            val_pred = clf.predict(X_val)
            held_acc, held_f1 = performance_measure(val_pred, y_val)
            held_acc_hist.append(held_acc)
            held_f1_hist.append(held_f1)

            # Validation F1 is the selection criterion for this combination
            score_list[row, col] = f1_score(y_val, val_pred)

    # Mean performance over every combination tried on this partition
    return (
        score_list,
        average(fit_acc_hist),
        average(fit_f1_hist),
        average(held_acc_hist),
        average(held_f1_hist),
    )

In [None]:
def decision_tree_model_train(dataset,total_batches):
    '''
    Function to select Decision Tree hyperparameters by cross validation and fit the final tree

    Parameters:
    dataset: Array whose last column is the class label and whose other columns are the features
    total_batches: Number of partitions used to split the data into train/val folds

    Return:
    optimal_model: Decision Tree fitted on the whole dataset with the best-scoring hyperparameters
    '''

    # Hyperparameter grid for the Decision Tree
    leaf_grid = np.array([2, 3, 4, 5, 6, 7])
    split_grid = np.array([2,3,4,5,6,7, 8, 9])
    grid_shape = (len(leaf_grid), len(split_grid))

    fold_scores = np.zeros((total_batches,) + grid_shape)  # validation F1 per fold and combination
    mean_scores = np.zeros(grid_shape)  # validation F1 per combination, averaged over folds

    # Per-fold train/val metrics gathered during cross validation
    fold_train_acc, fold_train_f1 = [], []
    fold_val_acc, fold_val_f1 = [], []

    for fold in range(total_batches):
        fold_size = len(dataset)/total_batches
        train_part, val_part = cross_validation(fold, dataset, fold_size, False, 0)

        outcome = hyperparameters_opt_dec_tree(
            train_part[:, :-1],
            train_part[:, -1],
            val_part[:, :-1],
            val_part[:, -1],
            leaf_grid,
            split_grid,
        )
        fold_scores[fold], fold_t_acc, fold_t_f1, fold_v_acc, fold_v_f1 = outcome

        fold_train_acc.append(fold_t_acc)
        fold_train_f1.append(fold_t_f1)
        fold_val_acc.append(fold_v_acc)
        fold_val_f1.append(fold_v_f1)

    # Average each grid cell across the data partitions
    for leaf_idx, split_idx in np.ndindex(grid_shape):
        mean_scores[leaf_idx, split_idx] = np.mean(fold_scores[:, leaf_idx, split_idx])

    print_acc_val(fold_train_acc, fold_train_f1, fold_val_acc, fold_val_f1)

    # Locate the grid cell with the highest mean validation F1
    best_leaf_idx, best_split_idx = np.unravel_index(mean_scores.argmax(), mean_scores.shape)
    optimal_leaf = leaf_grid[best_leaf_idx]
    optimal_split = split_grid[best_split_idx]

    print("Optimal number of minimum samples of leaf is", optimal_leaf)
    print("Optimal number of sample splits is", optimal_split)

    optimal_model = tree.DecisionTreeClassifier(
        criterion='gini',
        class_weight='balanced',
        min_samples_split=optimal_split,
        min_samples_leaf=optimal_leaf,
    )

    # Refit on the complete dataset using the selected hyperparameters
    features = dataset[:, :-1].astype(float)
    labels = dataset[:, -1].astype(float)
    optimal_model.fit(features, labels)

    # Performance of the final tree on the data it was fitted on
    final_train_accuracy, final_train_f1 = performance_measure(optimal_model.predict(features), labels)
    print("Final Train Accuracy is", round_off_values(final_train_accuracy))
    print("Final Train F1 Score is", round_off_values(final_train_f1))

    return optimal_model

In [None]:
# Tune and fit the Decision Tree on the training array using 6 cross-validation batches, then evaluate it on the test array
optimal_decision_tree_model = decision_tree_model_train(train_mov_np,6)
model_test(optimal_decision_tree_model, test_mov_np)

## ANN Model

In [None]:
class ANN(nn.Module):
    '''
    Feed-forward binary classifier: two hidden ReLU layers (6 and 3 units) followed by
    a single sigmoid output unit that yields a probability in (0, 1)
    '''
    def __init__(self, in_features=8):
        '''
        Function to generate the structure of the neural network

        Parameters:
        in_features: number of input features per sample (defaults to 8, the width
                     that was previously hard-coded)
        '''
        super().__init__()
        # Layer order is unchanged, so state_dict keys stay net.0 / net.2 / net.4
        self.net = nn.Sequential(
            nn.Linear(in_features, 6),  # input -> hidden layer 1
            nn.ReLU(),                  # activation
            nn.Linear(6, 3),            # hidden layer 1 -> hidden layer 2
            nn.ReLU(),                  # activation
            nn.Linear(3, 1),            # hidden layer 2 -> output unit
            nn.Sigmoid(),               # squash the output to a probability
        )

    def forward(self, x):
        '''
        Function to generate the output of the neural network

        Parameters:
        x: input tensor whose last dimension has in_features entries

        Return:
        output: sigmoid output of the network, one value per sample (last dimension of size 1)
        '''
        return self.net(x)

In [None]:
def classification_model(learning_rate,wt_decay):
    '''
    Function to build a fresh ANN together with the Adam optimizer that updates it

    Parameters:
    learning_rate: learning rate handed to Adam
    wt_decay: weight decay coefficient handed to Adam

    Return:
    network: newly constructed ANN instance
    adam: Adam optimizer bound to the network's parameters
    '''
    network = ANN()
    adam = torch.optim.Adam(
        params=network.parameters(),
        lr=learning_rate,
        weight_decay=wt_decay,
    )
    return network, adam

In [None]:
def convert_data(x):
    '''
    Function to create a float32 tensor from the input numpy array

    Parameters:
    x: input numpy array (any numeric dtype, or an object-dtype array holding numbers)

    Return:
    float32 tensor holding the values of the input array
    '''
    # torch.from_numpy rejects object-dtype arrays, so coerce to float32 in
    # NumPy first; numeric inputs end up with exactly the same values as before
    return torch.from_numpy(np.asarray(x, dtype=np.float32))

In [None]:
def weighted_binary_cross_entropy(output, target, weights=(1/115, 1/69)):
    '''
    Function to calculate a class-weighted binary cross entropy loss for an imbalanced dataset

    Parameters:
    output: predicted probabilities (values in [0, 1])
    target: actual class labels (0 or 1), same shape as output
    weights: pair (weight for class 0 term, weight for class 1 term); pass None for the
             unweighted loss. NOTE(review): the defaults 1/115 and 1/69 are presumably the
             inverse class counts of the training set — confirm.

    Return:
    Scalar tensor with the negated mean of the (weighted) log-likelihood terms

    Raises:
    ValueError: if weights is given but does not hold exactly two values
    '''
    # A saturated probability (exactly 0 or 1) would make log() return -inf and
    # 0 * -inf turn the loss and its gradient into NaN. Clamping at the smallest
    # positive normal value keeps everything finite and leaves other values untouched.
    tiny = torch.finfo(output.dtype).tiny
    log_pos = torch.log(output.clamp(min=tiny))
    log_neg = torch.log((1 - output).clamp(min=tiny))

    if weights is not None:
        if len(weights) != 2:
            raise ValueError("weights must contain exactly two values: (class 0, class 1)")
        weights = torch.as_tensor(weights, dtype=torch.float32)
        loss = weights[1] * (target * log_pos) + \
               weights[0] * ((1 - target) * log_neg)
    else:
        loss = target * log_pos + (1 - target) * log_neg

    return torch.neg(torch.mean(loss))

In [None]:
def hyperparameters_opt_adam_ANN(X_train, y_train, X_val, y_val,range_learning_rate,range_weight_decay,num_epochs):
    '''
    Function to perform hyperparameter tuning for weight decay and learning rate values

    Parameters:
    X_train: Features of train dataset from the given partition
    y_train: Class labels of train dataset from the given partition
    X_val: Features of validation dataset from the given partition
    y_val: Class labels of validation dataset from the given partition
    range_learning_rate: List of values of the learning rate parameter
    range_weight_decay: List of values of the weight decay parameter
    num_epochs: total number of epochs (must be at least 10, metrics are sampled every 10 epochs)

    Return:
    score_list: mean validation F1 score for each (learning rate, weight decay) combination
    train_accuracy_final: Train accuracy averaged over every sampled epoch of every combination
    train_f1_final: Train F1 score averaged over every sampled epoch of every combination
    val_accuracy_final: Validation accuracy averaged over every sampled epoch of every combination
    val_f1_final: Validation F1 score averaged over every sampled epoch of every combination

    Raises:
    ValueError: if num_epochs is too small for any metric to be sampled
    '''
    eval_every = 10  # metrics are sampled after every 10th epoch
    if num_epochs < eval_every:
        # Without this guard the averages below would divide by zero
        raise ValueError("num_epochs must be at least 10 so that metrics can be sampled")

    score_list = np.zeros((len(range_learning_rate), len(range_weight_decay)))
    train_accuracy = []
    validation_accuracy = []
    train_f1_score = []
    validation_f1_score = []

    for i, learning_rate in enumerate(range_learning_rate):
        for j, wt_decay in enumerate(range_weight_decay):
            # Fresh network and optimizer for every hyperparameter combination
            model, optimizer = classification_model(learning_rate, wt_decay)
            epoch_val_f1 = []
            epoch_train_f1 = []
            epoch_train_acc = []
            epoch_val_acc = []

            for epoch in range(num_epochs):
                y_pred = torch.squeeze(model(X_train))
                train_loss = weighted_binary_cross_entropy(y_pred, y_train)

                if (epoch + 1) % eval_every == 0:  # after every 10 epochs are completed
                    # Threshold the probabilities at 0.5 to obtain class labels
                    train_labels = np.where(y_pred.detach().numpy() > 0.5, 1, 0)
                    train_acc, train_f1 = performance_measure(train_labels, y_train)

                    # Validation is evaluation only, so no graph is needed
                    with torch.no_grad():
                        y_val_pred = torch.squeeze(model(X_val))
                    val_labels = np.where(y_val_pred.numpy() > 0.5, 1, 0)
                    val_acc, val_f1 = performance_measure(val_labels, y_val)

                    epoch_val_f1.append(val_f1)
                    epoch_val_acc.append(val_acc)
                    epoch_train_f1.append(train_f1)
                    epoch_train_acc.append(train_acc)

                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

            # This bookkeeping must run once per (i, j) combination, i.e. inside the
            # inner loop; otherwise only the last weight decay column is ever recorded
            train_accuracy.extend(epoch_train_acc)
            validation_accuracy.extend(epoch_val_acc)
            train_f1_score.extend(epoch_train_f1)
            validation_f1_score.extend(epoch_val_f1)

            # stores the mean validation f1 score for this hyperparameter combination
            score_list[i, j] = sum(epoch_val_f1)/len(epoch_val_f1)

    #Obtaining the mean performance metrics for the data partition
    train_accuracy_final = sum(train_accuracy)/len(train_accuracy)
    train_f1_final = sum(train_f1_score)/len(train_f1_score)
    val_accuracy_final = sum(validation_accuracy)/len(validation_accuracy)
    val_f1_final = sum(validation_f1_score)/len(validation_f1_score)

    return score_list,train_accuracy_final,train_f1_final,val_accuracy_final,val_f1_final
    

In [None]:

def ann_model_adam_train(dataset,total_batches):
    '''
    Function to tune the ANN (Adam optimizer) by cross validation and train the final model

    Parameters:
    dataset: input data to train the model on; the last column holds the class label
    total_batches: number of batches to divide the dataset in for cross validation

    Return:
    optimal_model: ANN trained on the whole dataset with the best learning rate and weight decay
    '''
    num_epochs=5000# number of epochs
    param_range_weight_decay=np.array([0.0001,0.001,0.005,0.01,0.05,0.1,0.5])#list of weight decay
    param_range_learning_rate=np.array([0.0001,0.001,0.01,0.1,0.5])# list of learning rate

    # Validation F1 per fold and per (learning rate, weight decay) combination
    score_matrix=np.zeros((total_batches,len(param_range_learning_rate),len(param_range_weight_decay)))

    #Lists used to determine train/val performance measures during cross validation
    train_acc = []
    train_f1 = []
    val_acc = []
    val_f1 = []

    n=len(dataset)/total_batches
    for count in range(total_batches):# cross validation loop
        train_data,val_data=cross_validation(count,dataset,n,False,0)
        X_train=convert_data(train_data[:,0:-1])
        y_train=convert_data(train_data[:,-1])
        X_val=convert_data(val_data[:,0:-1])
        y_val=convert_data(val_data[:,-1])
        score_matrix[count], t_acc, t_f1, v_acc, v_f1=hyperparameters_opt_adam_ANN(X_train, y_train, X_val, y_val,param_range_learning_rate,param_range_weight_decay,num_epochs)
        train_acc.append(t_acc)
        train_f1.append(t_f1)
        val_acc.append(v_acc)
        val_f1.append(v_f1)

    # Average over the folds once, after every fold has been scored
    final=score_matrix.mean(axis=0)

    print_acc_val(train_acc,train_f1,val_acc,val_f1)
    index=np.unravel_index(final.argmax(), final.shape)
    optimal_lr=param_range_learning_rate[index[0]]
    optimal_weight_decay=param_range_weight_decay[index[1]]

    print("Optimal value of Learning Rate is",optimal_lr)
    print("Optimal value of Weight Decay is",optimal_weight_decay)

    #Using optimal values to build ANN model
    optimal_model, optimizer=classification_model(optimal_lr,optimal_weight_decay)

    # The final model is trained on the complete dataset, not on the train split
    # left over from the last cross-validation fold
    training_dataset=convert_data(dataset[:,0:-1])
    class_label=convert_data(dataset[:,-1])
    for epoch in range(num_epochs):
        y_pred = torch.squeeze(optimal_model(training_dataset))
        train_loss = weighted_binary_cross_entropy(y_pred, class_label)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    # Score the fully trained model (after the last optimizer step)
    with torch.no_grad():
        y_pred = torch.squeeze(optimal_model(training_dataset)).numpy()
    y_pred = np.where(y_pred>0.5, 1, 0)# convert probability to target vector
    final_train_accuracy, final_train_f1 = performance_measure(y_pred, class_label)
    print("Final Train Accuracy is", round_off_values(final_train_accuracy))
    print("Final Train F1 score is", round_off_values(final_train_f1))

    return optimal_model


In [None]:
def ann_model_test(model,test):
    '''
    Function to test the ANN model on test data

    Parameters:
    model: model created using the optimal learning rate and weight decay values
    test: test data; the last column holds the class labels

    Return:
    Prints the test set Accuracy and F1 score and plots the Confusion Matrix
    '''
    y_true=test[:,-1]#last column containing the class labels
    features=convert_data(test[:,0:-1])

    with torch.no_grad():# ensuring gradients are not being calculated for the test set data
        # Flatten the (N, 1) network output so predictions line up with the 1-D labels
        y_prob=model(features).numpy().reshape(-1)

    y_pred_np=np.where(y_prob>0.5, 1, 0)# convert probability to target vector
    test_accuracy=accuracy_score(y_true,y_pred_np)
    test_f1=f1_score(y_true,y_pred_np)

    print("Test Accuracy is", round_off_values(test_accuracy))
    print("Test F1 Score is", round_off_values(test_f1))
    plot_confusion_matrix(y_true, y_pred_np)

In [None]:
# Tune and train the ANN on the training array using 8 cross-validation batches
ann_adam_model=ann_model_adam_train(train_mov_np,8)

# Evaluate the trained ANN on the test array
ann_model_test(ann_adam_model,test_mov_np)

### References

https://discuss.pytorch.org/t/loss-function-with-small-amount-of-positives/70900