## Importing Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

import shap

pd.set_option('display.max_columns', 500)

  from .autonotebook import tqdm as notebook_tqdm


### Defining functions

In [3]:
def create_lagged_plots(dataframe, lagged_columns: list, n_lags: list = [3,7,14,30]) -> None:
    """Creates plots of rolling mean.

    Args:
        dataframe (pd.DataFrame): Dataframe to create the charts.
        lagged_columns (list): Column names to iterate through.
        n_lags (list, optional): Rolling days. Defaults to [3,7,14,30].
    """

    for day in n_lags:
        for column in lagged_columns:
            plt.figure(figsize=(20,5))
            dataframe[column].rolling(window=day).mean().plot()
            plt.title(f"{day} day rolling mean for {column}")
            plt.show()

def create_histograms(dataframe: pd.DataFrame, columns: list, n_bins: int, kde=True):

    for col in columns:
        plt.figure(figsize=(20,5))
        sns.histplot(dataframe, x=dataframe[col], bins=n_bins, kde=kde)
        plt.title(f"Histplot of {col}, with n_bins = {n_bins}")
        plt.show()

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
X = train.drop('rainfall', axis=1)
y = train['rainfall']

X_train, X_test, y_train, y_test = train_test_split(X,y, shuffle=False, stratify=None, random_state=42)

In [6]:
# Interactive features: 
X_train["cloud_sun_ratio"] = X_train["cloud"] / (X_train["sunshine"] + 1)
X_test["cloud_sun_ratio"] = X_test["cloud"] / (X_test["sunshine"] + 1)

X_train["sun_humidity_interaction"] = X_train["sunshine"] * X_train["humidity"]
X_test["sun_humidity_interaction"] = X_test["sunshine"] * X_test["humidity"]

X_train["saturation_deficit"] = (X_train["temparature"] - X_train["dewpoint"]) / (X_train["humidity"] + 1)
X_test["saturation_deficit"] = (X_test["temparature"] - X_test["dewpoint"]) / (X_test["humidity"] + 1)

X_train["saturation_ratio"] = X_train["dewpoint"] / (X_train["temparature"] + 1)
X_test["saturation_ratio"] = X_test["dewpoint"] / (X_test["temparature"] + 1)

In [7]:
logreg_classifier = LogisticRegression()
logreg_classifier.fit(X_train, y_train)

y_preds = logreg_classifier.predict_proba(X_test)
y_preds_proba = [pred[1] for pred in y_preds]

roc_auc_score(y_test, y_preds_proba)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


np.float64(0.8851267336202773)

In [13]:
# Use SHAP's linear explainer (suitable for logistic regression)
explainer = shap.Explainer(logreg_classifier, X_train)
shap_values = explainer(X_test) 

In [None]:
shap.summary_plot(shap_values, X_test)

In [None]:
shap.plots.bar(shap_values)

In [None]:
# Generating the submission

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


train["cloud_sun_ratio"] = train["cloud"] / (train["sunshine"] + 1)
test["cloud_sun_ratio"] = test["cloud"] / (test["sunshine"] + 1)

train["sun_humidity_interaction"] = train["sunshine"] * train["humidity"]
test["sun_humidity_interaction"] = test["sunshine"] * test["humidity"]

train["saturation_deficit"] = (train["temparature"] - train["dewpoint"]) / (train["humidity"] + 1)
test["saturation_deficit"] = (test["temparature"] - test["dewpoint"]) / (test["humidity"] + 1)


train["saturation_ratio"] = train["dewpoint"] / (train["temparature"] + 1)
test["saturation_ratio"] = test["dewpoint"] / (test["temparature"] + 1)

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train.drop('rainfall', axis=1)
y = train['rainfall']


xgb_classifier = LogisticRegression()
xgb_classifier.fit(X, y)

y_preds = xgb_classifier.predict_proba(test[X.columns])
y_preds_proba = [pred[1] for pred in y_preds]

test['rainfall'] = y_preds_proba
test[['id', 'rainfall']].to_csv("submission.csv", index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
