<a href="https://www.kaggle.com/code/keyushnisar/rain-prediction?scriptVersionId=185909982" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
raw_df=pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
raw_df

In [None]:
# Keep only the rows where both rain labels are present.
raw_df=raw_df.dropna(subset=['RainToday','RainTomorrow'])

In [None]:
raw_df.info()

In [None]:
raw_df.Location.nunique()#unique data inputs

In [None]:
import plotly.express as px
import matplotlib
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
px.histogram(raw_df,x='Location',title='Location vs Rainy days',color='RainToday')

In [None]:
# Distribution of the 3pm temperature, coloured by whether it rained the next day.
# The title names the plotted column (Temp3pm); it previously said "Rain@3pm".
px.histogram(raw_df,x='Temp3pm',color='RainTomorrow',title='Temp3pm vs RainTomorrow')

In [None]:
px.scatter(raw_df,x='MinTemp',y='MaxTemp',color='RainToday')

# Training,Validation and Test set
* Training set - used to train the model, i.e. compute the loss and adjust the model's weights using an optimization technique.
* Validation set - used to evaluate the model during training, tune model hyperparameters and pick the best version of the model.
Picking a good validation set is essential for training models.
* Test set - used to compare different models or approaches and report the model's final accuracy.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Random 80/20 split of the rows (fixed seed for reproducibility).
# NOTE(review): `test_df` is reassigned by the year-based split in the next cell and
# `train_val_df` is not used again in the visible notebook, so this split looks
# superseded — confirm before relying on it or removing it.
train_val_df,test_df=train_test_split(raw_df,test_size=0.2,random_state=42)


In [None]:
# Chronological split on the year of `Date`:
#   before 2015 -> training, 2015 -> validation, after 2015 -> test.
year=pd.to_datetime(raw_df['Date']).dt.year
train_df=raw_df.loc[year.lt(2015)]
val_df=raw_df.loc[year.eq(2015)]
test_df=raw_df.loc[year.gt(2015)]


In [None]:
print(train_df.shape)

# creating the input and the targets column

In [None]:
# Feature columns: every column of train_df except the first and the last one.
# Presumably the first column is `Date` and the last is the target — TODO confirm
# against the CSV column order, since the slice is purely positional.
input_cols=list(train_df.columns)[1:-1]
# Name of the label column used as the prediction target.
target_col='RainTomorrow'

In [None]:
input_cols

# we can now create inputs and targets for training validation and test sets # 

In [None]:
train_inputs=train_df[input_cols].copy()
train_target=train_df[target_col].copy()

In [None]:
val_inputs=val_df[input_cols].copy()
val_target=val_df[target_col].copy()

In [None]:
test_inputs=test_df[input_cols].copy()
test_target=test_df[target_col].copy()

In [None]:
train_inputs

In [None]:
numeric_cols=train_inputs.select_dtypes(include=np.number).columns.to_list()
categorical_cols=train_inputs.select_dtypes('object').columns.to_list()

In [None]:
train_inputs[numeric_cols].describe()

# Imputing missing numeric data
The process of filling in missing values is called imputation.
We'll use the basic approach of replacing missing values with the average value of the column, using the `SimpleImputer` class from `sklearn.impute`.


In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer=SimpleImputer(strategy='mean')

In [None]:
train_inputs[numeric_cols].isna().sum()

**the first step in imputation is to fit the imputer to the data i.e compute the chosen stats for each column in the dataset**

In [None]:
# Learn the column means from the training split only, so that no statistics
# from the validation/test rows leak into the preprocessing.
imputer.fit(train_inputs[numeric_cols])

In [None]:
list(imputer.statistics_)

**The missing values in all three sets of data can be filled using the `transform` method.**

In [None]:
# Fill the missing numeric values of every split using the fitted imputer.
for split_inputs in (train_inputs,val_inputs,test_inputs):
    split_inputs[numeric_cols]=imputer.transform(split_inputs[numeric_cols])





In [None]:
train_inputs[numeric_cols]

In [None]:
train_inputs[numeric_cols].isna().sum()

AS WE CAN SEE THERE ARE NO MISSING VALUE

# Scaling numeric feature
Another good practice is to scale numeric features to a small range of values, e.g. (0, 1) or (-1, 1).

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler=MinMaxScaler()

In [None]:
# Learn each column's min/max from the training split only, so that the scaling
# does not depend on validation/test rows. Values outside the training range
# are simply mapped slightly outside [0, 1].
scaler.fit(train_inputs[numeric_cols])

In [None]:
# Rescale the numeric columns of every split with the fitted scaler.
for split_inputs in (train_inputs,val_inputs,test_inputs):
    split_inputs[numeric_cols]=scaler.transform(split_inputs[numeric_cols])

The training, validation and test sets have now each been scaled with the same fitted scaler; let's check the resulting value ranges.

In [None]:
# Summary statistics of the scaled validation inputs. The training summary that
# used to precede this line was never rendered (a cell shows only its last
# expression) and is displayed by the following cell anyway.
val_inputs[numeric_cols].describe()

In [None]:
train_inputs[numeric_cols].describe()

# encoding categorial data

In [None]:
raw_df.Location.nunique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Dense one-hot encoder that raises on categories it was not fitted on.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new name first and fall back to the old one on older releases.
try:
    encoder=OneHotEncoder(sparse_output=False,handle_unknown='error')
except TypeError:
    encoder=OneHotEncoder(sparse=False,handle_unknown='error')

In [None]:
raw_df2=raw_df[categorical_cols].fillna('Unknown')

In [None]:
encoder.fit(raw_df2)

In [None]:
encoder.categories_

In [None]:
categorical_cols

In [None]:
encoded_cols=list(encoder.get_feature_names_out(categorical_cols))

In [None]:
encoded_cols

**To perform encoding we use the transform method**

In [None]:
# Add the one-hot columns to every split; missing categories are encoded as 'Unknown'.
for split_inputs in (train_inputs,val_inputs,test_inputs):
    split_inputs[encoded_cols]=encoder.transform(split_inputs[categorical_cols].fillna('Unknown'))

In [None]:
train_inputs

In [None]:
categorical_cols

# Train model using Logistic regression
* We apply the sigmoid function to the result to obtain a number between 0 and 1.
* Instead of RMSE, the cross-entropy loss function is used to compute the loss.

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model=LogisticRegression(solver='liblinear') #

In [None]:
%%time
model.fit(train_inputs[numeric_cols+encoded_cols],train_target)

In [None]:
model.intercept_#bias

In [None]:
print(numeric_cols+encoded_cols)

In [None]:
model.coef_.tolist()#weights

In [None]:
# One row per model input: the feature name next to its learned coefficient.
weight_df=pd.DataFrame(
    list(zip(numeric_cols+encoded_cols,model.coef_[0])),
    columns=['feature','weights'],
)

In [None]:
# Horizontal bars for the ten largest (most positive) weights.
top_weights=weight_df.sort_values('weights',ascending=False).head(10)
fig,ax=plt.subplots(figsize=(10,30))
sns.barplot(data=top_weights,x='weights',y='feature',ax=ax)

# making predicitions and evaluating the model

In [None]:
# Model inputs for each split: the numeric columns plus the one-hot columns
# (the raw categorical columns are left out).
model_cols=numeric_cols+encoded_cols
X_train,X_val,X_test=(split[model_cols] for split in (train_inputs,val_inputs,test_inputs))

In [None]:
train_pred=model.predict(X_train)

In [None]:
val_pred=model.predict(X_val)

In [None]:
val_pred

In [None]:
train_pred.tolist()

In [None]:
train_target

we can get the accuracy of the model by using the accuracy_score

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the order now matches the API and the confusion_matrix calls below.
accuracy_score(train_target,train_pred)

In [None]:
# Validation accuracy, with arguments in (y_true, y_pred) order.
accuracy_score(val_target,val_pred)

In [None]:
train_probs=model.predict_proba(X_train)

this gives the probability of each day about if it will rain or not

In [None]:
train_probs

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(train_target,train_pred,normalize='true') #true negative false positive false negative true positive

In [None]:
confusion_matrix(val_target,val_pred,normalize='true')

Well, the accuracy on the train and validation sets is about 84%. A good way to verify whether this model has actually learned something useful is to compare its results to a random or "dumb" baseline model.
* lets create two models 
* one that guesses randomly and other that always return no

In [None]:
def random_guess(inputs,seed=None):
    """Baseline that predicts 'No' or 'Yes' uniformly at random for each row.

    Parameters
    ----------
    inputs : sized collection
        Only ``len(inputs)`` is used; one label is produced per element.
    seed : int, optional
        When given, labels are drawn from a dedicated generator seeded with
        this value, so the result is reproducible. When omitted, NumPy's
        global random state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of ``len(inputs)`` strings, each 'No' or 'Yes'.
    """
    # Both the legacy module and a Generator expose choice(a, size).
    rng=np.random if seed is None else np.random.default_rng(seed)
    return rng.choice(['No','Yes'],len(inputs))

In [None]:
def all_no(inputs):
    """Baseline that predicts 'No' for every row.

    Only the length of `inputs` is used; the result is a NumPy array holding
    that many 'No' strings.
    """
    n_rows=len(inputs)
    return np.repeat('No',n_rows)

In [None]:
accuracy_score(test_target,random_guess(X_test))

In [None]:
accuracy_score(test_target,all_no(X_test))

Thankfully, my model does a little better than these naive baseline models.


# Training and visualizing Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model=DecisionTreeClassifier(random_state=42)

In [None]:
model.fit(X_train,train_target)

# EVALUATION

In [None]:
train_preds=model.predict(X_train)

In [None]:
# A cell renders only its last expression, so the bare `train_preds` line was
# never shown. Put predictions and true labels side by side instead.
pd.DataFrame({'prediction':train_preds,'actual':train_target}).head(10)

In [None]:
# Training accuracy of the decision tree, with arguments in (y_true, y_pred) order.
accuracy_score(train_target,train_preds)

In [None]:
model.score(X_val,val_target)

**The accuracy on the training set is 100%, whereas on the validation set it is about 80%, which is only a little better than the dummy model that always predicts "No".
It looks like the model has learned the training examples perfectly but fails to generalize to unseen examples. This is called overfitting, and reducing it is essential.**

# Visualization of DT

In [None]:
from sklearn.tree import plot_tree,export_text

In [None]:
# Draw the top levels of the fitted tree. feature_names is passed as a plain
# list because some scikit-learn releases reject a pandas Index here.
plt.figure(figsize=(30,20))
plot_tree(model,feature_names=list(X_train.columns),max_depth=4,filled=True);

In [None]:
model.tree_.max_depth

Note the Gini value. This is the loss function used by the decision tree to decide which column should be used for splitting the data.
A low Gini score indicates a good split, and a high score indicates a bad split.

# Hyperparameter tuning and Overfitting# 
These arguments are called hyperparameters because they must be configured manually rather than learned from the data.
* max_depth - by reducing the maximum depth, we can prevent the tree from memorizing all the training examples.
* max_leaf_nodes - by limiting the number of leaf nodes, we still allow branches of the tree to have varying depths.

In [None]:
model=DecisionTreeClassifier(max_depth=7,random_state=42)

In [None]:
model.fit(X_train,train_target)

In [None]:
model.score(X_train,train_target)


In [None]:
model.score(X_val,val_target)

well the training accuracy has gone down but validation accuracy has increased well.. it is no longer overfitting....

In [None]:
# Draw the whole depth-limited tree. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index here.
plt.figure(figsize=(30,20))
plot_tree(model,feature_names=list(X_train.columns),filled=True);

In [None]:
def max_depth_error(md):
    """Fit a depth-limited decision tree and report its error rates.

    Trains a DecisionTreeClassifier with max_depth=md (random_state=42) on
    X_train/train_target and returns a dict holding the depth, the training
    error and the validation error, where error = 1 - accuracy.
    """
    tree = DecisionTreeClassifier(max_depth=md, random_state=42).fit(X_train, train_target)
    return {
        'Max Depth': md,
        'Training Error': 1 - tree.score(X_train, train_target),
        'Validation Error': 1 - tree.score(X_val, val_target),
    }

In [None]:
%%time
errors_df = pd.DataFrame([max_depth_error(md) for md in range(1, 21)])

In [None]:
errors_df

max_leaf_nodes

In [None]:
# Limit the tree by leaf count instead of depth (128 = 2^7 leaves). The model was
# previously created but never trained; fit it and report (train, validation)
# accuracy so the effect of max_leaf_nodes can actually be seen.
model=DecisionTreeClassifier(max_leaf_nodes=128,random_state=42).fit(X_train,train_target)
model.score(X_train,train_target),model.score(X_val,val_target)

# Random forest
A random forest combines the results of several decision trees trained with slightly different parameters.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model=RandomForestClassifier(n_jobs=-1,random_state=42)

n_jobs allows the random forest to use multiple parallel workers to train decision trees, and random_state ensures we get the same result after every execution

In [None]:
model.fit(X_train,train_target)

In [None]:
model.score(X_train,train_target)

In [None]:
model.score(X_val,val_target)

this time the validation accuracy is much better, in fact better than the best single decision tree we trained above. This general technique is known as ensembling, it works because most errors of individual models cancel out on averaging

we can access individual decision tree using the below method

In [None]:
model.estimators_

In [None]:
plt.figure(figsize=(60,20))
plot_tree(model.estimators_[15],max_depth=2,filled=True)

# Hyperparameter tuning with random forests


In [None]:
base_model=RandomForestClassifier(random_state=42,n_jobs=-1).fit(X_train,train_target)

In [None]:
base_train_Acc=base_model.score(X_train,train_target)
base_train_Acc

**n_estimators**
this arguments control the number of decision tree in RF. the default value is 100. For large datasets,it helps to have large amount of estimators

In [None]:
model=RandomForestClassifier(n_jobs=-1,random_state=42,n_estimators=10)
model.fit(X_train,train_target)

In [None]:
model.score(X_train,train_target)

**max_depth and max_leaf_nodes**
These arguments are passed directly to each decision tree, and control the maximum depth and the maximum number of leaf nodes.
By default, no maximum depth is specified, which is why we get (nearly) 100% accuracy on the training set.
* Let's write a function to test different parameters.

In [None]:
def test_Par(**params):
    """Train a random forest with the given hyperparameters and score it.

    All keyword arguments are forwarded to RandomForestClassifier, on top of
    n_jobs=-1 and random_state=42. Returns the tuple
    (training accuracy, validation accuracy).
    """
    forest=RandomForestClassifier(n_jobs=-1,random_state=42,**params)
    forest.fit(X_train,train_target)
    train_acc=forest.score(X_train,train_target)
    val_acc=forest.score(X_val,val_target)
    return train_acc,val_acc

In [None]:
test_Par(max_depth=17,max_leaf_nodes=1024,n_estimators=100,max_features=20)

**max_features**
instead of picking all features(columns)for every split,we can specify that only a fraction be chosen randomly to figure out a split
max_features: {"auto", "sqrt", "log2"}, int or float
* If float, then max_features is a fraction and round(max_features * n_features) features are considered at each split.
* If "auto", then max_features = sqrt(n_features) (same as "sqrt"; in general a good default).
* If "log2", then max_features = log2(n_features).


In [None]:
test_Par(max_features=20)

**min_samples_split and min_samples_leaf**
By default, the decision tree classifier tries to split every node that has 2 or more samples. We can increase the values of these arguments to change this behaviour and reduce overfitting.

In [None]:
test_Par(min_samples_split=5,min_samples_leaf=20)

**min_impurity_decrease**
* This is used to control the threshold for splitting nodes. A node will be split if the split induces a decrease of the impurity (Gini index) greater than or equal to this value. Its default value is 0, and you can increase it to reduce overfitting.

In [None]:
test_Par(min_impurity_decrease=1e-6)

**Bootstrap, max_samples**
* By default, a random forest doesn't use the entire dataset for training each decision tree. Instead, it applies a technique called bootstrapping: for each tree, rows from the dataset are picked one by one at random, with replacement, i.e. some rows may not show up at all while others may show up multiple times.

In [None]:
test_Par(bootstrap=True,max_samples=0.9)#i.e just 90% of the rows

**class_weight**

In [None]:
# Print the class labels explicitly: as a bare first line the expression was
# evaluated but never shown, because a cell renders only its last expression.
print(model.classes_)
test_Par(class_weight='balanced')

In [None]:
test_Par(class_weight={'No':1,'Yes':2})

# putting all together

In [None]:
  model=RandomForestClassifier(n_jobs=-1,random_state=42,n_estimators=1000,max_features=20,max_depth=1024,class_weight={'No':1,'Yes':1.5})

In [None]:
model.fit(X_train,train_target)
model.score(X_val,val_target)

# conclusion
We've increased the accuracy from 84.5% to 85.7%.
Depending on the dataset, hyperparameter tuning may or may not yield significant improvements. Possible reasons:
* We may not have found the right mix of hyperparameters.
* We might have reached the limitations of the technique, so we should try other models, e.g. gradient boosting.
* We may have reached the limits of what we can predict with the given amount of data, and need more data.
* We might need more features (columns), i.e. feature engineering.
* The test set will usually give slightly lower accuracy than the validation and training sets.

In [None]:
model.score(X_test,test_target)

# making predcitions on single input
Let's say we are given a dictionary containing new data on which to use this model.

In [None]:
new_input = {'Date': '2021-06-19',
             'Location': 'Katherine',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
new_in_df=pd.DataFrame([new_input])

In [None]:
new_in_df

we created a DataFrame of this dict
* we must apply the same steps we applied during preprocessing
1. imputation of missing values
1. scaling of numeric values
1. encoding categorical data

In [None]:
# Apply the fitted preprocessing steps in the same order used for the training data.
new_in_df[numeric_cols]=imputer.transform(new_in_df[numeric_cols])
new_in_df[numeric_cols]=scaler.transform(new_in_df[numeric_cols])
# Missing categories must be mapped to 'Unknown' exactly as during training:
# the encoder was fitted on data where NaN had been replaced by 'Unknown', so a
# raw NaN in a new record would otherwise be an unseen category.
new_in_df[encoded_cols]=encoder.transform(new_in_df[categorical_cols].fillna('Unknown'))

In [None]:
X_new_input=new_in_df[numeric_cols+encoded_cols]

In [None]:
X_new_input

WE CAN NOW PREDICT ON THIS NEW INPUT

In [None]:
predictions=model.predict(X_new_input)

In [None]:
# Show the predicted label for the single new record. The original line ended in
# a dangling '.', which is a SyntaxError.
predictions[0]

So my model predicts that it will rain tomorrow.

In [None]:
prob=model.predict_proba(X_new_input)[0]

In [None]:
prob

# saving and loading trained models
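We can save the parameters (weights and biases) of our trained model to disk, together with the fitted imputer, scaler and encoder and the column lists, so that the whole pipeline can be reloaded later without retraining.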

In [None]:
import joblib

In [None]:
# Bundle the fitted model together with the preprocessing objects and the column
# lists so they can be stored as a single object.
# NOTE(review): the key spellings are inconsistent — 'input cols' and
# 'categorical cols' contain a space, 'target_Cols' is capitalised and holds a
# single column name, and 'encoder_cols' holds `encoded_cols`. Any code reading
# this dict must use exactly these strings; consider normalising them once all
# consumers are known.
aussie_rain={
    'model':model,
    'imputer':imputer,
    'scaler':scaler,
    'encoder':encoder,
    'input cols':input_cols,
    'target_Cols':target_col,
    'numeric_cols':numeric_cols,
    'categorical cols':categorical_cols,
    'encoder_cols':encoded_cols
}

In [None]:
joblib.dump(aussie_rain,'aussie_rain.joblib')

In [None]:
# Stand-alone recap of the whole preprocessing pipeline: download, split,
# impute, scale, one-hot encode, then persist the processed splits as parquet.
import opendatasets as od
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Download the dataset
od.download('https://www.kaggle.com/jsphyg/weather-dataset-rattle-package')
raw_df = pd.read_csv('weather-dataset-rattle-package/weatherAUS.csv')
# Rows without a RainToday/RainTomorrow label are dropped
raw_df = raw_df.dropna(subset=['RainToday', 'RainTomorrow'])

# Create training, validation and test sets (chronological split on the year)
year = pd.to_datetime(raw_df.Date).dt.year
train_df, val_df, test_df = raw_df[year < 2015], raw_df[year == 2015], raw_df[year > 2015]

# Create inputs and targets
input_cols = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_col].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_col].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_col].copy()

# Identify numeric and categorical columns.
# Every numeric column is kept: the previous `[:-1]` slice silently dropped the
# last one, leaving it neither imputed nor scaled.
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes('object').columns.tolist()

# Impute missing numerical values (statistics learned from the training split
# only, so nothing leaks from the validation/test rows)
imputer = SimpleImputer(strategy='mean').fit(train_inputs[numeric_cols])
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = imputer.transform(split_inputs[numeric_cols])

# Scale numeric features (min/max learned from the training split only)
scaler = MinMaxScaler().fit(train_inputs[numeric_cols])
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

# One-hot encode categorical features.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new name first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Missing categories are encoded as 'Unknown', matching the step-by-step
# pipeline earlier in this notebook.
encoder.fit(raw_df[categorical_cols].fillna('Unknown'))
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[encoded_cols] = encoder.transform(split_inputs[categorical_cols].fillna('Unknown'))

# Save processed data to disk
train_inputs.to_parquet('train_inputs.parquet')
val_inputs.to_parquet('val_inputs.parquet')
test_inputs.to_parquet('test_inputs.parquet')
pd.DataFrame(train_targets).to_parquet('train_targets.parquet')
pd.DataFrame(val_targets).to_parquet('val_targets.parquet')
pd.DataFrame(test_targets).to_parquet('test_targets.parquet')

# Load processed data from disk
train_inputs = pd.read_parquet('train_inputs.parquet')
val_inputs = pd.read_parquet('val_inputs.parquet')
test_inputs = pd.read_parquet('test_inputs.parquet')
train_targets = pd.read_parquet('train_targets.parquet')[target_col]
val_targets = pd.read_parquet('val_targets.parquet')[target_col]
test_targets = pd.read_parquet('test_targets.parquet')[target_col]