## Loading the data

In [19]:
import pandas as pd

In [20]:
# Read the training features and the training labels from their Parquet files.
x_train = pd.read_parquet('x_train.parquet')
y_train = pd.read_parquet('y_train.parquet')

# Display both shapes; the row counts must match for the model to be fitted.
x_train.shape, y_train.shape

((97988, 117), (97988, 1))

In [21]:
# Count NaNs per column, then sum those counts: the total number of missing values in x_train.
x_train.isna().sum().sum()

0

In [24]:
y_train

Unnamed: 0,RainTomorrow
0,No
1,No
2,No
3,No
4,No
...,...
144548,No
144549,No
144550,No
144551,No


## Training The Model

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
model = LogisticRegression(solver='liblinear')
# y_train is a single-column DataFrame of shape (n_samples, 1). scikit-learn
# expects a 1-D target of shape (n_samples,) and emits a DataConversionWarning
# otherwise, so flatten the labels explicitly before fitting.
model.fit(x_train, y_train.to_numpy().ravel())

  y = column_or_1d(y, warn=True)


In [28]:
len(x_train.columns.tolist())

117

In [36]:
model.coef_

array([ 9.82884425e-01, -1.61386213e+00,  3.25678117e+00,  7.39513806e-01,
       -1.66573173e+00,  6.71276394e+00, -8.94635484e-01, -1.47861531e+00,
        5.08637269e-01,  5.66897459e+00,  5.75141487e+00, -9.44242351e+00,
       -1.54227331e-01,  1.26926780e+00,  9.61125779e-01,  5.96816971e-01,
       -5.43356879e-01,  4.84106548e-01,  1.26906968e-02,  3.42099959e-01,
       -3.50293529e-01,  1.81450918e-01,  4.25850805e-01, -4.91223966e-03,
        1.54232555e-02,  2.53770131e-01, -1.84029726e-02, -3.04681658e-02,
       -4.67292366e-01, -1.44188577e-01, -5.90816013e-01, -7.44573281e-01,
       -2.49883336e-01, -3.28667138e-01, -5.70917370e-01,  8.01690013e-02,
        1.40349870e-02,  5.99526572e-02, -8.77145236e-01, -4.41486169e-01,
        1.18256767e-02, -4.59478698e-01, -4.60181039e-01, -7.46910804e-02,
        1.94557025e-01,  4.45677291e-01,  6.07370249e-01,  4.30394676e-01,
       -2.08871559e-02,  2.53153843e-01, -3.19395226e-01,  4.06662854e-01,
       -5.79831615e-02, -

In [38]:
# Pair every feature name with its learned coefficient. model.coef_[0] selects
# the first row of the coefficient array; .T puts one feature per column so the
# table is displayed as two wide rows ("Feature" and "Weights").
pd.DataFrame({
    "Feature" :  x_train.columns.tolist(),
    "Weights" :  model.coef_[0].tolist()
}).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,107,108,109,110,111,112,113,114,115,116
Feature,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,WindDir3pm_nan,RainToday_No,RainToday_Yes
Weights,0.982884,-1.613862,3.256781,0.739514,-1.665732,6.712764,-0.894635,-1.478615,0.508637,5.668975,...,-0.239792,-0.367651,-0.3216,-0.37622,-0.181048,-0.027701,-0.28154,0.096679,-1.430158,-0.933791


## Making Predictions

We will check the model's outputs on the training, validation, and test sets.

In [39]:
train_pre = model.predict(x_train)
train_pre, y_train

(array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object),
        RainTomorrow
 0                No
 1                No
 2                No
 3                No
 4                No
 ...             ...
 144548           No
 144549           No
 144550           No
 144551           No
 144552           No
 
 [97988 rows x 1 columns])

To evaluate the predictions we use the accuracy score.

In [41]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [42]:
accuracy_score(y_train, train_pre)

0.8519002326815528

Checking the model's predicted probabilities on the training set

In [44]:
# Per-row class probabilities instead of hard labels. scikit-learn orders the
# columns as in model.classes_ -- presumably ['No', 'Yes'] here; confirm by
# inspecting model.classes_.
train_probs = model.predict_proba(x_train)
train_probs

array([[0.93950629, 0.06049371],
       [0.94333091, 0.05666909],
       [0.95980434, 0.04019566],
       ...,
       [0.98730094, 0.01269906],
       [0.983581  , 0.016419  ],
       [0.87599703, 0.12400297]])

In [52]:
# normalize='true' divides each row by the number of samples in that true class,
# so every row sums to 1; multiplying by 100 shows the rates as percentages.
confusion_matrix(y_train, train_pre, normalize='true') * 100

array([[94.61346633,  5.38653367],
       [47.74749977, 52.25250023]])

### Checking the model on the validation set

In [55]:
x_val = pd.read_parquet('x_val.parquet')
y_val = pd.read_parquet('y_val.parquet')

x_val.shape, y_val.shape

((17089, 117), (17089, 1))

In [58]:
val_pre = model.predict(x_val)
val_pre, y_val

(array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object),
        RainTomorrow
 2133             No
 2134             No
 2135             No
 2136             No
 2137             No
 ...             ...
 144913           No
 144914           No
 144915           No
 144916           No
 144917           No
 
 [17089 rows x 1 columns])

In [59]:
accuracy_score(y_val, val_pre)

0.854058166071742

In [60]:
# Row-normalised confusion matrix for the validation set. Unlike the train and
# test cells, this one is not multiplied by 100, so the values are fractions
# in the range 0-1 rather than percentages.
confusion_matrix(y_val, val_pre, normalize='true')

array([[0.95810821, 0.04189179],
       [0.53884852, 0.46115148]])

### Test Set

In [61]:
x_test = pd.read_parquet('x_test.parquet')
y_test = pd.read_parquet('y_test.parquet')

x_test.shape, y_test.shape

((25710, 117), (25710, 1))

In [63]:
test_pre = model.predict(x_test)

In [64]:
accuracy_score(y_test, test_pre)

0.8424737456242707

In [66]:
confusion_matrix(y_test, test_pre, normalize='true') * 100

array([[94.7447825 ,  5.2552175 ],
       [51.58798283, 48.41201717]])

## Making a Random Model

In [75]:
import numpy as np
def rand_moldel(inp, seed=None):
    """Baseline that guesses 'Yes' or 'No' uniformly at random for every row.

    Parameters
    ----------
    inp : sized
        Only ``len(inp)`` is used, to decide how many guesses to draw.
    seed : int or None, optional
        When given, the guesses are drawn from a generator seeded with this
        value, so repeated calls return the same array. When ``None`` (the
        default) the global NumPy random state is used, as before.

    Returns
    -------
    numpy.ndarray
        One 'Yes'/'No' string per element of ``inp``.
    """
    # Fall back to the module-level sampler so unseeded calls keep their
    # original behaviour (including honouring a prior np.random.seed call).
    sampler = np.random if seed is None else np.random.default_rng(seed)
    return sampler.choice(['Yes', 'No'], len(inp))
def say_no(inp):
    """Baseline that predicts 'No' for every element of ``inp``.

    Only ``len(inp)`` is used; the returned NumPy array holds that many
    copies of the string 'No'.
    """
    n_rows = len(inp)
    return np.full(shape=n_rows, fill_value='No')

In [76]:
# Test-set accuracy of two naive baselines: random guessing, then always predicting 'No'.
accuracy_score(y_test,  rand_moldel(x_test)), accuracy_score(y_test, say_no(x_test))

(0.5006223259432128, 0.7734344612991054)

## Skipping Model Export
The model is not exported because that is not required at this point.