# Logistic Regression (Continued)

#### Import Libraries and Modules

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

#### Configure Plot Settings

In [2]:
# Apply the seaborn theme first, then set the matplotlib defaults used by
# every figure created afterwards.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # 8-digit hex with zero alpha
})

#### Load Data

In [None]:
# Absolute path to the weatherAUS CSV; adjust it when running outside this
# workspace.
data_path = r'/workspaces/Lecture2/raw_data/weatherAUS.csv'
raw_df = pd.read_csv(data_path)
# Print column names, non-null counts, and dtypes of the loaded frame.
raw_df.info()

In [4]:
# Remove rows where 'RainToday' or 'RainTomorrow' is missing; raw_df is
# modified in place.
raw_df.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)


#### Splitting the Dataset into Train, Validation, and Test Sets by Year

In [5]:
# Calendar year of every row, parsed from the 'Date' column.
year = pd.to_datetime(raw_df['Date']).dt.year

# Chronological split: before 2015 -> train, 2015 -> validation,
# after 2015 -> test.
train_df, val_df, test_df = (
    raw_df[mask] for mask in (year < 2015, year == 2015, year > 2015)
)

In [None]:
# Report the (rows, columns) shape of each split.
for split_name, split_df in (
    ('train_df', train_df),
    ('val_df', val_df),
    ('test_df', test_df),
):
    print(f'{split_name}.shape :', split_df.shape)

In [7]:
# The label to predict, and the feature columns: every column of train_df
# except the first and the last one.
target_col = 'RainTomorrow'
input_cols = train_df.columns[1:-1].tolist()

In [8]:
# Take independent copies of the feature frame and the target series for each
# split, so later in-place edits do not touch train_df / val_df / test_df.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_col].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_col].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_col].copy()

In [9]:
# Partition the feature columns by dtype: numeric ones are imputed and scaled
# below, object-typed ones are one-hot encoded.
numeric_cols = list(train_inputs.select_dtypes(include='number').columns)
categorical_cols = list(train_inputs.select_dtypes(include='object').columns)

# Imputing Missing Numeric Data

In [10]:
# Imputer that replaces missing values with the column mean.
imputer = SimpleImputer(strategy = 'mean')

In [None]:
# Missing-value count per numeric column across the full dataset.
raw_df[numeric_cols].isna().sum()

In [None]:
# Missing-value count per numeric column in the training inputs.
train_inputs[numeric_cols].isna().sum()

In [None]:
# Learn the per-column means from the training split only. Fitting on raw_df
# would let validation and test rows influence the fill values (data leakage).
imputer.fit(train_inputs[numeric_cols])

In [None]:
# Fill value learned for each numeric column (the column mean, given
# strategy='mean').
list(imputer.statistics_)

In [15]:
# Replace missing numeric values in every split using the fitted imputer.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = imputer.transform(split_inputs[numeric_cols])

In [None]:
# Re-count missing values in the training inputs after imputation.
train_inputs[numeric_cols].isna().sum()

In [None]:
# Number of missing labels in the training targets.
print(train_targets.isna().sum())

## Scaling Numeric Features

In [None]:
# Summary statistics of the numeric columns in the full dataset.
raw_df[numeric_cols].describe()

In [19]:
# Min-max scaler with default settings (rescales each feature to [0, 1]).
scaler = MinMaxScaler()

In [None]:
# Learn each column's min/max from the training split only, so validation and
# test rows do not leak into the scaling parameters. Values in those splits
# may therefore fall slightly outside [0, 1] after transform.
scaler.fit(train_inputs[numeric_cols])

In [None]:
# Per-column minimum seen by the scaler during fit.
print('Minimum:')
list(scaler.data_min_)

In [None]:
# Per-column maximum seen by the scaler during fit.
print('Maximum:')
list(scaler.data_max_)

In [23]:
# Rescale the numeric columns of every split with the fitted scaler.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])

In [None]:
# Confirm scaling introduced no missing values in the training inputs.
train_inputs[numeric_cols].isna().sum()

# Encoding Categorical Data

In [None]:
# Number of distinct values in each categorical column of the full dataset.
raw_df[categorical_cols].nunique()

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
# One-hot encoder returning a dense array; categories not seen during fit are
# ignored at transform time instead of raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [None]:
# Learn the category set of each categorical column from the full dataset.
encoder.fit(raw_df[categorical_cols])

In [None]:
# Categories learned for each categorical column, in column order.
encoder.categories_

In [None]:
# Names of the one-hot output columns generated by the fitted encoder.
encoded_cols = encoder.get_feature_names_out(categorical_cols).tolist()
print(encoded_cols)

In [None]:
# Add the one-hot columns to every split; the original categorical columns
# are left in place.
for split_inputs in (train_inputs, val_inputs, test_inputs):
    split_inputs[encoded_cols] = encoder.transform(split_inputs[categorical_cols])

In [32]:
# Remove the limit on how many columns pandas shows when displaying a frame.
pd.set_option('display.max_columns', None)

In [33]:
# Drop the raw 'Location' column from every split, in place.
for split_inputs in (test_inputs, val_inputs, train_inputs):
    split_inputs.drop(columns=['Location'], inplace=True)

# Training a Logistic Regression Model

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
# Logistic regression classifier using the liblinear solver.
model = LogisticRegression(solver='liblinear')

In [None]:
# Train on the numeric features only.
# NOTE(review): the one-hot columns in encoded_cols are not passed to the
# model here -- confirm this is intentional. If they are added, the predict /
# predict_proba calls must use the same column list.
model.fit(train_inputs[numeric_cols], train_targets)

# Making Predictions and Evaluating the Model

In [37]:
# Predicted class label for every training row (same numeric features used
# for fitting).
train_preds = model.predict(train_inputs[numeric_cols])

In [None]:
# Display the predicted labels.
train_preds

In [None]:
# Display the true labels for comparison with the predictions.
train_targets

In [None]:
# Predicted probability of each class for every training row; columns follow
# the order of model.classes_.
train_probs = model.predict_proba(train_inputs[numeric_cols])
train_probs

In [None]:
# Class labels known to the fitted model.
model.classes_

In [42]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of training rows whose predicted label matches the true label.
accuracy_score(train_targets, train_preds)