In [1]:
import pandas as pd

In [2]:
# Load the Seattle weather observations and preview the first rows.
DATA_PATH = "seattle-weather.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,drizzle
1,1/2/2012,10.9,10.6,2.8,4.5,rain
2,1/3/2012,0.8,11.7,7.2,2.3,rain
3,1/4/2012,20.3,12.2,5.6,4.7,rain
4,1/5/2012,1.3,8.9,2.8,6.1,rain


In [3]:
# Count missing values per column (isna is the canonical name, isnull its alias).
df.isna().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

In [None]:
def LabelEncoding(column, frame=None):
    """Replace a categorical column with integer codes, in place.

    Parameters
    ----------
    column : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    dict
        Mapping from each original label to the integer code that replaced
        it, so codes shown in later outputs can be translated back.

    Example
    -------
    ``LabelEncoding("weather")`` encodes the ``weather`` column of ``df``.
    """
    from sklearn import preprocessing

    target = df if frame is None else frame
    le = preprocessing.LabelEncoder()
    # fit_transform learns the set of labels and converts them in one step.
    target[column] = le.fit_transform(target[column])
    # classes_ is ordered so that position i holds the label encoded as i.
    return {label: code for code, label in enumerate(le.classes_.tolist())}
# Encode the 'weather' column of the notebook-level df in place, then display the frame.
LabelEncoding("weather")
df

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.0,12.8,5.0,4.7,0
1,1/2/2012,10.9,10.6,2.8,4.5,2
2,1/3/2012,0.8,11.7,7.2,2.3,2
3,1/4/2012,20.3,12.2,5.6,4.7,2
4,1/5/2012,1.3,8.9,2.8,6.1,2
...,...,...,...,...,...,...
1456,12/27/2015,8.6,4.4,1.7,2.9,2
1457,12/28/2015,1.5,5.0,1.7,1.3,2
1458,12/29/2015,0.0,7.2,0.6,2.6,1
1459,12/30/2015,0.0,5.6,-1.0,3.4,4


In [5]:
# Numeric weather measurements that are rescaled before modelling.
cols = [
    'precipitation',
    'temp_max',
    'temp_min',
    'wind',
]

In [6]:
def normalize(df, cols):
    """Scale each listed column by its maximum value, in place.

    After scaling, the largest value of a column is 1 and every other value
    is a proportional fraction of it. Columns containing negative values
    keep them negative, so the result is not confined to [0, 1].
    A column whose maximum is 0 or missing is left untouched, because
    dividing by it would fill the column with NaN/inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    cols : iterable of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for x in cols:
        peak = df[x].max()
        if pd.isna(peak) or peak == 0:
            # Nothing sensible to divide by; keep the column as it is.
            continue
        df[x] = df[x] / peak
    return df
normalize(df,cols) # Rescale the listed feature columns of df in place
df   # Display modified DataFrame

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,1/1/2012,0.000000,0.359551,0.273224,0.494737,0
1,1/2/2012,0.194991,0.297753,0.153005,0.473684,2
2,1/3/2012,0.014311,0.328652,0.393443,0.242105,2
3,1/4/2012,0.363148,0.342697,0.306011,0.494737,2
4,1/5/2012,0.023256,0.250000,0.153005,0.642105,2
...,...,...,...,...,...,...
1456,12/27/2015,0.153846,0.123596,0.092896,0.305263,2
1457,12/28/2015,0.026834,0.140449,0.092896,0.136842,2
1458,12/29/2015,0.000000,0.202247,0.032787,0.273684,1
1459,12/30/2015,0.000000,0.157303,-0.054645,0.357895,4


In [7]:
# Remove the raw date string before modelling. errors='ignore' keeps this
# cell safe to re-run after the column has already been removed (a plain
# drop raises KeyError on the second execution).
df = df.drop(columns=['date'], errors='ignore')
df

Unnamed: 0,precipitation,temp_max,temp_min,wind,weather
0,0.000000,0.359551,0.273224,0.494737,0
1,0.194991,0.297753,0.153005,0.473684,2
2,0.014311,0.328652,0.393443,0.242105,2
3,0.363148,0.342697,0.306011,0.494737,2
4,0.023256,0.250000,0.153005,0.642105,2
...,...,...,...,...,...
1456,0.153846,0.123596,0.092896,0.305263,2
1457,0.026834,0.140449,0.092896,0.136842,2
1458,0.000000,0.202247,0.032787,0.273684,1
1459,0.000000,0.157303,-0.054645,0.357895,4


In [8]:
# Separate the model inputs from the label to predict.
#   x: feature matrix - every column except 'weather'
#   y: target vector  - the 'weather' column
target_name = 'weather'
y = df[target_name]
x = df.drop(columns=[target_name])

In [9]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so the split is reproducible.
#   X_train / y_train: training features and labels
#   X_test  / y_test : testing features and labels
# NOTE(review): the support column of the classification reports further down
# shows strongly imbalanced classes; stratify=y would keep class proportions
# equal in both parts - confirm whether that is wanted before changing the split.
from sklearn.model_selection import train_test_split

split_options = {'test_size': 0.2, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

In [10]:
# Import XGBoost Classifier
# XGBoost: eXtreme Gradient Boosting - high performance gradient boosting algorithm
# Advantages:
# - Handles complex non-linear relationships
# - Built-in handling of missing values
# - Regularization to prevent overfitting
# - High prediction accuracy
# - Efficient computation
#from xgboost import XGBClassifier

# Requirements:
# pip install xgboost
# Compatible with scikit-learn API
from xgboost import XGBClassifier

In [11]:
#!pip install xgboost

In [None]:
# Build an XGBoost classifier without passing any hyperparameters, so the
# library's own defaults apply. XGBoost documents these as learning_rate
# (eta) = 0.3 and max_depth = 6, with n_estimators = 100 in the scikit-learn
# wrapper - verify against the installed version.
xg = XGBClassifier()

# Fit on the training split: X_train holds the features, y_train the labels.
xg.fit(X_train, y_train)

In [13]:
# Show the hyperparameters held by the estimator as a dict
# (e.g. booster, learning_rate, max_depth, n_estimators).
# Entries shown as None were not set explicitly: the model above was
# constructed with no arguments.
xg.get_params()

{'objective': 'multi:softprob',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [14]:
# Import evaluation metrics from sklearn
# classification_report: Shows precision, recall, f1-score for each class
#                       Provides detailed performance analysis
#
# accuracy_score: Calculates ratio of correct predictions
#                Returns single value between 0 and 1
#                accuracy = correct_predictions / total_samples
#                (the TP + TN form of this formula only applies to binary problems;
#                 this task has five classes)
# from sklearn.metrics import classification_report, accuracy_score

from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Predict the held-out test split with the baseline XGBoost model.
y_hat = xg.predict(X_test)

# Overall fraction of test rows predicted correctly (0 to 1, higher is better).
print(accuracy_score(y_test, y_hat))

# Per-class precision, recall, F1-score and support.
# zero_division=0 reports 0.0 for a class that never receives a prediction
# instead of emitting an UndefinedMetricWarning; the printed numbers are the
# same either way.
print(classification_report(y_test, y_hat, zero_division=0))

0.757679180887372
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.15      0.07      0.10        29
           2       0.95      0.91      0.93       123
           3       1.00      0.33      0.50         6
           4       0.70      0.85      0.77       125

    accuracy                           0.76       293
   macro avg       0.56      0.43      0.46       293
weighted avg       0.73      0.76      0.74       293



In [22]:
# Hyperparameter grid searched by GridSearchCV: 4 x 4 = 16 combinations.
grid = {
    # Step-size shrinkage: lower values train more slowly but more robustly,
    # higher values train faster but may overshoot.
    'learning_rate': [0.1, 1, 0.01, 0.001],
    # Minimum loss reduction required to make a split: higher values give a
    # more conservative model.
    'gamma': [0, 1, 10, 100],
}

In [17]:
# Import GridSearchCV for hyperparameter optimization
# GridSearchCV performs:
# - Exhaustive search over specified parameter values
# - K-fold cross-validation for each parameter combination
# - Parallel processing support
# - Returns best model and performance metrics
#from sklearn.model_selection import GridSearchCV

# Usage:
# - Takes estimator (model), parameter grid, and CV folds
# - Tries all parameter combinations
# - Uses cross-validation to evaluate each set
# - Helps prevent overfitting through validation
from sklearn.model_selection import GridSearchCV

In [18]:
# Exhaustive search over `grid` with 10-fold cross-validation: for every
# candidate the training data is cut into 10 parts, 9 used for fitting and
# 1 for validation, rotating through all parts.
# verbose=2 prints one progress line per individual fit.
# Total fits = number of candidates x number of folds.
model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=grid,
    cv=10,
    verbose=2,
)

In [None]:
# Run the grid search on the training split.
# Every candidate is cross-validated on 10 folds:
# total fits = n_candidates * n_folds = 16 * 10 = 160 fits.
model.fit(X_train, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END .........................gamma=0, learning_rate=0.1; total time=   0.3s
[CV] END ...........................gamma=0, learning_rate=1; total time=   0.3s
[CV] END ...........................gamma=0, l

In [None]:
# Predict the held-out test split with the best model found by GridSearchCV.
grid_predictions = model.predict(X_test)

# Overall fraction of test rows predicted correctly.
print(accuracy_score(y_test, grid_predictions))

# Per-class precision, recall, F1-score and support, plus averages.
# zero_division=0 reports 0.0 for a class that never receives a prediction
# instead of emitting repeated UndefinedMetricWarning messages; the printed
# numbers are the same either way.
print(classification_report(y_test, grid_predictions, zero_division=0))

0.8088737201365188
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.25      0.03      0.06        29
           2       0.97      0.90      0.93       123
           3       1.00      0.33      0.50         6
           4       0.72      0.98      0.83       125

    accuracy                           0.81       293
   macro avg       0.59      0.45      0.46       293
weighted avg       0.76      0.81      0.76       293



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Print the winning estimator selected by GridSearchCV.
# The printed repr lists constructor arguments and is cut off with "...",
# so not every parameter is visible; the tuned values appear as the
# gamma and learning_rate entries.
print(model.best_estimator_)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
