In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import joblib


In [2]:
# Read in csv
df_temps = pd.read_csv('Resources/combined_weather_f.csv')
df_temps.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location
0,2018-01-01,56.12,46.04,69.98,Anaheim
1,2018-01-02,60.8,51.08,78.98,Anaheim
2,2018-01-03,61.34,51.08,73.94,Anaheim
3,2018-01-04,62.06,50.0,77.0,Anaheim
4,2018-01-05,62.24,51.98,75.02,Anaheim


In [3]:
df_temps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8758 entries, 0 to 8757
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      8758 non-null   object 
 1   tavg_f    8758 non-null   float64
 2   tmin_f    8758 non-null   float64
 3   tmax_f    8758 non-null   float64
 4   Location  8758 non-null   object 
dtypes: float64(3), object(2)
memory usage: 342.2+ KB


In [4]:
#df = df[df['col2'] != 'B']
df_temps = df_temps[df_temps['Location'] != "Epcot"]
df_temps = df_temps[df_temps['Location'] != "Hong Kong"]
df_temps = df_temps[df_temps['Location'] != "Paris"]
df_temps

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location
0,2018-01-01,56.12,46.04,69.98,Anaheim
1,2018-01-02,60.80,51.08,78.98,Anaheim
2,2018-01-03,61.34,51.08,73.94,Anaheim
3,2018-01-04,62.06,50.00,77.00,Anaheim
4,2018-01-05,62.24,51.98,75.02,Anaheim
...,...,...,...,...,...
2186,2023-12-27,56.84,48.92,66.02,Anaheim
2187,2023-12-28,59.00,53.06,68.00,Anaheim
2188,2023-12-29,57.02,48.92,66.02,Anaheim
2189,2023-12-30,57.74,53.06,64.94,Anaheim


In [5]:
df_temps = df_temps[['date', 'tavg_f', 'Location']]

In [6]:
df_temps['date'] = pd.to_datetime(df_temps['date'])
df_temps['date'] = df_temps['date'].apply(lambda x: x.toordinal())
df_temps.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temps['date'] = pd.to_datetime(df_temps['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temps['date'] = df_temps['date'].apply(lambda x: x.toordinal())


Unnamed: 0,date,tavg_f,Location
0,736695,56.12,Anaheim
1,736696,60.8,Anaheim
2,736697,61.34,Anaheim
3,736698,62.06,Anaheim
4,736699,62.24,Anaheim


In [7]:
df_temps.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2191 entries, 0 to 2190
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2191 non-null   int64  
 1   tavg_f    2191 non-null   float64
 2   Location  2191 non-null   object 
dtypes: float64(1), int64(1), object(1)
memory usage: 68.5+ KB


In [8]:
# Separate out categories for One Hot Encoding
cats = df_temps.dtypes[df_temps.dtypes == 'object'].index.tolist()
cats

['Location']

In [9]:
# One Hot Encoding for binary classification of locations

enc = OneHotEncoder(sparse_output=False)

encode_df = pd.DataFrame(enc.fit_transform(df_temps[cats]))

encode_df.columns = enc.get_feature_names_out(['Location'])
encode_df

Unnamed: 0,Location_Anaheim
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
2186,1.0
2187,1.0
2188,1.0
2189,1.0


In [10]:
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 1 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Location_Anaheim  2191 non-null   float64
dtypes: float64(1)
memory usage: 17.2 KB


In [11]:
# Concatenate dataframes
merged = pd.merge(df_temps, encode_df, left_index=True, right_index=True)
merged.head()

Unnamed: 0,date,tavg_f,Location,Location_Anaheim
0,736695,56.12,Anaheim,1.0
1,736696,60.8,Anaheim,1.0
2,736697,61.34,Anaheim,1.0
3,736698,62.06,Anaheim,1.0
4,736699,62.24,Anaheim,1.0


In [12]:

merged.columns

Index(['date', 'tavg_f', 'Location', 'Location_Anaheim'], dtype='object')

In [13]:
copy_df = merged.copy()

In [14]:
# Split into Features and Target
X = copy_df[['date',  'Location_Anaheim']]
y = copy_df['tavg_f'].to_numpy()
y = y.flatten()

In [15]:
# Split into Testing and Training Data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
# Scale Data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Preview scaled data
X_train_scaled

array([[ 1.37679078,  0.        ],
       [ 1.70167529,  0.        ],
       [ 0.32994069,  0.        ],
       ...,
       [-0.00593025,  0.        ],
       [-1.35726147,  0.        ],
       [-0.06086241,  0.        ]])

In [18]:
# Create model
rf_model = RandomForestRegressor(n_estimators=500, random_state=78)

In [19]:
# Fit model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [20]:
# Predict from model
predictions = rf_model.predict(X_test_scaled)

In [21]:
# Preview prediction results
predictions

array([59.36288, 71.25188, 48.7886 , 57.23996, 68.73296, 67.99064,
       72.65624, 71.52656, 68.11016, 59.82404, 59.47592, 54.21848,
       54.40388, 62.86676, 63.0356 , 69.11168, 70.17116, 54.3398 ,
       71.08052, 74.11568, 53.31848, 64.634  , 57.09452, 77.522  ,
       57.55424, 68.70308, 69.45584, 60.06344, 63.24152, 66.94844,
       73.48352, 62.84516, 51.95876, 74.71904, 68.95544, 80.7962 ,
       56.24672, 59.09108, 62.024  , 57.70976, 75.461  , 67.71812,
       70.07288, 63.9032 , 77.40788, 57.55748, 60.26576, 61.26764,
       72.85676, 61.44944, 64.67648, 56.91416, 60.71684, 57.63524,
       80.29112, 70.40588, 67.05212, 63.57488, 70.18916, 81.96944,
       59.28944, 59.2322 , 59.98064, 58.51904, 48.30404, 53.72456,
       71.00204, 61.57364, 67.96652, 68.25776, 68.7344 , 60.95876,
       74.7014 , 64.13432, 56.37776, 64.79492, 61.72232, 57.12152,
       71.72744, 69.0008 , 72.29624, 76.1504 , 72.03776, 63.32108,
       51.77912, 65.98796, 67.96652, 75.29936, 58.25084, 86.74

In [22]:
# Preview predictions and actual results
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,59.36288,59.00
1,71.25188,77.00
2,48.78860,47.84
3,57.23996,51.80
4,68.73296,69.80
...,...,...
543,65.20172,59.00
544,80.77784,82.22
545,52.04084,51.62
546,70.86956,66.92


In [23]:
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')

Mean Squared Error: 5.382658806648126
R-squared: 0.9185615075889408


In [25]:
# User input
location_input = input("Enter the location (Anaheim): ")
date_input = input("Enter the date (YYYY-MM-DD): ")
# tmin_input = float(input("Enter the expected minimum temperature (°F)"))
# tmax_input = float(input("Enter the expected maximum temperature (°F)"))

In [26]:
# View user input
print(location_input, date_input)

Anaheim 2025-01-31


In [28]:
# Use input with model to predict temperature
# Code developed using Xpert Learning Assistant, 2025
locations = ['Anaheim']
binary_features = [1 if loc == location_input else 0 for loc in locations]
date_num = pd.to_datetime(date_input).date()
date_ord = date_num.toordinal()
input_features = np.array([[date_ord] + binary_features])
input_features

array([[739282,      1]])

In [29]:
# Predict using User Input
predicted_temperature = rf_model.predict(input_features)
print(f"The predicted average temperature in {location_input} on {date_input} is: {round(predicted_temperature[0], 2)} °F")

The predicted average temperature in Anaheim on 2025-01-31 is: 56.52 °F


In [31]:
# This code was created by OpenAI's Chat GPT (January 2025)
# Save the model
joblib.dump(rf_model, 'weather_regressor_1.pkl', protocol=2)

['weather_regressor_1.pkl']