In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import joblib
from datetime import date
import plotly.graph_objects as go


In [33]:
# Read in csv
df_temps = pd.read_csv('Resources/combined_weather_f.csv')
df_temps.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location
0,2018-01-01,56.12,46.04,69.98,Anaheim
1,2018-01-02,60.8,51.08,78.98,Anaheim
2,2018-01-03,61.34,51.08,73.94,Anaheim
3,2018-01-04,62.06,50.0,77.0,Anaheim
4,2018-01-05,62.24,51.98,75.02,Anaheim


In [34]:
# Create Plot for t_avg and date
fig = px.scatter(x=df_temps['date'], y= df_temps['tavg_f'])
fig.update_layout(title='Temperature in Anaheim, CA')
fig.update_yaxes(title='Temperature')
fig.update_xaxes(title='Date')

fig.show()

In [4]:
df_temps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8758 entries, 0 to 8757
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      8758 non-null   object 
 1   tavg_f    8758 non-null   float64
 2   tmin_f    8758 non-null   float64
 3   tmax_f    8758 non-null   float64
 4   Location  8758 non-null   object 
dtypes: float64(3), object(2)
memory usage: 342.2+ KB


In [5]:
#df = df[df['col2'] != 'B']
df_temps = df_temps[df_temps['Location'] != "Epcot"]
df_temps = df_temps[df_temps['Location'] != "Hong Kong"]
df_temps = df_temps[df_temps['Location'] != "Paris"]
df_temps

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location
0,2018-01-01,56.12,46.04,69.98,Anaheim
1,2018-01-02,60.80,51.08,78.98,Anaheim
2,2018-01-03,61.34,51.08,73.94,Anaheim
3,2018-01-04,62.06,50.00,77.00,Anaheim
4,2018-01-05,62.24,51.98,75.02,Anaheim
...,...,...,...,...,...
2186,2023-12-27,56.84,48.92,66.02,Anaheim
2187,2023-12-28,59.00,53.06,68.00,Anaheim
2188,2023-12-29,57.02,48.92,66.02,Anaheim
2189,2023-12-30,57.74,53.06,64.94,Anaheim


In [6]:
df_temps['date'] = pd.to_datetime(df_temps['date'])
df_temps['date'] = df_temps['date'].apply(lambda x: x.toordinal())
df_temps.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location
0,736695,56.12,46.04,69.98,Anaheim
1,736696,60.8,51.08,78.98,Anaheim
2,736697,61.34,51.08,73.94,Anaheim
3,736698,62.06,50.0,77.0,Anaheim
4,736699,62.24,51.98,75.02,Anaheim


In [7]:
df_temps.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2191 entries, 0 to 2190
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2191 non-null   int64  
 1   tavg_f    2191 non-null   float64
 2   tmin_f    2191 non-null   float64
 3   tmax_f    2191 non-null   float64
 4   Location  2191 non-null   object 
dtypes: float64(3), int64(1), object(1)
memory usage: 102.7+ KB


In [8]:
# Separate out categories for One Hot Encoding
cats = df_temps.dtypes[df_temps.dtypes == 'object'].index.tolist()
cats

['Location']

In [9]:
# One Hot Encoding for binary classification of locations

enc = OneHotEncoder(sparse_output=False)

encode_df = pd.DataFrame(enc.fit_transform(df_temps[cats]))

encode_df.columns = enc.get_feature_names_out(['Location'])
encode_df

Unnamed: 0,Location_Anaheim
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
2186,1.0
2187,1.0
2188,1.0
2189,1.0


In [10]:
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2191 entries, 0 to 2190
Data columns (total 1 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Location_Anaheim  2191 non-null   float64
dtypes: float64(1)
memory usage: 17.2 KB


In [11]:
# Concatenate dataframes
merged = pd.merge(df_temps, encode_df, left_index=True, right_index=True)
merged.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location,Location_Anaheim
0,736695,56.12,46.04,69.98,Anaheim,1.0
1,736696,60.8,51.08,78.98,Anaheim,1.0
2,736697,61.34,51.08,73.94,Anaheim,1.0
3,736698,62.06,50.0,77.0,Anaheim,1.0
4,736699,62.24,51.98,75.02,Anaheim,1.0


In [12]:

merged.columns

Index(['date', 'tavg_f', 'tmin_f', 'tmax_f', 'Location', 'Location_Anaheim'], dtype='object')

In [13]:
copy_df = merged.copy()

In [14]:
# Split into Features and Target
X = copy_df[['date',  'tmin_f', 'tmax_f', 'Location_Anaheim']]
y = copy_df['tavg_f'].to_numpy()
y = y.flatten()

In [15]:
# Split into Testing and Training Data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [40]:
X_train.head()

Unnamed: 0,date,tmin_f,tmax_f,Location_Anaheim
1977,738672,60.08,68.0,1.0
2184,738879,46.94,66.92,1.0
1310,738005,66.02,91.94,1.0
1102,737797,44.96,71.06,1.0
613,737308,71.96,91.94,1.0


In [42]:
X_test.head()

Unnamed: 0,date,tmin_f,tmax_f,Location_Anaheim
353,737048,46.04,75.92,1.0
1631,738326,60.08,96.08,1.0
414,737109,35.96,59.0,1.0
1827,738522,42.98,59.0,1.0
851,737546,64.04,78.08,1.0


In [16]:
# Scale Data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Preview scaled data
X_train_scaled

array([[ 1.37679078,  0.32827817, -0.87874083,  0.        ],
       [ 1.70167529, -1.22399288, -0.99015582,  0.        ],
       [ 0.32994069,  1.02998974,  1.59095818,  0.        ],
       ...,
       [-0.00593025, -1.56421667, -0.47021919,  0.        ],
       [-1.35726147,  1.49779745,  0.6624999 ,  0.        ],
       [-0.06086241, -1.09640896, -0.26595837,  0.        ]])

In [18]:
# Create model
rf_model = RandomForestRegressor(n_estimators=500, random_state=78)

In [19]:
# Fit model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [20]:
# Predict from model
predictions = rf_model.predict(X_test_scaled)

In [21]:
# Preview prediction results
predictions

array([59.35424, 77.27216, 48.61976, 51.16568, 69.58112, 64.62212,
       71.78684, 72.40172, 68.15444, 56.147  , 56.29388, 56.201  ,
       54.07196, 61.80512, 64.71752, 72.30308, 71.74328, 57.33572,
       71.18204, 74.77448, 52.60964, 63.08204, 55.29452, 78.29456,
       58.94132, 69.13508, 69.65888, 63.78728, 60.28772, 67.35704,
       73.21568, 58.96868, 54.1922 , 79.26584, 72.20264, 81.98708,
       57.76808, 58.62704, 56.25644, 57.17912, 75.18164, 64.46336,
       71.32352, 58.31384, 74.81732, 55.45724, 57.5564 , 61.1564 ,
       75.73136, 61.1564 , 64.69268, 54.14216, 66.62372, 59.58428,
       81.64112, 71.36204, 63.8042 , 63.70268, 67.3088 , 81.79412,
       58.31528, 57.66044, 57.16328, 58.63316, 48.29432, 54.76892,
       71.63672, 58.17452, 67.45388, 66.4466 , 69.82088, 61.14488,
       78.34244, 64.04   , 57.46388, 65.939  , 59.36504, 54.74264,
       70.78208, 64.8086 , 75.12584, 77.27504, 78.7334 , 63.90356,
       51.39032, 66.82208, 64.91228, 69.30428, 58.45316, 82.87

In [54]:
# Preview predictions and actual results
predict_df = pd.DataFrame({"Date": X_test['date'], "Prediction": predictions, "Actual": y_test})
predict_df

Unnamed: 0,Date,Prediction,Actual
353,737048,59.35424,59.00
1631,738326,77.27216,77.00
414,737109,48.61976,47.84
1827,738522,51.16568,51.80
851,737546,69.58112,69.80
...,...,...,...
11,736706,60.05156,59.00
210,736905,81.67424,82.22
53,736748,52.13984,51.62
2120,738815,65.53868,66.92


In [59]:
# Code in this cell created by Google Gemini AI Tool (January 2025)

def convert_ordinal_to_datetime(ordinal_date):
    return dt.fromordinal(int(ordinal_date))

# Convert ordinal time to datetime
predict_df['Date'] = predict_df['Date'].apply(convert_ordinal_to_datetime)

predict_df.head()

Unnamed: 0,Date,Prediction,Actual
353,2018-12-20,59.35424,59.0
1631,2022-06-20,77.27216,77.0
414,2019-02-19,48.61976,47.84
1827,2023-01-02,51.16568,51.8
851,2020-05-01,69.58112,69.8


In [65]:
predict_df = predict_df.sort_values(by='Date')
predict_df.head()

Unnamed: 0,Date,Prediction,Actual
3,2018-01-04,61.9466,62.06
6,2018-01-07,63.60764,62.78
8,2018-01-09,58.33076,57.56
11,2018-01-12,60.05156,59.0
17,2018-01-18,61.19492,62.6


In [69]:
# Create Plot for Predicted vs Actual Temperature
# Code generated by Xpert Learning Assistant (January 2025)
fig = go.Figure()
fig.add_trace(go.Scatter(x=predict_df['Date'], y=predict_df['Prediction'], mode='lines', name='Predicted Temp'))
fig.add_trace(go.Scatter(x=predict_df['Date'], y=predict_df['Actual'], mode='lines', name='Actual Temp'))
fig.update_layout(title='Temperature in Anaheim, CA')
fig.update_yaxes(title='Temperature')
fig.update_xaxes(title='Date')
fig.show()
fig.write_image("weather-model.png")
fig.write_html("weather-model.html")

In [23]:
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')

Mean Squared Error: 1.0586475257518286
R-squared: 0.9839828862298604


In [24]:
# User input
location_input = input("Enter the location (Anaheim): ")
date_input = input("Enter the date (YYYY-MM-DD): ")
tmin_input = float(input("Enter the expected minimum temperature (°F)"))
tmax_input = float(input("Enter the expected maximum temperature (°F)"))

In [25]:
# View user input
print(location_input, date_input, tmin_input, tmax_input)

Anaheim 2025-01-31 65.0 83.0


In [26]:
# Use input with model to predict temperature
# Code developed using Xpert Learning Assistant, 2025
locations = ['Anaheim']
binary_features = [1 if loc == location_input else 0 for loc in locations]
date_num = pd.to_datetime(date_input).date()
date_ord = date_num.toordinal()
input_features = np.array([[date_ord] + binary_features + [tmin_input, tmax_input]])
input_features

array([[7.39282e+05, 1.00000e+00, 6.50000e+01, 8.30000e+01]])

In [27]:
# Predict using User Input
predicted_temperature = rf_model.predict(input_features)
print(f"The predicted average temperature in {location_input} on {date_input} is: {round(predicted_temperature[0], 2)} °F")

The predicted average temperature in Anaheim on 2025-01-31 is: 83.13 °F


In [29]:
# This code was created by OpenAI's Chat GPT (January 2025)
# Save the model
joblib.dump(rf_model, 'weather_regressor.pkl', protocol=2)

['weather_regressor.pkl']