In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime as dt


In [89]:
# Read in csv
df_temps = pd.read_csv('Resources/combined_weather_f.csv')
df_temps.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location
0,736695,56.12,46.04,69.98,Anaheim
1,736696,60.8,51.08,78.98,Anaheim
2,736697,61.34,51.08,73.94,Anaheim
3,736698,62.06,50.0,77.0,Anaheim
4,736699,62.24,51.98,75.02,Anaheim


In [90]:
df_temps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8758 entries, 0 to 8757
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      8758 non-null   int64  
 1   tavg_f    8758 non-null   float64
 2   tmin_f    8758 non-null   float64
 3   tmax_f    8758 non-null   float64
 4   Location  8758 non-null   object 
dtypes: float64(3), int64(1), object(1)
memory usage: 342.2+ KB


In [91]:
# Separate out categories for One Hot Encoding
cats = df_temps.dtypes[df_temps.dtypes == 'object'].index.tolist()
cats

['Location']

In [92]:
# One Hot Encoding for binary classification of locations

enc = OneHotEncoder(sparse_output=False)

encode_df = pd.DataFrame(enc.fit_transform(df_temps[cats]))

encode_df.columns = enc.get_feature_names_out(['Location'])
encode_df

Unnamed: 0,Location_Anaheim,Location_Epcot,Location_Hong Kong,Location_Paris
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
8753,0.0,0.0,1.0,0.0
8754,0.0,0.0,1.0,0.0
8755,0.0,0.0,1.0,0.0
8756,0.0,0.0,1.0,0.0


In [93]:
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8758 entries, 0 to 8757
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location_Anaheim    8758 non-null   float64
 1   Location_Epcot      8758 non-null   float64
 2   Location_Hong Kong  8758 non-null   float64
 3   Location_Paris      8758 non-null   float64
dtypes: float64(4)
memory usage: 273.8 KB


In [94]:
# Concatenate dataframes
merged = pd.merge(df_temps, encode_df, left_index=True, right_index=True)
merged.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location,Location_Anaheim,Location_Epcot,Location_Hong Kong,Location_Paris
0,736695,56.12,46.04,69.98,Anaheim,1.0,0.0,0.0,0.0
1,736696,60.8,51.08,78.98,Anaheim,1.0,0.0,0.0,0.0
2,736697,61.34,51.08,73.94,Anaheim,1.0,0.0,0.0,0.0
3,736698,62.06,50.0,77.0,Anaheim,1.0,0.0,0.0,0.0
4,736699,62.24,51.98,75.02,Anaheim,1.0,0.0,0.0,0.0


In [95]:

merged.columns

Index(['date', 'tavg_f', 'tmin_f', 'tmax_f', 'Location', 'Location_Anaheim',
       'Location_Epcot', 'Location_Hong Kong', 'Location_Paris'],
      dtype='object')

In [96]:
# Split into Features and Target
X = merged[['date',  'tmin_f', 'tmax_f', 'Location_Anaheim', 'Location_Epcot',
       'Location_Hong Kong', 'Location_Paris']]
y = df_temps['tavg_f']

In [97]:
# Split into Testing and Training Data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [108]:
X_train

Unnamed: 0,date,tmin_f,tmax_f,Location_Anaheim,Location_Epcot,Location_Hong Kong,Location_Paris
4002,738507,51.08,66.02,0.0,1.0,0.0,0.0
6894,737020,65.30,73.40,0.0,0.0,1.0,0.0
1960,738655,60.98,71.96,1.0,0.0,0.0,0.0
8535,738663,78.44,86.36,0.0,0.0,1.0,0.0
4417,736731,29.48,32.54,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
2895,737400,50.00,77.00,0.0,1.0,0.0,0.0
7813,737941,80.60,93.20,0.0,0.0,1.0,0.0
905,737600,66.02,80.06,1.0,0.0,0.0,0.0
5192,737506,40.10,51.98,0.0,0.0,0.0,1.0


In [98]:
# Create Polynomial Features for use in Regression, Fit model
poly = PolynomialFeatures(degree= 2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.fit_transform(X_test)

In [99]:
# Create Linear Regression, fit model
model = LinearRegression()
model.fit(X_train_poly, y_train)

In [100]:
# Make Predictions
predictions = model.predict(X_test_poly)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
2249,71.888254,71.60
1463,50.456482,50.36
6438,74.983490,74.48
5947,55.885581,56.12
6859,71.767462,72.32
...,...,...
285,65.057015,64.76
314,63.930908,62.24
8314,81.270628,80.96
968,78.843232,79.16


In [101]:
# Measure strength and accuracy of model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.7600846208164022
R-squared: 0.995117497012376


In [106]:
X_test_poly[0]

array([1.00000000e+00, 7.36753000e+05, 6.09800000e+01, 8.40200000e+01,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       5.42804983e+11, 4.49271979e+07, 6.19019871e+07, 0.00000000e+00,
       7.36753000e+05, 0.00000000e+00, 0.00000000e+00, 3.71856040e+03,
       5.12353960e+03, 0.00000000e+00, 6.09800000e+01, 0.00000000e+00,
       0.00000000e+00, 7.05936040e+03, 0.00000000e+00, 8.40200000e+01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [102]:
# User input
location_input = input("Enter the location (Anaheim, Epcot, Hong Kong, Paris): ")
date_input = input("Enter the date (YYYY-MM-DD): ")
tmin_input = float(input("Enter the expected minimum temperature (°F)"))
tmax_input = float(input("Enter the expected maximum temperature (°F)"))

In [103]:
print(location_input, date_input, tmin_input, tmax_input)

Epcot 2025-02-04 68.0 82.0


In [105]:
# Use input with model to predict temperature
# Code developed using Xpert Learning Assistant, 2025
locations = ['Anaheim', 'Epcot', 'Hong Kong', 'Paris']
binary_features = [1 if loc == location_input else 0 for loc in locations]
date_num = pd.to_datetime(date_input).date()
date_ord = date_num.toordinal()
input_features = np.array([[date_ord] + binary_features + [tmin_input, tmax_input]])

input_features

array([[7.39286e+05, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        6.80000e+01, 8.20000e+01]])

In [109]:



input_poly = poly.fit_transform(input_features)
predicted_temperature = model.predict(input_poly)
print(f"The predicted average temperature in {location_input} on {date_input} is: {round(predicted_temperature[0], 2)} °F")

The predicted average temperature in Epcot on 2025-02-04 is: 217958.79 °F
