In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime as dt


In [18]:
# Load the combined weather data from CSV and preview the first five rows
weather_csv_path = 'combined_weather_f_simple.csv'
df_temps = pd.read_csv(weather_csv_path)
df_temps.head(5)

Unnamed: 0,date,tavg_f,Location
0,736695,56.12,Anaheim
1,736696,60.8,Anaheim
2,736697,61.34,Anaheim
3,736698,62.06,Anaheim
4,736699,62.24,Anaheim


In [19]:
# Summarize the loaded frame: column names, non-null counts, dtypes, memory use
df_temps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8245 entries, 0 to 8244
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      8245 non-null   int64  
 1   tavg_f    8245 non-null   float64
 2   Location  8245 non-null   object 
dtypes: float64(1), int64(1), object(1)
memory usage: 193.4+ KB


In [20]:
# Collect the names of the object-dtype (categorical text) columns; these are
# the ones that get one-hot encoded
cats = [column for column, dtype in df_temps.dtypes.items() if dtype == 'object']
cats

['Location']

In [21]:
# One-hot encode the categorical column(s): each distinct location becomes its
# own 0/1 indicator column (this is feature encoding, not classification).
enc = OneHotEncoder(sparse_output=False)

# Fit first, then ask the fitted encoder for its output column names.
encoded_values = enc.fit_transform(df_temps[cats])

# Name the columns from the same `cats` list that was used for fitting (rather
# than a second hard-coded ['Location']) so the two cannot drift apart, and
# carry over df_temps' index so the index-based merge that follows lines up
# row-for-row even if df_temps is ever filtered or re-indexed.
encode_df = pd.DataFrame(
    encoded_values,
    columns=enc.get_feature_names_out(cats),
    index=df_temps.index,
)
encode_df

Unnamed: 0,Location_Anaheim,Location_Epcot,Location_Hong Kong,Location_Paris
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
8240,0.0,0.0,1.0,0.0
8241,0.0,0.0,1.0,0.0
8242,0.0,0.0,1.0,0.0
8243,0.0,0.0,1.0,0.0


In [22]:
# Check the encoded frame: one float indicator column per location, no nulls expected
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8245 entries, 0 to 8244
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location_Anaheim    8245 non-null   float64
 1   Location_Epcot      8245 non-null   float64
 2   Location_Hong Kong  8245 non-null   float64
 3   Location_Paris      8245 non-null   float64
dtypes: float64(4)
memory usage: 257.8 KB


In [23]:
# Attach the one-hot indicator columns to the original rows by matching on the
# row index (inner join on index from both sides)
merged = df_temps.merge(encode_df, how='inner', left_index=True, right_index=True)
merged.head()

Unnamed: 0,date,tavg_f,Location,Location_Anaheim,Location_Epcot,Location_Hong Kong,Location_Paris
0,736695,56.12,Anaheim,1.0,0.0,0.0,0.0
1,736696,60.8,Anaheim,1.0,0.0,0.0,0.0
2,736697,61.34,Anaheim,1.0,0.0,0.0,0.0
3,736698,62.06,Anaheim,1.0,0.0,0.0,0.0
4,736699,62.24,Anaheim,1.0,0.0,0.0,0.0


In [24]:

# List the merged column names; the feature/target selection below picks from these
merged.columns

Index(['date', 'tavg_f', 'Location', 'Location_Anaheim', 'Location_Epcot',
       'Location_Hong Kong', 'Location_Paris'],
      dtype='object')

In [25]:
# Features: the integer date plus the four location indicator columns.
# Target: the average temperature column (tavg_f).
feature_columns = [
    'date',
    'Location_Anaheim',
    'Location_Epcot',
    'Location_Hong Kong',
    'Location_Paris',
]
X = merged.loc[:, feature_columns]
y = df_temps.loc[:, 'tavg_f']

In [26]:
# Hold out 25% of the rows for testing (scikit-learn's default when no size is
# given); the fixed random_state makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [31]:
# Expand the features into all polynomial / interaction terms up to degree 3.
# The transformer is fitted on the training split ONLY and then applied as-is
# to the test split: calling fit_transform on the test data would re-fit the
# preprocessing on held-out rows, which is the pattern that causes data leakage
# with any stateful transformer.
# NOTE(review): the raw `date` values are large integers (~7e5), so their cubes
# are ~1e17 — consider scaling/centering the date before expansion to keep the
# regression numerically well-conditioned.
poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [32]:
# Ordinary least-squares regression on the polynomial-expanded training features.
# fit() returns the estimator itself, so construction and fitting chain together.
model = LinearRegression().fit(X_train_poly, y_train)
model

In [33]:
# Predict on the held-out rows, then show each prediction beside the true value
# (predictions are positional, so they take y_test's index for the side-by-side view)
predictions = model.predict(X_test_poly)
pd.DataFrame({
    "Prediction": pd.Series(predictions, index=y_test.index),
    "Actual": y_test,
})

Unnamed: 0,Prediction,Actual
6475,55.485929,65.66
4774,55.177411,46.04
3248,73.941652,61.70
2106,65.522251,72.14
7505,77.335019,65.84
...,...,...
1316,65.946166,75.20
799,65.670717,62.42
1344,65.948607,74.84
6566,76.689227,79.70


In [34]:
# Score the test-set predictions: mean squared error and coefficient of
# determination (R-squared)
mse, r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')

Mean Squared Error: 96.61972988041158
R-squared: 0.4529517524041037


In [35]:
# User input
# Strip surrounding whitespace: the prediction cell matches the location by
# exact string equality, so "Epcot " would otherwise match nothing and silently
# produce an all-zero location vector.
# NOTE: input() blocks "Restart Kernel -> Run All"; for a reproducible run,
# replace these prompts with constants.
location_input = input("Enter the location (Anaheim, Epcot, Hong Kong, Paris): ").strip()
date_input = input("Enter the date (YYYY-MM-DD): ").strip()

In [38]:
# Echo the captured inputs as a quick sanity check
print(f"{location_input} {date_input}")

Epcot 2025-02-04


In [37]:
# Use input with model to predict temperature
# Code developed using Xpert Learning Assistant, 2025

# Must be listed in the same order as the Location_* columns used for training.
locations = ['Anaheim', 'Epcot', 'Hong Kong', 'Paris']

# Reject unknown locations up front. Without this check a typo yields an
# all-zero indicator vector (a combination the model never saw in training)
# and a meaningless prediction is printed with no warning.
location_clean = location_input.strip()
if location_clean not in locations:
    raise ValueError(
        f"Unknown location {location_input!r}; expected one of: {', '.join(locations)}"
    )

binary_features = [1 if loc == location_clean else 0 for loc in locations]

# The model's `date` feature is an integer; convert the calendar date to its
# ordinal day number (pd.to_datetime raises on an unparseable date string).
date_num = pd.to_datetime(date_input).date()
date_ord = date_num.toordinal()
input_features = np.array([[date_ord] + binary_features])

# Pass a DataFrame carrying the training column names instead of a bare array:
# the transformer was fitted on a DataFrame, so scikit-learn can then verify
# that the columns are present and in the expected order (and does not emit a
# "does not have valid feature names" warning).
feature_names = ['date'] + [f'Location_{loc}' for loc in locations]
input_frame = pd.DataFrame(input_features, columns=feature_names)

# Reuse the already-fitted transformer and model; never re-fit at prediction time.
# NOTE(review): dates outside the range covered by the training data are
# extrapolated by a degree-3 polynomial — treat such predictions with caution.
input_poly = poly.transform(input_frame)
predicted_temperature = model.predict(input_poly)
print(f"The predicted average temperature in {location_clean} on {date_input} is: {round(predicted_temperature[0], 2)} °F")
    
    

The predicted average temperature in Epcot on 2025-02-04 is: 72.86 °F


