In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

# S3 location of the input CSVs: the bucket, then the key prefix inside it.
# The prefix keeps its trailing slash because file names are appended to it directly.
bucket_name, folder_name = 'climatechangeanalysis', 'datafiles/'

In [2]:
# For each dataset (city, state, country, major city): build the S3 URL of its
# CSV from the bucket/prefix constants, then read it into a pandas DataFrame.
path_to_city = f's3://{bucket_name}/{folder_name}Cleaned_GlobalLandTemperaturesByCity.csv'
city_df = pd.read_csv(path_to_city)

path_to_state = f's3://{bucket_name}/{folder_name}Cleaned_GlobalLandTemperaturesByState.csv'
state_df = pd.read_csv(path_to_state)

path_to_country = f's3://{bucket_name}/{folder_name}Cleaned_GlobalLandTemperaturesByCountry.csv'
country_df = pd.read_csv(path_to_country)

path_to_major_city = f's3://{bucket_name}/{folder_name}Cleaned_GlobalLandTemperaturesByMajorCity.csv'
major_city_df = pd.read_csv(path_to_major_city)


severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [3]:
# Define a preprocessing function to extract the year and calculate averages
def preprocess_temperature_data(df, date_col='dt', avg_temp_col='AverageTemperature', avg_uncertainty_col='AverageTemperatureUncertainty', location_col=None):
    """Aggregate temperature records to one row per (location, year).

    The year is extracted from ``date_col`` and the two value columns are
    averaged within each group. The input frame is left untouched: the year
    is used as an external grouping key instead of being written into ``df``.

    Args:
        df: DataFrame holding at least ``date_col``, ``avg_temp_col``,
            ``avg_uncertainty_col`` and, when given, ``location_col``.
        date_col: Column parsed with ``pd.to_datetime`` to obtain the year.
        avg_temp_col: Column averaged per group.
        avg_uncertainty_col: Column averaged per group.
        location_col: Column identifying the location. When ``None`` the
            data is grouped by year only.

    Returns:
        A new DataFrame with columns ``[location_col,] 'Year',
        avg_temp_col, avg_uncertainty_col`` and a fresh integer index.
    """
    # Named Series so the resulting group level (and column) is called 'Year'.
    years = pd.to_datetime(df[date_col]).dt.year.rename('Year')

    # groupby accepts a mix of column labels and Series as keys.
    group_keys = [years] if location_col is None else [location_col, years]

    yearly = df.groupby(group_keys).agg({
        avg_temp_col: 'mean',
        avg_uncertainty_col: 'mean',
    })
    return yearly.reset_index()

# Reduce every raw frame to yearly averages, grouping by the column that
# identifies the location in that particular dataset.
(
    city_yearly_avg,
    state_yearly_avg,
    country_yearly_avg,
    major_city_yearly_avg,
) = [
    preprocess_temperature_data(raw_frame, location_col=key_column)
    for raw_frame, key_column in (
        (city_df, 'City'),
        (state_df, 'State'),
        (country_df, 'Country'),
        (major_city_df, 'City'),
    )
]


In [4]:
# Build one long table from the four yearly-average frames.
#
# Each frame gets a unified 'Location' column (copied from its own
# City/State/Country column) and a 'Location_Type' tag, and its original
# location column is dropped before stacking. Doing this per frame, on a copy:
#   * leaves the *_yearly_avg frames unmodified, so the cell can be re-run safely;
#   * avoids creating NaN-filled City/State/Country columns only to drop them.
# ignore_index=True gives the result a unique 0..n-1 index instead of four
# overlapping ranges of repeated labels.
_yearly_sources = [
    # (frame, column holding the location name, Location_Type tag)
    (city_yearly_avg, 'City', 'City'),
    (state_yearly_avg, 'State', 'State'),
    (country_yearly_avg, 'Country', 'Country'),
    (major_city_yearly_avg, 'City', 'MajorCity'),
]

combined_df = pd.concat(
    [
        frame.assign(Location=frame[source_col], Location_Type=location_type)
             .drop(columns=source_col)
        for frame, source_col, location_type in _yearly_sources
    ],
    ignore_index=True,
)


In [5]:
# Fold rarely seen locations into a single 'Other' label so the encoded
# feature has fewer distinct values.
threshold = 10  # a location with this many rows or fewer is treated as rare
value_counts = combined_df['Location'].value_counts()  # rows per location
to_replace = value_counts[value_counts <= threshold].index  # labels to fold

# Plain boolean array (no index alignment) marking the rows with a rare label.
is_rare = combined_df['Location'].isin(to_replace).to_numpy()
combined_df['Location'] = combined_df['Location'].mask(is_rare, 'Other')


In [6]:
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on the 'Location' labels and store the resulting integer
# codes next to them in 'Location_Encoded'.
location_encoder = LabelEncoder()
location_codes = location_encoder.fit_transform(combined_df['Location'])
combined_df['Location_Encoded'] = location_codes

In [7]:
# Persist the fitted encoder to disk; joblib.dump returns the list of files written.
joblib.dump(value=location_encoder, filename='location_encoder.pkl')


['location_encoder.pkl']

In [8]:
# Features: every column except the target, its uncertainty and the raw string
# columns. The previous version dropped 'Year' (a feature the model is trained
# on) and kept the non-numeric 'Location' / 'Location_Type' columns; it only
# went unnoticed because X is rebuilt before the train/test split.
X = combined_df.drop(columns=['AverageTemperature', 'AverageTemperatureUncertainty', 'Location', 'Location_Type'])

# Target: the yearly average temperature.
y = combined_df['AverageTemperature']


In [9]:
# Feature matrix: drop the target, its uncertainty and the non-numeric
# 'Location' / 'Location_Type' columns.
non_feature_columns = ['AverageTemperature', 'AverageTemperatureUncertainty', 'Location', 'Location_Type']
X = combined_df.drop(columns=non_feature_columns)

# Report how many values are missing per feature column and in the target.
print(X.isna().sum())
print(y.isna().sum())

# The split below assumes the counts above are all zero.
# 80 % of the rows go to training, 20 % to testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)




Year                0
Location_Encoded    0
dtype: int64
0


In [10]:
# Random forest with a reduced number of trees (10) and a fixed seed.
forest_params = {'n_estimators': 10, 'random_state': 42}
model = RandomForestRegressor(**forest_params)

# Fit on the training split.
model.fit(X_train, y_train)


In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out rows.
y_pred = model.predict(X_test)

# MAE, MSE and R^2, each computed from the same (truth, prediction) pair.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# One metric per line, two decimals each.
print(
    f"Mean Absolute Error: {mae:.2f}",
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)


Mean Absolute Error: 0.47
Mean Squared Error: 0.82
R-squared: 0.99


In [12]:
import joblib

# Persist the trained model; joblib.dump returns the list of files written.
joblib.dump(value=model, filename='finalized_model.sav')

# You can load the model later using:
# loaded_model = joblib.load('finalized_model.sav')


['finalized_model.sav']

In [13]:
# Persist the encoder that produced the 'Location_Encoded' values the model was
# trained on. Fitting a second LabelEncoder here (and overwriting the column
# after training) is redundant, and it would silently diverge from the training
# features if 'Location' changed between the two fits.
encoder = location_encoder

# Save the encoder for later use
joblib.dump(encoder, 'location_encoder.joblib')

['location_encoder.joblib']

In [15]:
def predict_temperature(year, location_name, model, encoder, fallback_location=None):
    """Predict the average temperature for a location in a given year.

    Args:
        year: Year to predict for; becomes the first feature.
        location_name: Location label to look up with ``encoder.transform``.
        model: Fitted estimator exposing ``predict``.
        encoder: Fitted label encoder exposing ``transform``; it must raise
            ``ValueError`` for labels it does not know.
        fallback_location: Optional label to use when ``location_name`` is
            unknown to the encoder. The training data folds rare locations
            into ``'Other'``, so passing ``'Other'`` lets those locations be
            predicted too. ``None`` (default) disables the fallback.

    Returns:
        The first element of ``model.predict(...)``, or ``None`` (after
        printing a message) when no usable label could be encoded.
    """
    candidates = [location_name]
    if fallback_location is not None and fallback_location != location_name:
        candidates.append(fallback_location)

    for candidate in candidates:
        try:
            encoded_location = encoder.transform([candidate])
        except ValueError:
            continue
        break
    else:
        # Neither the requested label nor the fallback is known to the encoder.
        print("This location is not in the dataset. Try another one.")
        return None

    if candidate != location_name:
        print(f"'{location_name}' is not in the dataset; using '{candidate}' instead.")

    row = [[year] + list(encoded_location)]

    # A model fitted on a DataFrame remembers its column names; passing a bare
    # list then triggers scikit-learn's "X does not have valid feature names"
    # warning. Rebuild a one-row frame with those names when they are available.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is not None and len(feature_names) == len(row[0]):
        features = pd.DataFrame(row, columns=list(feature_names))
    else:
        features = row

    prediction = model.predict(features)
    return prediction[0]

# Reload the persisted LabelEncoder and trained model from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts you trust.
encoder = joblib.load('location_encoder.joblib')
model = joblib.load('finalized_model.sav')


In [20]:
# Function to get user input
def get_user_input():
    """Prompt for a year and a location name on standard input.

    The year prompt is repeated until the answer parses as an integer, so a
    typo no longer aborts the interaction with ``ValueError``. Leading and
    trailing whitespace is removed from the location name because the label
    lookup downstream is an exact string match.

    Returns:
        Tuple ``(year, location_name)`` with ``year`` as ``int`` and
        ``location_name`` as a stripped ``str``.
    """
    while True:
        raw_year = input("Enter the year you want to predict: ")
        try:
            year = int(raw_year)
        except ValueError:
            print(f"'{raw_year.strip()}' is not a valid year. Please enter a whole number such as 2045.")
            continue
        break

    location_name = input("Enter the location name: ").strip()
    return year, location_name

# Single interaction: one prompt, one prediction, one printed result.
def main(model, encoder):
    """Ask the user for a year and a location, then print the prediction.

    Args:
        model: Fitted estimator forwarded to ``predict_temperature``.
        encoder: Fitted label encoder forwarded to ``predict_temperature``.
    """
    year, location_name = get_user_input()
    prediction = predict_temperature(year, location_name, model, encoder)

    # predict_temperature signals "no prediction" by returning None.
    if prediction is None:
        print("Prediction could not be made.")
        return

    print(f"The predicted average temperature for {location_name} in {year} is: {prediction:.2f}")

# Run the interaction once with the reloaded model and encoder: this prompts
# for a year and a location via input() and prints the prediction, or a
# failure message when no prediction could be made.
main(model, encoder)


Enter the year you want to predict:  2045
Enter the location name:  Guntur


The predicted average temperature for Guntur in 2045 is: 28.33


