In [12]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

from sklearn.metrics import mean_squared_error
# Load the dataset
# NOTE(review): this is a hard-coded absolute path (looks like a Colab runtime
# location - confirm); the notebook only runs where the CSV exists at this path.
file_path = "/content/climate_change_data.csv"
df = pd.read_csv(file_path)

# Display basic information and first few rows.
# `df.info()` prints its report and returns None, so combining it with
# `df.head()` in a tuple echoes `(None, <plain-text frame>)`. Calling them as
# separate statements prints the schema summary and lets the cell end on the
# preview frame, which the notebook renders as a table.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            10000 non-null  object 
 1   Location        10000 non-null  object 
 2   Country         10000 non-null  object 
 3   Temperature     10000 non-null  float64
 4   CO2 Emissions   10000 non-null  float64
 5   Sea Level Rise  10000 non-null  float64
 6   Precipitation   10000 non-null  float64
 7   Humidity        10000 non-null  float64
 8   Wind Speed      10000 non-null  float64
dtypes: float64(6), object(3)
memory usage: 703.3+ KB


(None,
                             Date          Location        Country  \
 0  2000-01-01 00:00:00.000000000   New Williamtown         Latvia   
 1  2000-01-01 20:09:43.258325832      North Rachel   South Africa   
 2  2000-01-02 16:19:26.516651665  West Williamland  French Guiana   
 3  2000-01-03 12:29:09.774977497       South David        Vietnam   
 4  2000-01-04 08:38:53.033303330    New Scottburgh        Moldova   
 
    Temperature  CO2 Emissions  Sea Level Rise  Precipitation   Humidity  \
 0    10.688986     403.118903        0.717506      13.835237  23.631256   
 1    13.814430     396.663499        1.205715      40.974084  43.982946   
 2    27.323718     451.553155       -0.160783      42.697931  96.652600   
 3    12.309581     422.404983       -0.475931       5.193341  47.467938   
 4    13.210885     410.472999        1.135757      78.695280  61.789672   
 
    Wind Speed  
 0   18.492026  
 1   34.249300  
 2   34.124261  
 3    8.554563  
 4    8.001164  )

In [13]:
# Preview the first five rows as the cell's displayed value.
df.head(n=5)

Unnamed: 0,Date,Location,Country,Temperature,CO2 Emissions,Sea Level Rise,Precipitation,Humidity,Wind Speed
0,2000-01-01 00:00:00.000000000,New Williamtown,Latvia,10.688986,403.118903,0.717506,13.835237,23.631256,18.492026
1,2000-01-01 20:09:43.258325832,North Rachel,South Africa,13.81443,396.663499,1.205715,40.974084,43.982946,34.2493
2,2000-01-02 16:19:26.516651665,West Williamland,French Guiana,27.323718,451.553155,-0.160783,42.697931,96.6526,34.124261
3,2000-01-03 12:29:09.774977497,South David,Vietnam,12.309581,422.404983,-0.475931,5.193341,47.467938,8.554563
4,2000-01-04 08:38:53.033303330,New Scottburgh,Moldova,13.210885,410.472999,1.135757,78.69528,61.789672,8.001164


In [14]:
# Per-column weights used to synthesize the "Impact Score" target.
weights = {
    "CO2 Emissions": 0.3,
    "Sea Level Rise": 0.2,
    "Temperature": 0.2,
    "Precipitation": 0.1,
    "Humidity": 0.1,
    "Wind Speed": 0.1,
}

# Derive the target only when the frame does not already carry one, so that
# re-running this cell leaves an existing "Impact Score" column untouched.
# The terms are accumulated in the dictionary's insertion order.
if "Impact Score" not in df.columns:
    df["Impact Score"] = sum(
        df[column] * weight for column, weight in weights.items()
    )

# NOTE(review): the target is an exact weighted sum of the very columns used
# as model inputs below, so the regressors are fitted to reproduce a known
# linear formula; read the resulting scores with that in mind.
features = [
    "Temperature",
    "CO2 Emissions",
    "Sea Level Rise",
    "Precipitation",
    "Humidity",
    "Wind Speed",
]
X = df[features]
y = df["Impact Score"]


# Split dataset

In [15]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# XGBoost model

In [16]:
# XGBoost regressor: 50 boosting rounds, learning rate 0.1, squared-error
# objective, fixed seed for reproducibility.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=50,
    learning_rate=0.1,
    random_state=42,
)

# Fit on the training split only. The held-out test split is deliberately NOT
# passed as `eval_set`: no early stopping is configured, so it would only
# produce monitoring metrics, and keeping test data out of `fit` prevents it
# from leaking into training if early stopping is added later. Use a separate
# validation split if per-round monitoring is needed.
model.fit(X_train, y_train)

In [17]:
# Predict the target for the held-out rows with the fitted XGBoost model.
y_pred = model.predict(X_test)

In [18]:
# Hold-out error metrics for the XGBoost predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # RMSE is reported in the same units as the target

print(f"mae: {mae}")
print(f"RMSE: {rmse}")


mae: 0.6491465153728359
RMSE: 0.9218568271289393


In [19]:
# Coefficient of determination of the XGBoost predictions on the hold-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² Score: {r2}")

R² Score: 0.9963087334553506


# Random Forest model

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
# Comparison model: a 100-tree random forest with a fixed seed, fitted on the
# same training split and scored on the same hold-out rows.
# NOTE(review): this cell's recorded execution count (11) is lower than the
# cells above it, and it reads X_train / y_train / X_test from the split cell;
# re-run the notebook top to bottom to confirm it works on a fresh kernel.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
y_pred_rf = rf_model.predict(X_test)


In [20]:
# Coefficient of determination of the random-forest predictions on the hold-out split.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
print(f"R² Score: {r2_rf}")

R² Score: 0.9948041534968234


In [21]:
# Hold-out error metrics for the random-forest predictions.
# The `_rf` suffix (matching `r2_rf` and `y_pred_rf`) stops this cell from
# overwriting the XGBoost `mae`, `mse` and `rmse` values, so both models'
# metrics stay available regardless of which cell ran last.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Mean Absolute Error: {mae_rf}")
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
print(f"RMSE: {rmse_rf}")

Mean Absolute Error: 0.845857870743435
RMSE: 1.093714398070815


In [22]:

# Put the random-forest predictions next to the true hold-out targets and
# write the two-column table to CSV without the index.
submission_file = "submission1.csv"
submission_df = pd.DataFrame(
    {
        "Actual Impact Score": y_test.values,
        "Predicted Impact Score": y_pred_rf,
    }
)
submission_df.to_csv(submission_file, index=False)

print(f"Submission file saved as {submission_file}")

Submission file saved as submission1.csv


In [23]:
import pickle

# Serialize the fitted XGBoost regressor (`model`) to disk with pickle.
# NOTE(review): pickle files should only be loaded from trusted sources.
with open("xgboost_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)


In [24]:
import pandas as pd

# Put the XGBoost predictions next to the true hold-out targets and write the
# two-column table to CSV without the index.
submission_file = "submission.csv"
submission_df = pd.DataFrame(
    data={
        "Actual Impact Score": y_test.values,
        "Predicted Impact Score": y_pred,
    }
)
submission_df.to_csv(submission_file, index=False)

print(f"Submission file saved as {submission_file}")


Submission file saved as submission.csv
