In [None]:
pip install pandas numpy scikit-learn matplotlib seaborn

In [5]:
# importing all the necessary libraries
import pandas as pd # data manipulation
import numpy as np # numerical python - linear algebra
import matplotlib.pyplot as plt # visualization lib
import seaborn as sns
from sklearn.model_selection import train_test_split # sklearn - ML
from sklearn.preprocessing import StandardScaler # scaling

In [7]:
# Read the four per-site CSV files, one DataFrame per file, in order.
location1, location2, location3, location4 = (
    pd.read_csv(csv_path)
    for csv_path in ('Location1.csv', 'Location2.csv', 'Location3.csv', 'Location4.csv')
)

In [9]:
# Preview the first five rows of the Location 1 frame.
location1.head(n=5)

Unnamed: 0,Time,temperature_2m,relativehumidity_2m,dewpoint_2m,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,Power
0,2017-01-02 00:00:00,28.5,85,24.5,1.44,1.26,146,162,1.4,0.1635
1,2017-01-02 01:00:00,28.4,86,24.7,2.06,3.99,151,158,4.4,0.1424
2,2017-01-02 02:00:00,26.8,91,24.5,1.3,2.78,148,150,3.2,0.1214
3,2017-01-02 03:00:00,27.4,88,24.3,1.3,2.69,58,105,1.6,0.1003
4,2017-01-02 04:00:00,27.3,88,24.1,2.47,4.43,58,84,4.0,0.0793


In [11]:
# Tag each per-site frame (in place) with its site label so that rows stay
# identifiable once the frames are stacked.
for site_label, site_frame in (
    ('Location1', location1),
    ('Location2', location2),
    ('Location3', location3),
    ('Location4', location4),
):
    site_frame['Location'] = site_label

# Stack the four frames vertically and renumber the rows from 0.
merged_data = pd.concat(
    [location1, location2, location3, location4],
    ignore_index=True,
)

merged_data.head()

Unnamed: 0,Time,temperature_2m,relativehumidity_2m,dewpoint_2m,windspeed_10m,windspeed_100m,winddirection_10m,winddirection_100m,windgusts_10m,Power,Location
0,2017-01-02 00:00:00,28.5,85,24.5,1.44,1.26,146,162,1.4,0.1635,Location1
1,2017-01-02 01:00:00,28.4,86,24.7,2.06,3.99,151,158,4.4,0.1424,Location1
2,2017-01-02 02:00:00,26.8,91,24.5,1.3,2.78,148,150,3.2,0.1214,Location1
3,2017-01-02 03:00:00,27.4,88,24.3,1.3,2.69,58,105,1.6,0.1003,Location1
4,2017-01-02 04:00:00,27.3,88,24.1,2.47,4.43,58,84,4.0,0.0793,Location1


In [None]:
# Plot power generation by location over time.
# read_csv is called without date parsing, so 'Time' holds text; convert it for
# this plot so the x axis is a real date axis rather than one categorical tick
# per distinct timestamp string. merged_data itself is left unmodified.
power_over_time = merged_data.assign(Time=pd.to_datetime(merged_data['Time']))

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=power_over_time, x='Time', y='Power', hue='Location', ax=ax)
ax.set_title('Power Generation Across Locations Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Power')
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# Correlation matrix of the numeric columns.
# 'Time' and 'Location' hold strings; DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so restrict to numeric dtypes first. On older
# pandas this yields the same matrix the implicit column dropping produced.
corr_matrix = merged_data.select_dtypes(include='number').corr()

# Show only each column's correlation with Power, strongest positive first.
power_corr = corr_matrix[['Power']].sort_values(by='Power', ascending=False)
plt.figure(figsize=(10,8))
sns.heatmap(power_corr, annot=True, cmap='coolwarm')
plt.title('Correlation with Power Generation')
plt.show()

In [None]:
# Mean Power per site, ordered from highest to lowest.
power_by_location = (
    merged_data
    .groupby('Location')['Power']
    .mean()
    .sort_values(ascending=False)
)
print(power_by_location)

In [None]:
# Plot 100 m wind speed against power output over time for Location3.
# NOTE(review): the original comment called Location3 the "best performer";
# confirm against the printed mean-power ranking.
loc3 = merged_data[merged_data['Location']=='Location3']

# 'Time' is read from CSV as text; convert it once so the x axis is a date axis.
loc3_time = pd.to_datetime(loc3['Time'])

fig, wind_ax = plt.subplots(figsize=(12, 6))
# Give Power its own y-axis: in the sample rows shown earlier Power is around 0.1
# while wind speed is several units, so a shared axis would flatten the Power line.
power_ax = wind_ax.twinx()

wind_line, = wind_ax.plot(loc3_time, loc3['windspeed_100m'],
                          color='tab:blue', label='Windspeed 100m')
power_line, = power_ax.plot(loc3_time, loc3['Power'],
                            color='tab:orange', label='Power Output')

wind_ax.set_title('Location3: Windspeed vs Power Generation')
wind_ax.set_xlabel('Time')
wind_ax.set_ylabel('Windspeed 100m')
power_ax.set_ylabel('Power Output')
# One combined legend, attached to the twin axis because it is drawn last.
power_ax.legend(handles=[wind_line, power_line])
wind_ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Prepare data: every column except the timestamp, the target and the site label
# is used as a feature; Power is the regression target.
X = merged_data.drop(['Time', 'Power', 'Location'], axis=1)
y = merged_data['Power']

# Train-test split: seeded random shuffle with 20% of the rows held out.
# NOTE(review): the sample rows are consecutive hourly readings, so a random split
# puts near-neighbours of test rows in the training set; consider a chronological
# split if the goal is forecasting. Confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model. The forest is seeded so the reported score is reproducible across runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate. RMSE is computed as sqrt(MSE) rather than via `squared=False`, which
# was deprecated in scikit-learn 1.4 and removed in 1.6.
predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Model RMSE: {rmse}")

In [None]:
# Flag Power readings lying more than 1.5 * IQR below the first quartile
# or above the third quartile.
power = merged_data['Power']
Q1, Q3 = power.quantile(0.25), power.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
is_outlier = (power < lower_fence) | (power > upper_fence)

outliers = merged_data[is_outlier]
print(f"Found {len(outliers)} potential outliers in power generation")