In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns
from scipy import stats
import os

In [None]:
# Read the CSV at file_path into a DataFrame
file_path = 'sierraleone-bumbuna.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# describe() builds the summary-statistics table for the loaded frame
summary_statistics = data.describe()

# Print the table underneath a heading
print("Summary Statistics:\n", summary_statistics)

In [None]:
# Data quality check: count the null entries in every column
missing_values = data.isna().sum()

# Report only the columns that contain at least one null
print("Missing Values:\n", missing_values.loc[missing_values.gt(0)])

In [None]:
# For each checked column, collect the rows whose value is below zero
negative_entries = {
    name: data[data[name] < 0]
    for name in ('GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust')
}

# Print the offending rows, skipping columns that have none
for column, entries in negative_entries.items():
    if entries.empty:
        continue
    print(f"\nNegative values in {column}:\n", entries)

In [None]:
# Detecting outliers using IQR method for sensor readings and wind speed data
def detect_outliers_iqr(column):
    """Return the entries of ``column`` that lie outside the 1.5 * IQR fences.

    Parameters
    ----------
    column : array-like that supports boolean-mask indexing
        Numeric values to screen (e.g. a pandas Series). NaN entries are
        ignored when the quartiles are computed.

    Returns
    -------
    The subset of ``column`` strictly below ``Q1 - 1.5 * IQR`` or strictly
    above ``Q3 + 1.5 * IQR``. NaN entries are never returned because every
    comparison against NaN is False.
    """
    # nanpercentile skips NaN; plain np.percentile returns NaN as soon as one
    # value is missing, which turns both fences into NaN and hides every outlier.
    Q1, Q3 = np.nanpercentile(column, [25, 75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return column[(column < lower_bound) | (column > upper_bound)]

# Run the IQR check on the sensor-reading and wind-speed columns
outliers = {
    name: detect_outliers_iqr(data[name])
    for name in ('ModA', 'ModB', 'WS', 'WSgust')
}

# Print the flagged values, skipping columns that have none
for column, outlier_values in outliers.items():
    if outlier_values.empty:
        continue
    print(f"\nOutliers in {column}:\n", outlier_values)
        

In [None]:
# Time Series Analysis
# Convert the Timestamp column to datetime and make it the index.
# The guard keeps this cell re-runnable: once it has run, 'Timestamp' is the
# index rather than a column, so an unguarded data['Timestamp'] lookup would
# raise KeyError on a second execution.
if 'Timestamp' in data.columns:
    data['Timestamp'] = pd.to_datetime(data['Timestamp'])
    data.set_index('Timestamp', inplace=True)

# Columns drawn in both panels, each paired with its color
series_colors = [('GHI', 'orange'), ('DNI', 'blue'), ('DHI', 'green'), ('Tamb', 'red')]

# Plot time series data for relevant variables
plt.figure(figsize=(14, 8))

# Line plots (top panel)
plt.subplot(2, 1, 1)
for series_name, series_color in series_colors:
    plt.plot(data.index, data[series_name], label=series_name, color=series_color)
plt.title('Time Series of GHI, DNI, DHI, and Tamb')
plt.xlabel('Time')
plt.ylabel('Irradiance / Temperature')
plt.legend()
plt.grid()

# Area plots (bottom panel): each series is filled between zero and its value
plt.subplot(2, 1, 2)
for series_name, series_color in series_colors:
    plt.fill_between(data.index, data[series_name], color=series_color, alpha=0.3)
plt.title('Area Plot of GHI, DNI, DHI, and Tamb')
plt.xlabel('Time')
plt.ylabel('Irradiance / Temperature')
plt.grid()

plt.tight_layout()
plt.show()

# Step 5: Evaluate the impact of Cleaning
plt.figure(figsize=(14, 5))

# One seaborn line plot per sensor column, colored by the Cleaning flag;
# each sensor gets its own palette and marker
for sensor, sensor_palette, sensor_marker in (('ModA', 'Set1', 'o'), ('ModB', 'Set2', 'x')):
    sns.lineplot(
        data=data,
        x=data.index,
        y=sensor,
        hue='Cleaning',
        palette=sensor_palette,
        marker=sensor_marker,
        label=sensor,
    )

plt.title('Sensor Readings Over Time with Cleaning Status')
plt.xlabel('Time')
plt.ylabel('Sensor Readings')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Pairwise correlations between the irradiance and TMod columns
correlation_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
correlation_matrix = data.loc[:, correlation_columns].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    square=True,
)
plt.title('Correlation Heatmap')
plt.show()

# Pair plot of the same columns for a visual look at each relationship
sns.pairplot(data.loc[:, correlation_columns])
plt.suptitle('Pair Plot of Solar Radiation and Temperature Measures', y=1.02)
plt.show()

# Investigate Wind Conditions with Scatter Matrix
wind_columns = ['WS', 'WSgust', 'WD', 'GHI', 'DNI', 'DHI']
# No hue is passed: seaborn's pairplot treats the hue column as categorical, so
# a continuous column such as GHI would become one level per distinct value.
# That drops GHI from the plotted variables, switches the diagonals to one KDE
# per level and builds an enormous legend. Without hue, all six columns
# (GHI included) appear in the matrix.
scatter_matrix = sns.pairplot(data[wind_columns], markers='o')
plt.suptitle('Scatter Matrix for Wind Conditions and Solar Irradiance', y=1.02)
plt.show()

In [None]:
#Wind Analysis

wind_speed = data['WS']
wind_direction = data['WD']

#Create Polar Plot for Wind Direction and Speed
plt.figure(figsize=(10, 8))

# Create a polar subplot
ax = plt.subplot(111, polar=True)

# Matplotlib polar axes default to 0° on the right with angles increasing
# counter-clockwise. The axis label below describes the direction as degrees
# from north towards east, so put 0° at the top and make angles run clockwise;
# otherwise every point is drawn at the wrong bearing.
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)

# Convert wind direction from degrees to radians (polar axes expect radians)
wind_direction_rad = np.deg2rad(wind_direction)

# Scatter: angle = direction, radius = speed, color = speed
sc = ax.scatter(wind_direction_rad, wind_speed, c=wind_speed, cmap='viridis', alpha=0.75, edgecolors='k')

# Add color bar
plt.colorbar(sc, label='Wind Speed (m/s)')

# Set plot title and labels
ax.set_title('Wind Speed and Direction Distribution', va='bottom')
ax.set_xlabel('Wind Direction (°N (to east))')
ax.set_ylabel('Wind Speed (m/s)')

# Step 6: Analyze Wind Direction Variability
# Histogram of direction in 36 bins over the 0-360 axis range
plt.figure(figsize=(10, 6))
sns.histplot(wind_direction, bins=36, kde=False, color='blue', edgecolor='k')
plt.title('Wind Direction Distribution')
plt.xlabel('Wind Direction (°N (to east))')
plt.ylabel('Frequency')
plt.xlim(0, 360)
plt.grid()

plt.tight_layout()
plt.show()

In [None]:
# Visualize Relationships: two side-by-side scatter panels against RH
plt.figure(figsize=(14, 6))

# Left panel: RH against the two TMod temperature columns
plt.subplot(1, 2, 1)
for temp_column, temp_color in (('TModA', 'blue'), ('TModB', 'red')):
    sns.scatterplot(data=data, x='RH', y=temp_column, color=temp_color, label=temp_column, alpha=0.7)
plt.title('Relative Humidity vs Temperature')
plt.xlabel('Relative Humidity (%)')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.grid()

# Right panel: RH against GHI and DNI
plt.subplot(1, 2, 2)
for rad_column, rad_color in (('GHI', 'orange'), ('DNI', 'green')):
    sns.scatterplot(data=data, x='RH', y=rad_column, color=rad_color, label=rad_column, alpha=0.7)
plt.title('Relative Humidity vs Solar Radiation')
plt.xlabel('Relative Humidity (%)')
plt.ylabel('Solar Radiation (W/m²)')
plt.legend()
plt.grid()

plt.tight_layout()
plt.show()

# Correlation analysis restricted to RH, the TMod columns, GHI and DNI
correlation_data = data.loc[:, ['RH', 'TModA', 'TModB', 'GHI', 'DNI']]
correlation_matrix = correlation_data.corr()

# Annotated heatmap of that matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    square=True,
)
plt.title('Correlation Matrix for RH, Temperature, and Solar Radiation')
plt.show()

In [None]:
# Histograms (with KDE overlay) for a fixed set of columns
variables = ['GHI', 'DNI', 'DHI', 'WS', 'TModA', 'TModB']  # Add or remove variables as needed

plt.figure(figsize=(15, 10))

# Fill a 2x3 grid, one panel per variable; subplot positions are 1-based
for position, variable in enumerate(variables, 1):
    plt.subplot(2, 3, position)
    sns.histplot(
        data[variable],
        bins=30,
        kde=True,
        color='blue',
        edgecolor='black',
    )
    plt.title(f'Histogram of {variable}')
    plt.xlabel(variable)
    plt.ylabel('Frequency')
    plt.grid()

plt.tight_layout()
plt.show()

In [None]:
# Z-score based outlier flags
# Columns to standardize
variables = ['GHI', 'DNI', 'DHI', 'WS', 'TModA', 'TModB']  # Adjust as necessary

# Standardize each column: subtract its mean, divide by its standard deviation
selected = data[variables]
z_scores = (selected - selected.mean()) / selected.std()

# A value is flagged when its absolute Z-score exceeds the threshold
threshold = 3  # Common threshold for Z-scores
outliers = z_scores.abs().gt(threshold)

# Store one boolean '<column>_outlier' flag column per variable on the frame
for variable in variables:
    data[f'{variable}_outlier'] = outliers[variable]

# Show the values next to their flags
flag_columns = [f'{var}_outlier' for var in variables]
outlier_summary = data[variables + flag_columns]
print(outlier_summary.head())

In [None]:
# Bubble chart: one point per row, sized and colored by a third column
plt.figure(figsize=(12, 8))

# Columns mapped to each visual channel
x = 'GHI'  # X-axis variable
y = 'Tamb'  # Y-axis variable
size = 'RH'  # Bubble size variable (can also use 'BP' for Barometric Pressure)

# The same column drives both marker area (scaled by 2) and marker color
bubble_values = data[size]
scatter = plt.scatter(
    data[x],
    data[y],
    s=bubble_values * 2,  # Scale bubble size (adjust multiplier as needed)
    alpha=0.5,
    c=bubble_values,
    cmap='viridis',
    edgecolors='w',
)

# Title and axis labels are built from the chosen column names
plt.title(f'Bubble Chart: {x} vs {y} with Bubble Size = {size}', fontsize=16)
plt.xlabel(x, fontsize=14)
plt.ylabel(y, fontsize=14)

# Color bar labelled with the size/color column
plt.colorbar(scatter, label=size)

plt.grid()
plt.show()

In [None]:
# Clean the Data

# Drop columns that are entirely null
columns_to_drop = data.columns[data.isnull().all()]
data.drop(columns=columns_to_drop, inplace=True)

# Address anomalies BEFORE imputing, so that (a) the column means used for
# filling are not computed over the invalid values and (b) the Tamb entries
# blanked out here are filled by the imputation step instead of being left as
# NaN in the cleaned frame.
# Adjust the column names as necessary
data.loc[data['GHI'] < 0, 'GHI'] = 0  # Negative GHI is replaced with 0
data.loc[data['DNI'] < 0, 'DNI'] = 0  # Negative DNI is replaced with 0
data.loc[data['Tamb'] < -30, 'Tamb'] = None  # Tamb below -30 is treated as missing

# Fill missing values in numeric columns with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on
# data[column]: that chained in-place form acts on a temporary object, is
# deprecated, and does not modify the frame under pandas Copy-on-Write.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    data[column] = data[column].fillna(data[column].mean())

# Fill missing values in object columns (e.g., Comments) with a default string
object_columns = data.select_dtypes(include=['object']).columns
for column in object_columns:
    data[column] = data[column].fillna('No Comment')

# Verify cleaning
# info() prints its report itself and returns None, so it is not wrapped in print()
data.info()  # Check data types and non-null counts after cleaning
print(data.head())  # Display the first few rows of the cleaned data