Summary Statistics

In [None]:
# Importing necessary libraries
import pandas as pd

# Load the raw measurements for each site into its own DataFrame.
# The paths are relative, so the CSV files must be in the notebook's working directory.
df_benin = pd.read_csv('benin-malanville.csv')
df_sierraleone = pd.read_csv('sierraleone-bumbuna.csv')
df_togo = pd.read_csv('togo-dapaong_qc.csv')



In [None]:
# Summary statistics for the Benin (Malanville) dataset
df_benin.describe()

In [None]:
# Summary statistics for the Sierra Leone (Bumbuna) dataset
df_sierraleone.describe()

In [None]:
# Summary statistics for the Togo (Dapaong) dataset
df_togo.describe()

Data Quality Check

In [None]:
def quality_check(df):
  """Print three data-quality reports for ``df``.

  The reports, in order, are:

  1. the number of nulls in every column,
  2. the number of negative readings in each monitored column,
  3. the number of values in each monitored column that fall outside the
     ``Q1 - 1.5 * IQR`` / ``Q3 + 1.5 * IQR`` fences.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns GHI, DNI, DHI, ModA, ModB, WS and WSgust.

  Returns
  -------
  None
      The results are only printed.
  """
  monitored = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

  # Report 1: nulls per column (all columns, not only the monitored ones)
  print("Missing Values:")
  print(df.isna().sum())

  # Report 2: readings below zero in columns that should never be negative
  negative_counts = {}
  for name in monitored:
    # int() keeps the printed dict free of numpy scalar reprs
    negative_counts[name] = int((df[name] < 0).sum())
  print("Negative Values Count:")
  print(negative_counts)

  # Report 3: IQR-based outliers, counted column by column
  readings = df[monitored]
  lower_quartile = readings.quantile(0.25)
  upper_quartile = readings.quantile(0.75)
  spread = upper_quartile - lower_quartile
  too_low = readings < (lower_quartile - 1.5 * spread)
  too_high = readings > (upper_quartile + 1.5 * spread)
  print("Outliers Count:")
  print((too_low | too_high).sum())


In [None]:
quality_check(df_benin)

In [None]:
quality_check(df_sierraleone)

In [None]:
quality_check(df_togo)

Time Series Analysis

In [None]:
import matplotlib.pyplot as plt

def time_analysis(df):
  """Plot GHI, DNI, DHI and Tamb against time, one stacked panel per variable.

  Note that the 'Timestamp' column of ``df`` is converted to datetime in
  place, so the caller's DataFrame is modified by this call.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'Timestamp', 'GHI', 'DNI', 'DHI' and 'Tamb' columns.

  Returns
  -------
  None
      The figure is shown with ``plt.show()``.
  """
  # Parse the timestamps once so matplotlib gets a proper date axis
  df['Timestamp'] = pd.to_datetime(df['Timestamp'])

  series_names = ('GHI', 'DNI', 'DHI', 'Tamb')
  fig, axes = plt.subplots(len(series_names), 1, figsize=(12, 16))

  # One panel per variable, top to bottom in the order listed above
  for panel, name in zip(axes, series_names):
    panel.plot(df['Timestamp'], df[name])
    panel.set_title(f'{name} Over Time')
    panel.set_xlabel('Time')
    panel.set_ylabel(name)

  plt.tight_layout()
  plt.show()


In [None]:
time_analysis(df_benin)

In [None]:
time_analysis(df_sierraleone)

In [None]:
time_analysis(df_togo)

Evaluating the impact of cleaning

In [None]:
import seaborn as sns

def cleaning_impact(df):
  """Draw side-by-side boxplots of ModA and ModB grouped by the 'Cleaning' column.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'Cleaning', 'ModA' and 'ModB' columns.

  Returns
  -------
  None
      The figure is shown with ``plt.show()``.
  """
  panels = (
      ('ModA', 'Impact of Cleaning on ModA'),
      ('ModB', 'Impact of Cleaning on ModB'),
  )
  fig, axes = plt.subplots(1, len(panels), figsize=(12, 6))

  # Left panel: ModA, right panel: ModB; each split by the Cleaning value
  for axis, (sensor, heading) in zip(axes, panels):
    sns.boxplot(x='Cleaning', y=sensor, data=df, ax=axis)
    axis.set_title(heading)

  plt.show()


In [None]:
cleaning_impact(df_benin)

In [None]:
cleaning_impact(df_sierraleone)

In [None]:
cleaning_impact(df_togo)

Correlation Analysis

In [None]:
def corr_analysis(df):
  """Show a correlation heatmap and a pairwise scatter matrix.

  Both figures use the same seven columns: GHI, DNI, DHI, TModA, TModB, WS
  and WSgust.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the seven columns listed above.

  Returns
  -------
  None
      Each figure is shown with ``plt.show()``.
  """
  selected = df[['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'WS', 'WSgust']]

  # Figure 1: annotated heatmap of the pairwise correlation coefficients
  sns.heatmap(selected.corr(), annot=True, cmap='coolwarm', fmt='.2f')
  plt.title('Correlation Matrix')
  plt.show()

  # Figure 2: scatter plot of every column pair
  sns.pairplot(selected)
  plt.show()


In [None]:
corr_analysis(df_benin)

In [None]:
corr_analysis(df_sierraleone)

In [None]:
corr_analysis(df_togo)

Temperature Analysis

In [None]:
def temp_analysis(df):
  """Scatter relative humidity against ambient temperature and against GHI.

  Two side-by-side panels are drawn, both with 'RH' on the x-axis: 'Tamb' on
  the left and 'GHI' on the right.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'RH', 'Tamb' and 'GHI' columns.

  Returns
  -------
  None
      The figure is shown with ``plt.show()``.
  """
  panels = (
      ('Tamb', 'Relative Humidity vs. Ambient Temperature'),
      ('GHI', 'Relative Humidity vs. Global Horizontal Irradiance'),
  )
  fig, axes = plt.subplots(1, len(panels), figsize=(12, 6))

  for axis, (y_column, heading) in zip(axes, panels):
    sns.scatterplot(x='RH', y=y_column, data=df, ax=axis)
    axis.set_title(heading)

  plt.tight_layout()
  plt.show()


In [None]:
temp_analysis(df_benin)

In [None]:
temp_analysis(df_sierraleone)

In [None]:
temp_analysis(df_togo)

Histograms

In [None]:
def histogram(df):
  """Plot 30-bin histograms of GHI, DNI, DHI, WS and Tamb in a single figure.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the five columns listed above.

  Returns
  -------
  None
      The figure is shown with ``plt.show()``.
  """
  selected = df[['GHI', 'DNI', 'DHI', 'WS', 'Tamb']]
  selected.hist(figsize=(12, 8), bins=30)
  # y=0.95 sets the vertical position of the figure-level title
  plt.suptitle('Histograms of Selected Variables', y=0.95)
  plt.show()

In [None]:
histogram(df_benin)

In [None]:
histogram(df_sierraleone)

In [None]:
histogram(df_togo)

Z-score Analysis

In [None]:
from scipy.stats import zscore

def z_score(df):
  """Add GHI/DNI z-score columns to ``df`` and print the rows with |z| > 3.

  Side effect: the columns 'GHI_zscore' and 'DNI_zscore' are written into the
  caller's DataFrame.

  Missing readings are left out of the mean/standard deviation
  (``nan_policy='omit'``). With scipy's default policy a single NaN would turn
  the entire z-score column into NaN and the report would silently be empty.
  A missing reading keeps a NaN z-score and is therefore never flagged.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'GHI' and 'DNI' columns.

  Returns
  -------
  None
      The flagged rows are printed.
  """
  # Calculate Z-scores; plain arrays are passed so the result is assigned
  # positionally, row for row
  for col in ('GHI', 'DNI'):
    df[f'{col}_zscore'] = zscore(df[col].to_numpy(), nan_policy='omit')

  # Flag rows where either variable is more than 3 standard deviations out
  outliers = df[(df['GHI_zscore'].abs() > 3) | (df['DNI_zscore'].abs() > 3)]
  print("Outliers based on Z-score:")
  print(outliers)

In [None]:
z_score(df_benin)

In [None]:
z_score(df_sierraleone)

In [None]:
z_score(df_togo)

Bubble charts

In [None]:
def bubble_chart(df):
  """Scatter GHI against Tamb, sizing markers by RH and colouring them by WS.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'GHI', 'Tamb', 'RH' and 'WS' columns.

  Returns
  -------
  None
      The figure is shown with ``plt.show()``.
  """
  plt.figure(figsize=(12, 8))
  plt.scatter(
      df['GHI'],
      df['Tamb'],
      s=df['RH'] / 10,  # marker size: RH scaled down by 10
      c=df['WS'],
      cmap='viridis',
      alpha=0.5,
  )
  plt.colorbar(label='Wind Speed (m/s)')
  plt.title('Bubble Chart: GHI vs Tamb with Bubble Size as RH')
  plt.xlabel('GHI')
  plt.ylabel('Tamb')
  plt.show()

In [None]:
bubble_chart(df_benin)

In [None]:
bubble_chart(df_sierraleone)

In [None]:
bubble_chart(df_togo)

Data cleaning

In [None]:
def cleaning(df):
  """Clean ``df`` in place: fill gaps, drop 'Comments', zero out negatives.

  Steps, in order:

  1. Missing values in numeric columns are replaced by that column's median.
  2. The 'Comments' column is dropped if it is present.
  3. Negative readings in GHI, DNI, DHI, ModA, ModB, WS and WSgust are
     replaced by 0.

  The function is safe to call more than once on the same DataFrame.

  Parameters
  ----------
  df : pandas.DataFrame
      Modified in place. Must contain the seven columns listed in step 3.

  Returns
  -------
  None
      A confirmation message is printed.

  Raises
  ------
  KeyError
      If one of the seven columns listed in step 3 is missing.
  """
  # Columns that should only hold non-negative values. Defined here because
  # the identical list inside quality_check is local to that function.
  non_negative_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

  # numeric_only=True: a median cannot be taken over text columns such as an
  # unparsed Timestamp, and pandas 2.x raises instead of skipping them.
  df.fillna(df.median(numeric_only=True), inplace=True)

  # errors='ignore' keeps the cell re-runnable once 'Comments' is gone
  df.drop(columns=['Comments'], errors='ignore', inplace=True)

  # Vectorised replacement of negatives with 0
  df[non_negative_cols] = df[non_negative_cols].clip(lower=0)

  print("Data cleaned successfully.")


In [None]:
cleaning(df_benin)

In [None]:
cleaning(df_sierraleone)

In [None]:
# Clean the Togo DataFrame (loaded as df_togo at the top of the notebook)
cleaning(df_togo)