In [None]:
# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set plot styles: apply seaborn's "whitegrid" style globally so that every
# figure created after this cell uses it
sns.set(style="whitegrid")


In [None]:
# Load the Dataset

In [None]:
# Load the raw dataset from CSV into a DataFrame
file_path = 'benin-malanville.csv'  # Update this if the file is in another directory
df = pd.read_csv(file_path)

# Preview the first rows to sanity-check the parse
print("First five rows of the dataset:")
display(df.head())

# Column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would append a stray "None").
print("\nDataset Information:")
df.info()


In [None]:
# Generate Data Summary

In [None]:
# Descriptive statistics of the dataset, kept in `summary_stats` for reuse
summary_stats = df.describe()
print("Summary Statistics:")
display(summary_stats)


In [None]:
# Save Statistics

In [None]:
# Save statistics
# Write the describe() table to CSV; index=True keeps the table's row labels
# (the statistic names) as the first column of the file.
summary_stats.to_csv('summary_statistics.csv', index=True)
print("Summary statistics saved to 'summary_statistics.csv'")


In [None]:
# Histograms of the key measurement columns, one figure per column.
# `columns_to_plot` is reused by later plotting cells.
columns_to_plot = ['GHI', 'DNI', 'DHI', 'Tamb']
for col in columns_to_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    df[col].hist(bins=30, color='skyblue', edgecolor='black', ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(False)  # Series.hist turns the grid on by default; switch it off
    plt.show()


In [None]:
# Check for Data Quality

In [None]:
# Count the null entries in every column
missing_values = df.isna().sum()
print("Missing Values:")
display(missing_values)


In [None]:
# Visualize Missing Values

In [None]:
# Visualize missing values: bar chart of the number of nulls in each column
missing_values = df.isnull().sum()

fig, ax = plt.subplots(figsize=(10, 5))
missing_values.plot(kind='bar', color='salmon', edgecolor='black', ax=ax)
ax.set_title('Missing Values per Column')
ax.set_xlabel('Column')
ax.set_ylabel('Number of missing values')
fig.tight_layout()  # keep the rotated column labels inside the figure
plt.show()


In [None]:
# Handle Missing Data

In [None]:
# Example: fill missing values in numeric columns with that column's mean.
# numeric_only=True restricts the means to numeric columns; without it,
# pandas >= 2.0 raises a TypeError as soon as the frame holds a non-numeric
# column (e.g. the still-unparsed 'Timestamp' strings).
# Non-numeric columns, and columns whose mean is NaN, are left unchanged.
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)


In [None]:
# Detect Outliers

In [None]:
# One boxplot per key column (values along the x-axis) to expose outliers
for col in columns_to_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=df, x=col, color='lightcoral', ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()


In [None]:
# Time Series Analysis

In [None]:
# Parse the 'Timestamp' values into datetimes and store them back in the column
parsed_timestamps = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = parsed_timestamps


In [None]:
# Use 'Timestamp' as the row index (the column is moved out of the frame's columns)
df = df.set_index('Timestamp')


In [None]:
# Line plot of GHI, DNI and Tamb against the frame's index
time_series_columns = ['GHI', 'DNI', 'Tamb']
ax = df[time_series_columns].plot(
    figsize=(15, 7),
    title="Time Series of Solar Irradiance and Temperature",
)
ax.set_ylabel("Values")
plt.show()


In [None]:
# Correlation Analysis

In [None]:
# Compute the pairwise correlation matrix (DataFrame.corr defaults to Pearson).
# Only numeric/boolean columns are passed in: on pandas >= 2.0, DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them, so any
# text column left in the frame would otherwise abort this cell.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()
print("Correlation Matrix:")
display(correlation_matrix)


In [None]:
# Visualize Correlations

In [None]:
# Annotated heatmap of the correlation matrix (cell values shown to 2 decimals)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()


In [None]:
# Data Cleaning

In [None]:
# Keep only the rows where both GHI and DNI are non-negative
non_negative_mask = df['GHI'].ge(0) & df['DNI'].ge(0)
df = df.loc[non_negative_mask]
print(f"Dataset cleaned. Remaining rows: {len(df)}")


In [None]:
# Re-draw the histograms of the key columns on the cleaned data
for col in columns_to_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    df[col].hist(bins=30, color='limegreen', edgecolor='black', ax=ax)
    ax.set_title(f'Distribution of {col} After Cleaning')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(False)  # Series.hist turns the grid on by default; switch it off
    plt.show()


In [None]:
# Save the Cleaned Data

In [None]:
# Write the cleaned DataFrame to a new CSV file; index=True also writes the
# frame's index as the first column of the file.
cleaned_file_path = 'benin-malanville-cleaned.csv'
df.to_csv(cleaned_file_path, index=True)
print(f"Cleaned dataset saved to {cleaned_file_path}")
