In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import plotly.express as px

In [None]:
# Load data
# Raw string: the Windows path contains "\K" and "\d", which are invalid escape
# sequences in a normal string literal (SyntaxWarning on recent Python versions).
# The resulting path value is unchanged.
# NOTE(review): the path has no file extension and is an absolute local path —
# confirm it points at the intended CSV file.
DATA_PATH = r"F:\KAIM-WEEK0\data"
data = pd.read_csv(DATA_PATH)

In [None]:
# 1. Summary Statistics
# Per-column descriptive statistics of the loaded frame, printed under a heading.
summary_stats = data.describe()
print("Summary Statistics:", summary_stats, sep="\n")

In [None]:
# 2. Data Quality Check
# Number of null entries in each column.
missing_values = data.isna().sum()
print("\nMissing Values:", missing_values, sep="\n")

In [None]:
# Check for negative values in GHI, DNI, DHI
# A row is selected when at least one of the three columns is below zero.
irradiance_cols = ['GHI', 'DNI', 'DHI']
has_negative = (data[irradiance_cols] < 0).any(axis=1)
negative_values = data[has_negative]
print("\nNegative Values:", negative_values, sep="\n")

In [None]:
# 3. Outlier Detection (Z-score)
# For every row, store the largest absolute z-score across the checked columns
# and flag the row as an outlier when that value exceeds 3.
columns_to_check = ['GHI', 'DNI', 'DHI', 'WS', 'WSgust', 'ModA', 'ModB']
# nan_policy='omit' computes the mean/std from the non-missing values only.
# With the default ('propagate') a single NaN makes every z-score in that
# column NaN, which silently hides all of that column's outliers.
abs_z_scores = data[columns_to_check].apply(
    lambda col: np.abs(zscore(col, nan_policy='omit')), axis=0
)
# max(axis=1) skips NaN, so a missing reading in one column does not mask the others.
data['Z_Score_Outliers'] = abs_z_scores.max(axis=1)
outliers = data[data['Z_Score_Outliers'] > 3]
print("\nOutliers Detected:")
print(outliers)

In [None]:
# 4. Time Series Analysis
# Plot GHI, DNI, DHI, and Tamb over time
# Guard the conversion: after the first run 'Date' lives in the index, not the
# columns, so re-running this cell unguarded would raise KeyError: 'Date'.
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data.set_index('Date', inplace=True)
data[['GHI', 'DNI', 'DHI', 'Tamb']].plot(figsize=(14, 7), title="Time Series Analysis")
plt.show()

In [None]:
# 5. Evaluate the Impact of Cleaning
# Split the rows by the 'Cleaning' flag and overlay the ModA readings of both groups.
# NOTE(review): assumes 'Cleaning' holds the strings 'Yes' / 'No' — confirm against the CSV.
is_cleaned = data['Cleaning'] == 'Yes'
is_uncleaned = data['Cleaning'] == 'No'
cleaned_data = data[is_cleaned]
uncleaned_data = data[is_uncleaned]

plt.figure(figsize=(10, 5))
# Each entry: (rows to draw, legend label, extra line styling).
for subset, label, extra_style in (
    (cleaned_data, 'Cleaned - ModA', {}),
    (uncleaned_data, 'Uncleaned - ModA', {'alpha': 0.6}),
):
    plt.plot(subset.index, subset['ModA'], label=label, **extra_style)
plt.title('Impact of Cleaning on Sensor Readings (ModA)')
plt.legend()
plt.show()

In [None]:
# 6. Correlation Analysis
# Pairwise correlation of the selected columns, rendered as an annotated heatmap.
corr_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'WS', 'WSgust', 'WD']
correlation_matrix = data.loc[:, corr_columns].corr()
sns.heatmap(data=correlation_matrix, cmap="coolwarm", annot=True)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# 7. Wind Analysis
# Using Wind Roses
import matplotlib.cm as cm
from windrose import WindroseAxes

def plot_wind_rose(speed, direction):
    """Draw a wind-rose bar chart on a new ``WindroseAxes`` and show it.

    Args:
        speed: Values passed as the second positional argument to
            ``WindroseAxes.bar``.
        direction: Values passed as the first positional argument to
            ``WindroseAxes.bar``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    rose_axes = WindroseAxes.from_ax()
    bar_style = dict(normed=True, opening=0.8, edgecolor='white', cmap=cm.viridis)
    rose_axes.bar(direction, speed, **bar_style)
    rose_axes.set_legend()
    plt.show()

# Wind rose of the 'WS' column against the 'WD' column.
plot_wind_rose(speed=data['WS'], direction=data['WD'])

In [None]:
# 8. Temperature and Humidity Analysis
# Scatter of 'Tamb' (y) against 'RH' (x); the title is set on the axes seaborn drew on.
scatter_axes = sns.scatterplot(data=data, x='RH', y='Tamb')
scatter_axes.set_title('Relative Humidity vs Temperature')
plt.show()

In [None]:
# 9. Histograms
# One 20-bin histogram per selected column, drawn on a shared figure.
hist_columns = ['GHI', 'DNI', 'DHI', 'WS', 'Tamb']
hist_style = dict(bins=20, figsize=(14, 10), edgecolor='black')
data.loc[:, hist_columns].hist(**hist_style)
plt.suptitle('Histograms of Variables')
plt.show()

In [None]:
# 10. Z-Score Analysis
# Per-column count of readings lying more than 3 standard deviations from the mean.
# nan_policy='omit' keeps one missing value from turning a whole column's
# z-scores into NaN (the default policy is 'propagate').
z_scores = data[columns_to_check].apply(lambda col: zscore(col, nan_policy='omit'))
# Compare the absolute value so strongly negative deviations are counted too,
# consistent with the np.abs-based row flag in the outlier-detection cell.
z_outliers = (z_scores.abs() > 3).sum()
print("\nZ-Score Outlier Counts:")
print(z_outliers)

In [None]:
# 11. Bubble Chart
# GHI (x) vs Tamb (y); marker size follows 'RH', marker colour follows 'WS',
# and 'BP' is added to the hover tooltip.
# NOTE(review): the title mentions WS but not RH, which drives the bubble size — confirm wording.
bubble_spec = dict(
    x='GHI',
    y='Tamb',
    size='RH',
    color='WS',
    hover_data=['BP'],
    title="Bubble Chart: GHI vs Tamb vs WS",
)
fig = px.scatter(data, **bubble_spec)
fig.show()

In [None]:
# 12. Data Cleaning
# Discard the 'Comments' column first, then remove rows with missing values.
# Doing it in the opposite order would drop rows merely because the column we
# are about to throw away is empty for them (every row, if it is entirely null).
cleaned_data = data.drop(columns=['Comments']).dropna()

print("\nCleaned Data Sample:")
print(cleaned_data.head())