Initializes libraries (pandas, numpy, matplotlib, and seaborn)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn import metrics
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)
from scipy import stats

Loads data from a GitHub URL.

In [2]:
url='https://raw.githubusercontent.com/nahdes/TenxWeek0/refs/heads/main/notebooks/togo-dapaong_qc.csv'
data_frame = pd.read_csv(url)

Displaying  the shape of the dataset

In [3]:
data_frame.shape


(525600, 19)

Displays the first few rows of the dataset.



In [4]:
data_frame.head


<bound method NDFrame.head of                Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0       2021-10-25 00:01 -1.3  0.0  0.0   0.0   0.0  24.8  94.5  0.9     1.1   
1       2021-10-25 00:02 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.1     1.6   
2       2021-10-25 00:03 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.2     1.4   
3       2021-10-25 00:04 -1.2  0.0  0.0   0.0   0.0  24.8  94.3  1.2     1.6   
4       2021-10-25 00:05 -1.2  0.0  0.0   0.0   0.0  24.8  94.0  1.3     1.6   
...                  ...  ...  ...  ...   ...   ...   ...   ...  ...     ...   
525595  2022-10-24 23:56 -0.8  0.0  0.0   0.0   0.0  25.2  53.8  0.0     0.0   
525596  2022-10-24 23:57 -0.9  0.0  0.0   0.0   0.0  25.3  53.5  0.0     0.0   
525597  2022-10-24 23:58 -1.0  0.0  0.0   0.0   0.0  25.3  53.4  0.0     0.0   
525598  2022-10-24 23:59 -1.1  0.0  0.0   0.0   0.0  25.4  53.5  0.0     0.0   
525599  2022-10-25 00:00 -1.2  0.0  0.0   0.0   0.0  25.4  52.3  0.0     0.0   

        W

Displays the data types of the columns

In [5]:
data_frame.dtypes

Timestamp         object
GHI              float64
DNI              float64
DHI              float64
ModA             float64
ModB             float64
Tamb             float64
RH               float64
WS               float64
WSgust           float64
WSstdev          float64
WD               float64
WDstdev          float64
BP                 int64
Cleaning           int64
Precipitation    float64
TModA            float64
TModB            float64
Comments         float64
dtype: object

Listing  all column names.

In [6]:
data_frame.columns

Index(['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS',
       'WSgust', 'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation',
       'TModA', 'TModB', 'Comments'],
      dtype='object')

Computes summary statistics for numeric columns

In [7]:
summary_stats = data_frame.describe()  # Summary of all numeric columns
print(summary_stats)

                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      230.555040     151.258469     116.444352     226.144375   
std       322.532347     250.956962     156.520714     317.346938   
min       -12.700000       0.000000       0.000000       0.000000   
25%        -2.200000       0.000000       0.000000       0.000000   
50%         2.100000       0.000000       2.500000       4.400000   
75%       442.400000     246.400000     215.700000     422.525000   
max      1424.000000    1004.500000     805.700000    1380.000000   

                ModB           Tamb             RH             WS  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      219.568588      27.751788      55.013160       2.368093   
std       307.932510       4.758023      28.778732       1.462668   
min         0.000000      14.900000       3.300000       0.000000   
25%         0.000000      24.2000

Filters numeric data columns.

In [8]:
# Select only numeric columns
numeric_data = data_frame.select_dtypes(include=['number'])

# Calculate median
median_values = numeric_data.median()
print("Median Values:\n", median_values)

Median Values:
 GHI                2.1
DNI                0.0
DHI                2.5
ModA               4.4
ModB               4.3
Tamb              27.2
RH                59.3
WS                 2.2
WSgust             2.9
WSstdev            0.5
WD               199.1
WDstdev           10.8
BP               976.0
Cleaning           0.0
Precipitation      0.0
TModA             28.4
TModB             28.4
Comments           NaN
dtype: float64


Selects specific columns for further analysis

In [9]:
df = data_frame[['Timestamp', 'GHI', 'DNI', 'DHI',
            'ModA', 'ModB', 
            'Tamb', 'RH', 'WS', 'WSgust', 'WSstdev','Cleaning','WD',
            #'WDstdev', 'BP', 'Comments',
            'Precipitation','TModA', 'TModB',]].copy()

Displays selected column names.

In [10]:
df.columns

Index(['Timestamp', 'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS',
       'WSgust', 'WSstdev', 'Cleaning', 'WD', 'Precipitation', 'TModA',
       'TModB'],
      dtype='object')

Converts the Timestamp column to datetime format.

In [11]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

Prints dataset information (info()), including data types and missing values.

In [12]:
print(data_frame.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525600 entries, 0 to 525599
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Timestamp      525600 non-null  object 
 1   GHI            525600 non-null  float64
 2   DNI            525600 non-null  float64
 3   DHI            525600 non-null  float64
 4   ModA           525600 non-null  float64
 5   ModB           525600 non-null  float64
 6   Tamb           525600 non-null  float64
 7   RH             525600 non-null  float64
 8   WS             525600 non-null  float64
 9   WSgust         525600 non-null  float64
 10  WSstdev        525600 non-null  float64
 11  WD             525600 non-null  float64
 12  WDstdev        525600 non-null  float64
 13  BP             525600 non-null  int64  
 14  Cleaning       525600 non-null  int64  
 15  Precipitation  525600 non-null  float64
 16  TModA          525600 non-null  float64
 17  TModB          525600 non-nul

Identifies duplicated rows in the dataframe.

In [13]:
df.loc[df.duplicated()]

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,Cleaning,WD,Precipitation,TModA,TModB


Displays a detailed summary of the dataset using describe().

In [14]:
df.describe()

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,Cleaning,WD,Precipitation,TModA,TModB
count,525600,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0
mean,2022-04-25 12:00:30.000000768,230.55504,151.258469,116.444352,226.144375,219.568588,27.751788,55.01316,2.368093,3.22949,0.55774,0.000535,161.741845,0.001382,32.444403,33.54333
min,2021-10-25 00:01:00,-12.7,0.0,0.0,0.0,0.0,14.9,3.3,0.0,0.0,0.0,0.0,0.0,0.0,13.1,13.1
25%,2022-01-24 06:00:45,-2.2,0.0,0.0,0.0,0.0,24.2,26.5,1.4,1.9,0.4,0.0,74.8,0.0,23.9,23.6
50%,2022-04-25 12:00:30,2.1,0.0,2.5,4.4,4.3,27.2,59.3,2.2,2.9,0.5,0.0,199.1,0.0,28.4,28.4
75%,2022-07-25 18:00:15,442.4,246.4,215.7,422.525,411.0,31.1,80.8,3.2,4.4,0.7,0.0,233.5,0.0,40.6,43.0
max,2022-10-25 00:00:00,1424.0,1004.5,805.7,1380.0,1367.0,41.4,99.8,16.1,23.1,4.7,1.0,360.0,2.3,70.4,94.6
std,,322.532347,250.956962,156.520714,317.346938,307.93251,4.758023,28.778732,1.462668,1.882565,0.268923,0.023116,91.877217,0.02635,10.998334,12.769277


Calculates z-scores for specific columns and identifies potential outliers.


In [15]:
from scipy.stats import zscore

# Calculate Z-scores for GHI, DNI, DHI
z_scores = data_frame[['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']].apply(zscore)
outliers = (z_scores.abs() > 3).sum()  # Count outliers
print("Outliers:\n", outliers)

Outliers:
 GHI        305
DNI       1062
DHI       3415
ModA       137
ModB       206
WS        3510
WSgust    3915
dtype: int64


Checks for invalid entries in GHI, DNI, and DHI columns where values are less than zero

In [16]:
invalid_entries = data_frame[(data_frame['GHI'] < 0) | (data_frame['DNI'] < 0) | (data_frame['DHI'] < 0)]
print("Invalid Entries:\n", invalid_entries)

Invalid Entries:
                Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0       2021-10-25 00:01 -1.3  0.0  0.0   0.0   0.0  24.8  94.5  0.9     1.1   
1       2021-10-25 00:02 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.1     1.6   
2       2021-10-25 00:03 -1.3  0.0  0.0   0.0   0.0  24.8  94.4  1.2     1.4   
3       2021-10-25 00:04 -1.2  0.0  0.0   0.0   0.0  24.8  94.3  1.2     1.6   
4       2021-10-25 00:05 -1.2  0.0  0.0   0.0   0.0  24.8  94.0  1.3     1.6   
...                  ...  ...  ...  ...   ...   ...   ...   ...  ...     ...   
525595  2022-10-24 23:56 -0.8  0.0  0.0   0.0   0.0  25.2  53.8  0.0     0.0   
525596  2022-10-24 23:57 -0.9  0.0  0.0   0.0   0.0  25.3  53.5  0.0     0.0   
525597  2022-10-24 23:58 -1.0  0.0  0.0   0.0   0.0  25.3  53.4  0.0     0.0   
525598  2022-10-24 23:59 -1.1  0.0  0.0   0.0   0.0  25.4  53.5  0.0     0.0   
525599  2022-10-25 00:00 -1.2  0.0  0.0   0.0   0.0  25.4  52.3  0.0     0.0   

        WSstdev     W