In [3]:
#import libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose

In [4]:
# Mount Google Drive at /content/drive so the dataset files can be read.
# The google.colab package is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Walk every folder under "Project" and load each daily '.txt' log.
# The file name without its extension is parsed as the observation date (YYMMDD).
folder_path = '/content/drive/MyDrive/Project'

data = []

for root, dirs, files in os.walk(folder_path):
    # os.walk yields entries in arbitrary order; sorting in place fixes the
    # traversal so the load order (and the error log) is the same on every run.
    dirs.sort()
    for filename in sorted(files):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(root, filename)
        date = os.path.splitext(filename)[0]  # Extract date from filename
        try:
            # Raw string: '\s' inside a plain literal is an invalid escape sequence.
            # Lines 0 and 2 of each file are skipped, leaving line 1 as the header row.
            df = pd.read_csv(file_path, delimiter=r'\s+', skiprows=[0, 2], encoding='latin1')
            # Keep the first 16 columns and relabel them positionally.
            df = df.iloc[:, :16]
            df.columns = ['Time', 'Wind Dir', 'Wind Spd', 'Hum In', 'Humidity', 'Temp In', 'Temp', 'Raw Barom', 'Temp Ch1', 'Hum Ch1', 'Temp Ch2', 'Hum Ch2', 'UV', 'Solar Radiation', 'Dew Point', 'Rain Rate']
            # Stored as a 'dd-mm-yy' string; a later cell parses it back to datetime.
            df.insert(0, 'Date', pd.to_datetime(date, format='%y%m%d').strftime('%d-%m-%y'))

            data.append(df)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error processing file {file_path}: {e}")

Error processing file /content/drive/MyDrive/Project/Dataset_3/120825.txt: No columns to parse from file
Error processing file /content/drive/MyDrive/Project/Dataset_3/121220.txt: Error tokenizing data. C error: Expected 30 fields in line 689, saw 31



In [None]:
# Stack the per-file frames, parse 'Date' and 'Time', order the rows
# chronologically and make 'Date' the index.
# Note: `data` is rebound from a list of frames to a single DataFrame here, so
# this cell can only be run once after the loader cell.
data = (
    pd.concat(data, ignore_index=True)
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], format='%d-%m-%y'),
        Time=lambda frame: pd.to_datetime(frame['Time'], format='%H:%M').dt.time,
    )
    .sort_values(by=['Date', 'Time'])
    .set_index('Date')
)

print(data.head())

In [None]:
# Split the combined frame into the rows before the cutoff date (data1) and the
# rows from the cutoff date onward (data2).
cutoff = pd.to_datetime('2012-12-20')

# Label slices with .loc include BOTH endpoints, so `.loc[:cutoff]` and
# `.loc[cutoff:]` would each contain the cutoff day. Boolean masks keep the two
# halves disjoint.
# NOTE(review): the cutoff day itself is assigned to data2 — confirm this is the
# side the new column layout belongs to.
data1 = data.loc[data.index < cutoff]
data2 = data.loc[data.index >= cutoff]

# data2 keeps only its first 13 columns, which are relabelled positionally.
data2 = data2.iloc[:, :13].copy()
data2.columns = ['Time', 'Wind Dir', 'Wind Spd', 'Hum In', 'Humidity', 'Temp In', 'Temp', 'Raw Barom', 'UV', 'Solar Radiation', 'Dew Point', 'Daily Rain', 'Rain Rate']
data2

In [None]:
# Raise the pandas info() display limits so the summary is not abbreviated for
# a wide or very long frame, then print the per-column summary of data1.
pd.set_option('display.max_info_columns', 100)
pd.set_option('display.max_info_rows', 10000000)
data1.info(verbose=True)

In [None]:
# Per-column summary of data2 (the rows from the cutoff date onward)
data2.info(verbose=True)

In [None]:
# Descriptive statistics for data1
data1.describe()

In [None]:
# Descriptive statistics for data2
data2.describe()

In [None]:
# Pairwise correlations between the numeric columns of data2, drawn as an
# annotated heatmap.
numeric_data = data2.select_dtypes(include='number')
corr_matrix = numeric_data.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')

# Slant the x tick labels so long column names do not overlap
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

fig.tight_layout()
plt.show()

In [None]:
# Time-series copy of data2: the date index and the 'Time' column are merged
# into a single timestamp index, after which 'Time' is removed.
data2_copy = data2.copy()
data2_copy.index = pd.to_datetime(data2_copy.index, format='%Y-%m-%d')
data2_copy['Time'] = pd.to_datetime(data2_copy['Time'], format='%H:%M:%S').dt.time

# Render both parts as text and pair them up as "<date> <time>"
day_text = data2_copy.index.date.astype(str)
clock_text = data2_copy['Time'].astype(str)
stamp_text = [f"{day} {clock}" for day, clock in zip(day_text, clock_text)]

data2_copy.index = pd.to_datetime(stamp_text)

# 'Time' now lives in the index, so the column is no longer needed
data2_copy = data2_copy.drop(columns=['Time'])

In [None]:
def _plot_line(index, values, label, title, ylabel):
    """Draw one labelled line on a fresh 14x7 figure with a grid and show it."""
    plt.figure(figsize=(14, 7))
    plt.plot(index, values, label=label)
    plt.title(title)
    plt.xlabel('Date')
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True)
    plt.show()


def time_series_analysis(data2_copy, column_name, period=365, rolling_window=30):
    """Plot a set of time-series diagnostics for one column.

    Four figures are produced in order: the raw series, its daily mean, an
    additive seasonal decomposition of the daily mean, and rolling mean /
    standard deviation of the raw series.

    Parameters
    ----------
    data2_copy : DataFrame
        Frame whose index supports ``resample('D')`` (a datetime-like index).
    column_name : str
        Column to analyse.
    period : int, default 365
        Seasonal period, in daily observations, passed to ``seasonal_decompose``.
    rolling_window : int, default 30
        Rolling window size, counted in rows of the raw (un-resampled) series.

    Raises
    ------
    ValueError
        If ``column_name`` is not a column of ``data2_copy``.
    """
    # Check if column exists in the dataframe
    if column_name not in data2_copy.columns:
        raise ValueError(f"Column '{column_name}' not found in the dataframe.")

    series = data2_copy[column_name]

    # Raw series
    _plot_line(series.index, series, column_name,
               f'Time Series Plot of {column_name}', column_name)

    # Daily mean. Only the requested column is resampled: averaging the whole
    # frame raises on pandas >= 2.0 as soon as any other column is non-numeric,
    # and it does needless work for columns that are never plotted.
    daily = series.resample('D').mean()
    _plot_line(daily.index, daily, f'Daily Average {column_name}',
               f'Daily Average Time Series Plot of {column_name}', column_name)

    # Seasonal decomposition. Days without data are dropped first, so the
    # series handed to statsmodels may contain date gaps.
    daily_observed = daily.dropna()
    if len(daily_observed) >= 2 * period:
        decomposition = seasonal_decompose(daily_observed, model='additive', period=period)
        panels = [
            ('Observed', decomposition.observed),
            ('Trend', decomposition.trend),
            ('Seasonal', decomposition.seasonal),
            ('Residual', decomposition.resid),
        ]
        plt.figure(figsize=(14, 10))
        for position, (panel_title, component) in enumerate(panels, start=1):
            plt.subplot(4, 1, position)
            plt.plot(component)
            plt.title(panel_title)
        plt.tight_layout()
        plt.show()
    else:
        # seasonal_decompose needs two complete cycles; report and continue so
        # the rolling statistics below are still produced.
        print(
            f"Skipping seasonal decomposition of '{column_name}': "
            f"{len(daily_observed)} daily observations available, "
            f"{2 * period} required for period={period}."
        )

    # Rolling statistics over the raw series
    windowed = series.rolling(window=rolling_window)
    rolling_mean = windowed.mean()
    rolling_std = windowed.std()

    plt.figure(figsize=(14, 7))
    plt.plot(series.index, series, label='Original')
    plt.plot(series.index, rolling_mean, label='Rolling Mean', color='orange')
    plt.plot(series.index, rolling_std, label='Rolling Std', color='red')
    plt.title(f'Rolling Statistics of {column_name}')
    plt.xlabel('Date')
    plt.ylabel(column_name)
    plt.legend()
    plt.grid(True)
    plt.show()

# Example usage: run the full analysis for temperature, then for rain rate
for column in ('Temp', 'Rain Rate'):
    time_series_analysis(data2_copy, column)

In [None]:
# 30-row rolling mean and standard deviation of 'Temp' on the raw series.
# TODO: consider computing these on the daily-averaged data instead.
temp_window = data2_copy['Temp'].rolling(window=30)
rolling_mean = temp_window.mean()
rolling_std = temp_window.std()

for stat in (rolling_mean, rolling_std):
    print(stat)

In [None]:
# Display the timestamp-indexed copy built for the time-series analysis
data2_copy

In [None]:
# Subset of data2: the time of day plus temperature, rain rate, humidity,
# solar radiation and wind speed
climate_data = data2.loc[
    :,
    ['Time', 'Temp', 'Rain Rate', 'Humidity', 'Solar Radiation', 'Wind Spd'],
]
climate_data

In [None]:
# Keep only the numeric columns of the climate subset, then average them
# over all rows that share the same index value (one row per date).
climate_data_numeric = climate_data.select_dtypes(include='number')
numeric_columns = climate_data_numeric.columns

daily_average = climate_data_numeric.groupby(climate_data.index).mean()
daily_average