In [1]:
import random
import hashlib
import random
from tabulate import tabulate
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


# Seed Python's built-in `random` module so reruns are deterministic.
# Only the stdlib generator is seeded in this cell.
random.seed(42)

In [None]:
#########
# Task 1
#########
def generate_sensor_id():
    """
      Build a sensor id string: the SHA-256 hex digest of a randomly salted label.

      A single value is drawn from the global `random` generator, so the
      result is reproducible for a given seed.
    """
    salt = random.random()
    digest = hashlib.sha256(f"Sensor_id_{salt}".encode())
    return digest.hexdigest()

def sensor_id_valid(sensor_id, all_sensor_ids):
    """
      Report whether sensor_id occurs exactly once in all_sensor_ids.

      Returns True for a unique id, False when it is missing or duplicated.
    """
    occurrences = all_sensor_ids.count(sensor_id)
    return occurrences == 1

def temperature_reading_valid(temperature_reading):
    """
      Check that the temperature reading lies in the inclusive range -10 to 50.
    """
    # bool() keeps the return value a plain True/False
    return bool(-10 <= temperature_reading <= 50)

def humidity_level_valid(humidity_level):
    """
      Check that the humidity level lies in the inclusive range 10% to 100%.
    """
    # bool() keeps the return value a plain True/False
    return bool(10 <= humidity_level <= 100)


# Generate 5 random sensors as (id, temperature, humidity) rows. The tuple is
# evaluated left to right, so the random draws happen in the order
# id -> temperature -> humidity for each sensor.
_generated_rows = [
    (generate_sensor_id(), random.randint(-10, 50), random.randint(10, 100))
    for _ in range(5)
]

# Parallel lists holding the sensor data, one entry per sensor
Sensor_IDs = [row[0] for row in _generated_rows]
Temperature_Readings = [row[1] for row in _generated_rows]
Humidity_Levels = [row[2] for row in _generated_rows]

# Validate and print every sensor's data
for sensor_id, temperature_reading, humidity_level in zip(Sensor_IDs, Temperature_Readings, Humidity_Levels):
    print(f"Sensor_id={sensor_id}, temperature_reading={temperature_reading}, humidity_level={humidity_level}")

    # Collect one message per failed check, in the order the checks run
    problems = []
    if not sensor_id_valid(sensor_id, Sensor_IDs):
        problems.append("Sensor_id is not unique")
    if not temperature_reading_valid(temperature_reading):
        problems.append("Temperature reading is invalid")
    if not humidity_level_valid(humidity_level):
        problems.append("Humidity level not valid")

    for problem in problems:
        print(problem)

    all_data_valid = not problems
    if all_data_valid:
        print("All entries are valid")
    print("---------------------------------------")


In [None]:
#########
# Task 2
#########


def create_sensor_data_dict(sensor_ids, temperature_readings, humidity_levels):
    """
      Create a dictionary of sensor data keyed by sensor id.

      Args:
        sensor_ids: iterable of sensor id values (each must be truthy).
        temperature_readings: iterable of temperature readings, one per sensor.
        humidity_levels: iterable of humidity levels, one per sensor.

      Returns:
        dict mapping each sensor id to
        {'Temperature_Reading': <temp>, 'Humidity_Level': <humidity>}.

      Raises:
        ValueError: if the three inputs differ in length, or if any entry has
          an empty sensor id, an invalid temperature or an invalid humidity.
    """
    # Materialise the inputs so their lengths can be compared; zip() alone
    # would silently drop the trailing entries of the longer inputs.
    sensor_ids = list(sensor_ids)
    temperature_readings = list(temperature_readings)
    humidity_levels = list(humidity_levels)
    if not (len(sensor_ids) == len(temperature_readings) == len(humidity_levels)):
        raise ValueError(
            "Invalid sensor data: sensor_ids, temperature_readings and "
            "humidity_levels must have the same length."
        )

    sensor_data = {}
    for sensor_id, temp, humidity in zip(sensor_ids, temperature_readings, humidity_levels):
        if not (sensor_id and temperature_reading_valid(temp) and humidity_level_valid(humidity)):
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working.
            raise ValueError("Invalid sensor data: sensor_id, temperature_reading, or humidity_level is invalid.")
        sensor_data[sensor_id] = {
            'Temperature_Reading': temp,
            'Humidity_Level': humidity
        }
    return sensor_data

def filter_and_sort_sensors(sensor_data, temp_threshold):
    """
      Print the three hottest sensors whose temperature exceeds temp_threshold.

      Entries are ordered by temperature, highest first; sensors with equal
      temperatures keep their order from sensor_data. Nothing is returned.
    """
    def temperature_of(entry):
        return entry[1]['Temperature_Reading']

    hot_entries = [entry for entry in sensor_data.items() if temperature_of(entry) > temp_threshold]
    hot_entries.sort(key=temperature_of, reverse=True)

    for sensor_id, data in hot_entries[:3]:
        print(f"Sensor_ID: {sensor_id}, Temperature: {data['Temperature_Reading']}, Humidity: {data['Humidity_Level']}")


# Build the dictionary from the Task 1 lists, then print up to three of the
# hottest sensors whose temperature is above 0.
print("Sensor data where temperature is greater than 0")
sensor_dict = create_sensor_data_dict(Sensor_IDs, Temperature_Readings, Humidity_Levels)
filter_and_sort_sensors(sensor_dict, 0)



In [None]:
#########
# Task 3
#########

# Module-level placeholder. The functions in this cell use their own
# parameter/local named `air_quality` and do not read this value.
air_quality = None
# Canonical air quality category labels
GOOD = 'Good'
MODERATE = 'Moderate'   
UNHEALTHY = 'Unhealthy'
HAZARDOUS = 'Hazardous'
# Every valid category; also the row order used when results are tabulated
air_quality_categories = [GOOD, MODERATE, UNHEALTHY, HAZARDOUS]

def recommend_activity(air_quality):
    """
      Pick a random activity suited to the given air quality category.

      Returns one of the activities listed for the category, or the string
      "Invalid air quality" when the category is not recognised.
    """
    activities_by_quality = {
        GOOD: ['hiking', 'biking', 'swimming'],
        MODERATE: ['walking', 'cycling', 'jogging'],
        UNHEALTHY: ['watch movie', 'cooking', 'board games'],
        HAZARDOUS: ['stay indoors', 'visit shopping centre']
    }
    if air_quality in activities_by_quality:
        options = activities_by_quality[air_quality]
    else:
        options = ["Invalid air quality"]
    # random.choice is called on the fallback list as well, so the global
    # random stream advances the same way for valid and invalid input.
    return random.choice(options)

def get_air_quality():
    """
      Prompt the user until a valid air quality category is entered.

      Matching ignores letter case and surrounding whitespace, so "good" and
      " Good " are both accepted.

      Returns:
        The canonical category label as listed in air_quality_categories.
    """
    # Map the lower-cased labels back to their canonical spelling
    canonical_labels = {category.lower(): category for category in air_quality_categories}
    while True:
        answer = input("Enter air quality (Good, Moderate, Unhealthy, Hazardous): ")
        normalised = answer.strip().lower()
        if normalised in canonical_labels:
            return canonical_labels[normalised]
        print("Invalid input. Please enter a valid air quality.")

def simulate_air_quality_readings(num_readings=10):
    """
      Simulate random air quality readings and summarise them per category.

      Args:
        num_readings: how many readings to draw (defaults to 10).

      Returns:
        (counts, percentages): two dicts keyed by category, holding the number
        of readings and the share of all readings (0-100) for each category.
        When no readings are drawn, every percentage is 0.0.
    """
    readings = [random.choice(air_quality_categories) for _ in range(num_readings)]
    total = len(readings)
    counts = {category: readings.count(category) for category in air_quality_categories}
    # Guard against division by zero when num_readings is 0 (or negative)
    percentages = {
        category: (count / total) * 100 if total else 0.0
        for category, count in counts.items()
    }
    return counts, percentages

def print_results(counts, percentages):
    """
      Print a table with one row per air quality category, showing its count
      and its percentage formatted to two decimal places.
    """
    table_rows = []
    for category in air_quality_categories:
        table_rows.append([category, counts[category], f"{percentages[category]:.2f}%"])
    column_names = ['Air Quality', 'Count', 'Percentage']
    print(tabulate(table_rows, headers=column_names, tablefmt='orgtbl'))


# Part 1: simulate readings and tabulate the count/percentage per category
print("\nSimulated Air Quality Readings:")
counts, percentages = simulate_air_quality_readings()
print_results(counts, percentages)

# Part 2: ask the user for an air quality category and recommend an activity
print("\nGet Air Quality Recommendation:")
air_quality_input = get_air_quality()
print(f"Recommended activity: {recommend_activity(air_quality_input)}")





In [None]:
#########
# Task 4
#########

# Unique non-negative numbers entered by the user, split by parity
even_numbers = []
odd_numbers = []
seen_numbers = set()

# Keep prompting until a negative number is entered
while True:
    try:
        num = int(input("Enter a number (negative number to stop): "))
    except ValueError:
        print("Invalid input. Please enter a valid number.")
        continue

    if num < 0:
        break

    if num in seen_numbers:
        print("You've already entered this number. Try a different one.")
        continue

    seen_numbers.add(num)
    bucket = even_numbers if num % 2 == 0 else odd_numbers
    bucket.append(num)

# Write both lists to the log file, overwriting any previous log
with open("loop_log.txt", "w") as file:
    file.writelines([
        f"Even numbers: {even_numbers}\n",
        f"Odd numbers: {odd_numbers}\n",
    ])

print("Numbers have been logged in 'loop_log.txt'.")


In [None]:
#########
# Task 5
#########

def print_character_positions(term):
    """
      Print the characters of term on two aligned lines: one showing the
      characters at even (0-based) indices, one showing those at odd indices.
      On each line the other positions are filled with spaces.
    """
    even_positions = "".join(char if index % 2 == 0 else " " for index, char in enumerate(term))
    odd_positions = "".join(" " if index % 2 == 0 else char for index, char in enumerate(term))
    print("Characters at even positions: ", even_positions)
    print("Characters at  odd positions: ", odd_positions)

# Ask the user for a term and show its characters split by index parity
term = input("Enter a scientific term: ")
print_character_positions(term)



In [None]:
#########
# Task 6
#########

WIND_SPEED_COLUMN = 'Wind_Speed_kmh'

# Load the data; the superfluous unnamed column is dropped straight away
weather_data = pd.read_csv('WeatherData.csv').drop(columns=['Unnamed: 6'])

# Rows are weather stations, columns are the recorded parameters
station_count, parameter_count = weather_data.shape
print(f"Number of weather stations: {station_count}")
print(f"Number of recorded parameters: {parameter_count}")

# Centre the wind speed by subtracting its mean
mean_wind_speed = weather_data[WIND_SPEED_COLUMN].mean()
weather_data['Modified Wind Speed'] = weather_data[WIND_SPEED_COLUMN] - mean_wind_speed

# Show the original values followed by the mean-centred values
print("Original Wind Speed Values:")
print(weather_data[WIND_SPEED_COLUMN])
print("\nModified Wind Speed Values:")
print(weather_data['Modified Wind Speed'])



In [None]:
#########
# Task 7
#########


weather_data = pd.read_csv('WeatherData.csv')

# There is a superfluous unnamed column, so we drop it
weather_data = weather_data.drop(columns=['Unnamed: 6'])

print("Dataset Information:")
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print(); that would add a stray "None" line after the report.
weather_data.info()

# Investigate missing values
print("\nMissing Values Count:")
missing_values = weather_data.isnull().sum()
print(missing_values)

# Numeric columns that are candidates for imputation
numeric_columns = ['Temperature_C', 'Humidity_%', 'Wind_Speed_kmh', 'Precipitation_mm']

# Make copies for different imputation methods
weather_data_median = weather_data.copy()
weather_data_mode = weather_data.copy()

###
# Imputation techniques
#
# Each numeric column that has missing values is imputed twice, on separate
# copies of the data, so the two results can be compared side by side:
#   1. median imputation (written to weather_data_median)
#   2. mode / most-frequent imputation (written to weather_data_mode)
# Only the columns in numeric_columns are imputed in this cell.
###
print("\nJustification for Imputation Techniques:")
print("1. Median Imputation: Median imputation is robust to outliers and preserves the overall distribution.")
print("2. Mode Imputation: We used mode imputation because it is a simple and effective way to impute missing values.")



plt.figure(figsize=(15, 12))

# Create plots: one row per numeric column, three panels per row
# (original, median-imputed, mode-imputed). Rows for columns without
# missing values are left empty.
for i, column in enumerate(numeric_columns):
    if missing_values[column] > 0:
        # This column has missing values

        # Original data distribution
        plt.subplot(4, 3, 3*i+1)
        sns.histplot(weather_data[column].dropna(), kde=True)
        plt.title(f'Original {column} Distribution')
        
        # Technique 1: Median Imputation 
        median_imputer = SimpleImputer(strategy='median')
        weather_data_median[column] = median_imputer.fit_transform(weather_data_median[[column]])
        
        plt.subplot(4, 3, 3*i+2)
        sns.histplot(weather_data_median[column], kde=True)
        plt.title(f'{column} with Median Imputation Distribution')
        
        # Technique 2: Mode Imputation
        mode_imputer = SimpleImputer(strategy='most_frequent')
        weather_data_mode[column] = mode_imputer.fit_transform(weather_data_mode[[column]])
        
        plt.subplot(4, 3, 3*i+3)
        sns.histplot(weather_data_mode[column], kde=True)
        plt.title(f'{column} with Mode Imputation Distribution')
        
        print(f"\nImputation for {column}:")
        print(f"Number of missing values: {missing_values[column]}")
        print(f"Median value used for imputation: {weather_data[column].median()}")
        print(f"Mode value used for imputation: {weather_data[column].mode()[0]}")


plt.tight_layout()
plt.show()


# Distribution comparison

print("\nDistribution Comparison:")
print("1. Temperature_C: The original distribution has a more normal distribution, while the median and mode imputed distributions have a distribution with a spike at the median value.")
print("2. Humidity_%: The original and median imputed distributions are similar, but the mode imputed distribution has a spike at the mode value.")


# Save the median imputed dataset
weather_data_median.to_csv('WeatherData_Cleaned.csv', index=False)




In [None]:
#########
# Task 8
#########

# Perform one-hot encoding on the Weather_Condition column
encoded_weather = pd.get_dummies(weather_data['Weather_Condition'], prefix='Weather')
print("\nEncoded weather conditions:")
print(encoded_weather.head())

weather_data_encoded = weather_data.copy()
weather_data_encoded = pd.concat([weather_data_encoded, encoded_weather], axis=1)

# Drop the original Weather_Condition column
weather_data_encoded = weather_data_encoded.drop('Weather_Condition', axis=1)

print("\nDataset with encoded weather conditions:")
print(weather_data_encoded.head())

print("\nJustification for using one-hot encoding:")
print("1. Weather condition (Sunny, Rainy, Cloudy, Stormy) is categorical and has no natural ordering so one-hot encoding allows each weather condition to have an independent effect on the model.")
print("2. As there are only 4 categories, the dimensionality increase is manageable")

weather_data_encoded.to_csv('WeatherData_Encoded.csv', index=False)





In [None]:
#########
# Task 9
#########

# Work on a copy so the encoded frame stays available for the "before" plots
weather_data_scaled = weather_data_encoded.copy()
numerical_cols = ['Temperature_C', 'Humidity_%', 'Wind_Speed_kmh', 'Precipitation_mm']

# Rescale the numerical columns with Min-Max scaling
scaler = MinMaxScaler()
weather_data_scaled[numerical_cols] = scaler.fit_transform(weather_data_scaled[numerical_cols])

print("\nDataset after Min-Max scaling:")
print(weather_data_scaled.head())

# 2x2 grid: temperature histograms on the top row, wind speed boxplots below;
# left column is before scaling, right column is after scaling.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(temp_before_ax, temp_after_ax), (wind_before_ax, wind_after_ax) = axes

temp_before_ax.hist(weather_data_encoded['Temperature_C'], bins=15, alpha=0.7, color='blue')
temp_before_ax.set_title('Temperature Distribution Before Scaling')
temp_before_ax.set_xlabel('Temperature (°C)')
temp_before_ax.set_ylabel('Frequency')

temp_after_ax.hist(weather_data_scaled['Temperature_C'], bins=15, alpha=0.7, color='green')
temp_after_ax.set_title('Temperature Distribution After Min-Max Scaling')
temp_after_ax.set_xlabel('Scaled Temperature (0-1)')
temp_after_ax.set_ylabel('Frequency')

wind_before_ax.boxplot(weather_data_encoded['Wind_Speed_kmh'])
wind_before_ax.set_title('Wind Speed Distribution Before Scaling')
wind_before_ax.set_ylabel('Wind Speed (km/h)')

wind_after_ax.boxplot(weather_data_scaled['Wind_Speed_kmh'])
wind_after_ax.set_title('Wind Speed Distribution After Min-Max Scaling')
wind_after_ax.set_ylabel('Scaled Wind Speed (0-1)')

plt.tight_layout()
plt.show()

# Analysis
print("\nAnalysis of Min-Max Scaling Effects:")
print("1. Range Transformation: All numerical features now have the same range, making them comparable. That is, the minimum value is 0 and the maximum value is 1.")
print("2. Distribution Preservation: There is no change to the shape of the distribution.")

