## Task 1: Dataset Combination

### B. Combine Lab2 D1A with Lab2 D1B without duplicate columns
### C. Combine Lab2 D1A with Lab2 D1C using merge method

In [3]:
import pandas as pd
import numpy as np

# Load datasets
d1a = pd.read_csv("Lab2 D1A.csv")
d1b = pd.read_csv("Lab2 D1B.csv", header=0, names=["name", "population", "county", "longitude", "level1", "enrollment", "level2"])
d1c = pd.read_csv("Lab2 D1C.csv")

# Display info about datasets
print("D1A shape:", d1a.shape)
print("D1A columns:", d1a.columns.tolist())
print("---")
print("D1B shape:", d1b.shape)
print("D1B columns:", d1b.columns.tolist())
print("---")
print("D1C shape:", d1c.shape)
print("D1C columns:", d1c.columns.tolist())

# Task 1B: Combine datasets without duplicate columns
# We'll merge on the common columns to ensure data is aligned correctly.
# The common columns are 'name', 'population', and 'county'.
combined_df = pd.merge(d1a, d1b, on=['name', 'population', 'county'])

# Display info about combined dataset
print("\n--- Task 1B: Combined D1A and D1B ---")
print("Combined shape:", combined_df.shape)
print("Combined columns:", combined_df.columns.tolist())


# Task 1C: Combine D1A with D1C using merge method
# We will use a 'left' merge to keep all rows from D1A and add matching data from D1C.
# To avoid the Cartesian product, we'll drop duplicates from D1C before merging.
d1c_unique = d1c.drop_duplicates(subset=['county'])
comboAC = pd.merge(d1a, d1c_unique, on='county', how='left')


# Display info about merged dataset
print("\n--- Task 1C: Combined D1A and D1C ---")
print("ComboAC shape:", comboAC.shape)
print("ComboAC columns:", comboAC.columns.tolist())
print("Expected records (should be same as D1A):", d1a.shape[0], ", got:", comboAC.shape[0])


D1A shape: (26983, 5)
D1A columns: ['fid', 'name', 'population', 'county', 'latitude']
---
D1B shape: (26983, 7)
D1B columns: ['name', 'population', 'county', 'longitude', 'level1', 'enrollment', 'level2']
---
D1C shape: (26983, 3)
D1C columns: ['county', 'city', 'score']

--- Task 1B: Combined D1A and D1B ---
Combined shape: (27033, 9)
Combined columns: ['fid', 'name', 'population', 'county', 'latitude', 'longitude', 'level1', 'enrollment', 'level2']

--- Task 1C: Combined D1A and D1C ---
ComboAC shape: (26983, 7)
ComboAC columns: ['fid', 'name', 'population', 'county', 'latitude', 'city', 'score']
Expected records (should be same as D1A): 26983 , got: 26983


## Task 2: Custom Dataset Creation and Merging

### A. Create customizedData dataset with specified attributes
### B. Merge customizedData with Lab2 datasets

In [4]:
import pandas as pd
import numpy as np
import random

# Assume d1a, d1b, and d1c are already loaded from the previous step
# d1a = pd.read_csv("Lab2 D1A.csv")
# d1b = pd.read_csv("Lab2 D1B.csv", header=0, names=["name", "population", "county", "longitude", "level1", "enrollment", "level2"])
# d1c = pd.read_csv("Lab2 D1C.csv")

# Take a sample of records to create our custom dataset
sample_size = 1000
custom_data = d1a.sample(n=sample_size, random_state=42).copy()

# Add Size attribute (categorical: small, medium, high)
sizes = ['small', 'medium', 'high']
custom_data['size'] = [random.choice(sizes) for _ in range(sample_size)]

# Add cardinal direction attribute (categorical: North, South, East, West)
directions = ['North', 'South', 'East', 'West']
custom_data['cardinal_direction'] = [random.choice(directions) for _ in range(sample_size)]

# Add Timings attribute (categorical: full time, part time)
timings = ['full time', 'part time']
custom_data['timings'] = [random.choice(timings) for _ in range(sample_size)]

# Add one categorical attribute of my own choice (school type)
school_types = ['public', 'private', 'charter']
custom_data['school_type'] = [random.choice(school_types) for _ in range(sample_size)]

# Add one continuous attribute of my own choice (funding in thousands of dollars)
custom_data['funding'] = np.random.normal(500, 150, sample_size)  # Mean 500k, std 150k
custom_data['funding'] = custom_data['funding'].abs()  # Ensure positive values

# Display info about custom dataset
print("CustomizedData shape:", custom_data.shape)
print("CustomizedData columns:", custom_data.columns.tolist())

# Task 2B: Merge customizedData with Lab2 datasets
print("\n--- Merging Datasets ---")

# Merge the custom data with D1B on common keys
modifiedData = pd.merge(custom_data, d1b, on=['name', 'population', 'county'], how='left')

# De-duplicate D1C to prepare for a clean merge
d1c_unique = d1c.drop_duplicates(subset=['county'])

# Merge the result with the de-duplicated D1C on the 'county' key
modifiedData = pd.merge(modifiedData, d1c_unique, on='county', how='left')

print("\nModifiedData shape after merging:", modifiedData.shape)
print("ModifiedData columns:", modifiedData.columns.tolist())

CustomizedData shape: (1000, 10)
CustomizedData columns: ['fid', 'name', 'population', 'county', 'latitude', 'size', 'cardinal_direction', 'timings', 'school_type', 'funding']

--- Merging Datasets ---

ModifiedData shape after merging: (1000, 16)
ModifiedData columns: ['fid', 'name', 'population', 'county', 'latitude', 'size', 'cardinal_direction', 'timings', 'school_type', 'funding', 'longitude', 'level1', 'enrollment', 'level2', 'city', 'score']
