### Python Data Structures - Working with Data in Python

# Python Set Complete Guide

## What is a Set?

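A set in Python is an unordered collection of unique elements. Sets are mutable (they can be modified after creation), but every element must be hashable — in practice an immutable value such as a number, a string, or a tuple of such values. Non-empty sets can be written with curly braces (for example `{1, 2, 3}`) or built with the `set()` constructor; an empty set must be created with `set()`, because `{}` is an empty dictionary. Duplicate values are eliminated automatically.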


In [1]:

## Creating Sets

### Basic Set Creation

# Using curly braces
fruits = {"apple", "banana", "cherry"}
numbers = {1, 2, 3, 4, 5}
# Elements of different hashable types can share one set.
# Caution: True == 1 and hash(True) == hash(1), so a literal such as
# {1, "hello", 3.14, True} silently collapses to three elements.
# False is used here so that all four values stay distinct.
mixed = {1, "hello", 3.14, False}

# Using set() constructor
empty_set = set()  # Note: {} creates an empty dictionary, not set
fruits_set = set(["apple", "banana", "cherry"])
string_set = set("hello")  # {'h', 'e', 'l', 'o'} - duplicates removed

# From other iterables
list_to_set = set([1, 2, 2, 3, 3, 4])  # {1, 2, 3, 4}
tuple_to_set = set((1, 2, 3))           # {1, 2, 3}
range_set = set(range(5))               # {0, 1, 2, 3, 4}


In [None]:
### Set Characteristics

# Demonstration of set properties: building a set from a list keeps one
# copy of each value, so the set is shorter than the list it came from
original_list = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
unique_set = set(original_list)
print(f"Original list: {original_list}")
print(f"Set (unique): {unique_set}")        # {1, 2, 3, 4}
print(f"Length of list: {len(original_list)}")  # 10
print(f"Length of set: {len(unique_set)}")       # 4

# Sets are unordered: the two literals below list the same values in a
# different order and still compare equal
set1 = {1, 2, 3}
set2 = {3, 2, 1}
print(set1 == set2)  # True - order doesn't matter


In [None]:
## Adding and Removing Elements

### Adding Elements

# NOTE: the trailing comments list the expected *contents*; the order in
# which a set is displayed is arbitrary and may differ from what is shown.
fruits = {"apple", "banana"}

# Add single element
fruits.add("cherry")
print(fruits)  # {'apple', 'banana', 'cherry'}

# Add won't create duplicates
fruits.add("apple")  # No effect
print(fruits)  # {'apple', 'banana', 'cherry'}

# Update with multiple elements
fruits.update(["orange", "grape"])
print(fruits)  # {'apple', 'banana', 'cherry', 'orange', 'grape'}

# Update with different iterables
# A string is iterated character by character, so "kiwi" contributes
# 'k', 'i' and 'w' (the second 'i' is a duplicate) rather than the word itself
fruits.update("kiwi")  # Adds each character as element
print(fruits)  # {'apple', 'banana', 'cherry', 'orange', 'grape', 'k', 'i', 'w'}

# Update with multiple iterables
# (a list, a tuple and a set are merged in a single call)
numbers = {1, 2, 3}
numbers.update([4, 5], (6, 7), {8, 9})
print(numbers)  # {1, 2, 3, 4, 5, 6, 7, 8, 9}


In [2]:


### Removing Elements

# NOTE: the trailing comments list the expected *contents*; the display
# order of a set is arbitrary and may differ from what is shown.
fruits = {"apple", "banana", "cherry", "orange"}

# remove() - raises KeyError if element doesn't exist
fruits.remove("banana")
print(fruits)  # {'apple', 'cherry', 'orange'}

# "grape" was never added, so this remove() fails and the handler runs
try:
    fruits.remove("grape")  # KeyError
except KeyError:
    print("Element not found")

# discard() - doesn't raise error if element doesn't exist
fruits.discard("apple")
print(fruits)  # {'cherry', 'orange'}
fruits.discard("grape")  # No error, no effect

# pop() - removes and returns arbitrary element
# (which of the two remaining fruits is popped is not specified)
popped = fruits.pop()
print(f"Popped: {popped}")
print(f"Remaining: {fruits}")

# clear() - removes all elements
fruits.clear()
print(fruits)  # set()


{'orange', 'apple', 'cherry'}
Element not found
{'orange', 'cherry'}
Popped: orange
Remaining: {'cherry'}
set()


In [None]:
## Set Operations

### Mathematical Set Operations

#### Union (|, union())

# Three overlapping sets used by the union examples below
set1 = {1, 2, 3, 4}
set2 = {3, 4, 5, 6}
set3 = {5, 6, 7, 8}

# Union - all unique elements from both sets
# (operator form, method form, and the method form with several arguments)
union1 = set1 | set2                    # {1, 2, 3, 4, 5, 6}
union2 = set1.union(set2)               # {1, 2, 3, 4, 5, 6}
union3 = set1.union(set2, set3)         # {1, 2, 3, 4, 5, 6, 7, 8}

# Union with other iterables
# (the method form is given a list here instead of a set)
union4 = set1.union([7, 8, 9])          # {1, 2, 3, 4, 7, 8, 9}

# Only union1 is printed; union2-union4 are computed for illustration
print(f"Set1 ∪ Set2: {union1}")


In [None]:
#### Intersection (&, intersection())

# Three overlapping sets used by the intersection examples below
set1 = {1, 2, 3, 4, 5}
set2 = {3, 4, 5, 6, 7}
set3 = {4, 5, 6, 7, 8}

# Intersection - common elements
# (operator form, method form, and the method form with several arguments)
intersection1 = set1 & set2             # {3, 4, 5}
intersection2 = set1.intersection(set2) # {3, 4, 5}
intersection3 = set1.intersection(set2, set3)  # {4, 5}

# intersection2 equals intersection1 and is therefore not printed
print(f"Set1 ∩ Set2: {intersection1}")
print(f"Set1 ∩ Set2 ∩ Set3: {intersection3}")


In [None]:
#### Difference (-, difference())

# Two overlapping sets used by the difference examples below
set1 = {1, 2, 3, 4, 5}
set2 = {3, 4, 5, 6, 7}

# Difference - elements in first set but not in second
# (diff1 and diff3 show that the result depends on operand order)
diff1 = set1 - set2                     # {1, 2}
diff2 = set1.difference(set2)           # {1, 2}
diff3 = set2 - set1                     # {6, 7}

print(f"Set1 - Set2: {diff1}")
print(f"Set2 - Set1: {diff3}")

# Difference with multiple sets
# (elements of set2 and of set3 are both removed from set1; the result is
# stored in diff4 but not printed)
set3 = {1, 6}
diff4 = set1.difference(set2, set3)     # {2}


In [1]:

#### Symmetric Difference (^, symmetric_difference())

# Two overlapping sets: 3 and 4 are shared, the other values are not
set1 = {1, 2, 3, 4}
set2 = {3, 4, 5, 6}

# Symmetric difference - elements in either set but not in both
# (operator form and equivalent method form)
sym_diff1 = set1 ^ set2                         # {1, 2, 5, 6}
sym_diff2 = set1.symmetric_difference(set2)     # {1, 2, 5, 6}

print(f"Set1 ⊕ Set2: {sym_diff1}")

# Symmetric difference is commutative
# (swapping the operands yields an equal set)
print(set1 ^ set2 == set2 ^ set1)      # True


Set1 ⊕ Set2: {1, 2, 5, 6}
True


In [None]:
### In-Place Set Operations

set1 = {1, 2, 3, 4}
set2 = {3, 4, 5, 6}

# In-place union
# Snapshot taken before the update: the in-place operator mutates set1
# itself, while this independent copy keeps the original contents.
original_set1 = set1.copy()
# The operator and the method are alternatives; only one of them is needed.
set1 |= set2                    # Same as set1.update(set2)
print(f"After |=: {set1}")     # {1, 2, 3, 4, 5, 6}

# In-place intersection
set1 = {1, 2, 3, 4, 5, 6}
set1 &= {2, 3, 4}              # Same as set1.intersection_update({2, 3, 4})
print(f"After &=: {set1}")     # {2, 3, 4}

# In-place difference
set1 = {1, 2, 3, 4, 5}
set1 -= {3, 4}                 # Same as set1.difference_update({3, 4})
print(f"After -=: {set1}")     # {1, 2, 5}

# In-place symmetric difference
set1 = {1, 2, 3, 4}
set1 ^= {3, 4, 5, 6}           # Same as set1.symmetric_difference_update({3, 4, 5, 6})
print(f"After ^=: {set1}")     # {1, 2, 5, 6}


In [None]:
## Set Relationships and Comparisons

### Subset and Superset Relationships

# set3 ⊂ set1 ⊂ set2, while set4 shares no element with any of them
set1 = {1, 2, 3}
set2 = {1, 2, 3, 4, 5}
set3 = {1, 2}
set4 = {6, 7}

# Subset relationships
# (method form and the equivalent <= operator)
print(f"{set1} is subset of {set2}: {set1.issubset(set2)}")           # True
print(f"{set1} <= {set2}: {set1 <= set2}")                            # True
print(f"{set3} is subset of {set1}: {set3.issubset(set1)}")           # True

# Proper subset (strict subset)
# (a set is never a proper subset of itself)
print(f"{set1} < {set2}: {set1 < set2}")                              # True
print(f"{set1} < {set1}: {set1 < set1}")                              # False (not proper)

# Superset relationships
# (method form and the equivalent >= operator)
print(f"{set2} is superset of {set1}: {set2.issuperset(set1)}")       # True
print(f"{set2} >= {set1}: {set2 >= set1}")                            # True

# Proper superset
print(f"{set2} > {set1}: {set2 > set1}")                              # True

# Disjoint sets (no common elements)
print(f"{set1} and {set4} are disjoint: {set1.isdisjoint(set4)}")     # True
print(f"{set1} and {set2} are disjoint: {set1.isdisjoint(set2)}")     # False


In [None]:
### Set Equality

# Four literals: the first three describe the same set, the fourth does not
set1 = {1, 2, 3}
set2 = {3, 2, 1}  # Different order
set3 = {1, 2, 3, 3}  # Duplicate (automatically removed)
set4 = {1, 2, 4}

# Equality depends only on which elements are present
print(f"{set1} == {set2}: {set1 == set2}")  # True - order doesn't matter
print(f"{set1} == {set3}: {set1 == set3}")  # True - duplicates ignored
print(f"{set1} == {set4}: {set1 == set4}")  # False - different elements


In [None]:
## Set Membership and Iteration

### Membership Testing

fruits = {"apple", "banana", "cherry", "orange"}

# Check membership (very fast - O(1) average case)
print("apple" in fruits)       # True
print("grape" in fruits)       # False
print("grape" not in fruits)   # True

# Membership testing is much faster than lists for large collections
import time

large_list = list(range(100000))
large_set = set(large_list)

# A single lookup is far shorter than the resolution of time.time() on many
# platforms, so the high-resolution time.perf_counter() clock is used instead.

# Timing list membership
# (99999 is the last element, i.e. the longest possible scan of the list)
start = time.perf_counter()
99999 in large_list
list_time = time.perf_counter() - start

# Timing set membership
start = time.perf_counter()
99999 in large_set
set_time = time.perf_counter() - start

print(f"List membership time: {list_time:.6f}s")
print(f"Set membership time: {set_time:.6f}s")


In [None]:
### Iterating Through Sets

fruits = {"apple", "banana", "cherry", "orange"}

# Basic iteration (order is arbitrary)
for fruit in fruits:
    print(fruit)

# Enumerate with index
# (the index only numbers the arbitrary iteration order; it is not a stable
# position inside the set)
for index, fruit in enumerate(fruits):
    print(f"{index}: {fruit}")

# Sorted iteration
for fruit in sorted(fruits):
    print(fruit)

# Iteration with condition
# The filtered names come out of the set in arbitrary order, so they are
# sorted to make the resulting list deterministic.
long_names = sorted(fruit for fruit in fruits if len(fruit) > 5)
print(long_names)  # ['banana', 'cherry', 'orange']


In [None]:
## Set Comprehensions

### Basic Set Comprehensions

# Square numbers
squares = {x**2 for x in range(1, 6)}
print(squares)  # {1, 4, 9, 16, 25}

# Even numbers
evens = {x for x in range(1, 11) if x % 2 == 0}
print(evens)  # {2, 4, 6, 8, 10}

# String manipulation
# ("hello" and "world" both have length 5, so four words yield three lengths)
words = ["hello", "world", "python", "programming"]
lengths = {len(word) for word in words}
print(lengths)  # {5, 6, 11}

# First letters
# ("python" and "programming" share the initial 'P')
first_letters = {word[0].upper() for word in words}
print(first_letters)  # {'H', 'W', 'P'}


In [None]:
### Advanced Set Comprehensions

# Nested comprehensions
# (the outer loop walks the rows, the inner loop walks each row's values)
matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
all_elements = {element for row in matrix for element in row}
print(all_elements)  # {1, 2, 3, 4, 5, 6, 7, 8, 9}

# Conditional comprehensions
# (keeps the numbers from 1 to 20 that are divisible by 3 or by 5)
numbers = range(1, 21)
special_numbers = {
    x for x in numbers 
    if x % 3 == 0 or x % 5 == 0
}
print(special_numbers)  # {3, 5, 6, 9, 10, 12, 15, 18, 20}

# From dictionary
# (collects the names whose grade is at least 90)
grades = {"Alice": 85, "Bob": 92, "Charlie": 78, "Diana": 96}
high_performers = {name for name, grade in grades.items() if grade >= 90}
print(high_performers)  # {'Bob', 'Diana'}


In [None]:
## Frozen Sets

### Creating and Using Frozen Sets

# Frozen sets are immutable sets
regular_set = {1, 2, 3}
frozen_set = frozenset([1, 2, 3, 4])

print(type(regular_set))    # <class 'set'>
print(type(frozen_set))     # <class 'frozenset'>

# Frozen sets can be used as dictionary keys or set elements
# (despite its name, nested_sets is a dict whose keys are frozensets)
nested_sets = {
    frozenset([1, 2]): "first",
    frozenset([3, 4]): "second"
}

# A set whose elements are themselves frozensets
set_of_sets = {
    frozenset([1, 2, 3]),
    frozenset([4, 5, 6]),
    frozenset([7, 8, 9])
}

# Frozen sets support all non-mutating operations
fs1 = frozenset([1, 2, 3])
fs2 = frozenset([2, 3, 4])

print(fs1 | fs2)           # frozenset({1, 2, 3, 4})
print(fs1 & fs2)           # frozenset({2, 3})
print(fs1 - fs2)           # frozenset({1})

# But not mutating operations
# (frozenset has no add() method, so the attribute lookup itself fails)
try:
    fs1.add(4)  # AttributeError
except AttributeError:
    print("Frozen sets are immutable")


In [None]:
## Practical Applications

### Data Deduplication

def remove_duplicates(data):
    """Return the items of ``data`` as a list with repeats dropped.

    Only the first occurrence of each item is kept, so the relative order
    of the surviving items matches the input. Items must be hashable
    because a set tracks what has already been emitted.
    """
    already_emitted = set()
    unique_items = []
    for candidate in data:
        if candidate in already_emitted:
            continue  # a later repeat of something already kept
        already_emitted.add(candidate)
        unique_items.append(candidate)
    return unique_items

# Example usage
# (the input list is left untouched; a new de-duplicated list is returned)
data = [1, 2, 3, 2, 4, 3, 5, 1]
unique_data = remove_duplicates(data)
print(f"Original: {data}")
print(f"Unique: {unique_data}")  # [1, 2, 3, 4, 5]

# Alternative using dict (Python 3.7+ maintains order)
def remove_duplicates_dict(data):
    """Return the distinct items of ``data`` in first-seen order.

    The items become the keys of a temporary dict, which keeps one entry
    per distinct key in insertion order.
    """
    first_seen = dict.fromkeys(data)
    return [*first_seen]

# Same input list, de-duplicated through dict keys instead of an explicit loop
print(remove_duplicates_dict(data))  # [1, 2, 3, 4, 5]


In [None]:
### Finding Common Elements

def find_common_interests(users_interests):
    """Return the set of interests that appear in every user's collection.

    ``users_interests`` is an indexable sequence with one collection of
    interests per user. An empty sequence yields an empty set.
    """
    if not users_interests:
        return set()

    first_user, other_users = users_interests[0], users_interests[1:]
    shared = set(first_user)
    # Narrow the candidates down with every remaining user's interests.
    shared.intersection_update(*other_users)
    return shared

def find_any_common_interests(users_interests):
    """Find interests shared by at least two users.

    Args:
        users_interests: Iterable with one collection of (hashable)
            interests per user. It is traversed exactly once, so one-shot
            iterables such as generators are supported.

    Returns:
        A set with every interest that appears in the collections of two
        or more users. An interest repeated by a single user counts once.
    """
    from collections import Counter  # local import keeps the cell self-contained

    users_per_interest = Counter()
    for interests in users_interests:
        # set() so that a user listing an interest twice is counted once
        users_per_interest.update(set(interests))

    return {
        interest
        for interest, user_count in users_per_interest.items()
        if user_count >= 2
    }

# Example usage
# (one list of interests per user; "movies" is the only entry in all four)
users = [
    ["reading", "movies", "hiking"],
    ["movies", "cooking", "traveling"],
    ["reading", "movies", "photography"],
    ["movies", "hiking", "music"]
]

# The display order of the printed sets is arbitrary
print("Common to all:", find_common_interests(users))     # {'movies'}
print("Common to some:", find_any_common_interests(users)) # {'movies', 'reading', 'hiking'}


In [None]:
### Set-based Algorithms

def word_frequency_analysis(text1, text2):
    """Analyze word usage differences between two texts.

    Both texts are lower-cased and split on whitespace; punctuation is not
    stripped, so "dog" and "dog." count as different words.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A dict with the keys ``common_words``, ``unique_to_text1``,
        ``unique_to_text2`` and ``all_words`` (sets of words) plus
        ``similarity_ratio``: the number of common words divided by the
        number of distinct words overall, or 0.0 when neither text
        contains any word.
    """
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    common_words = words1 & words2
    all_words = words1 | words2

    return {
        "common_words": common_words,
        "unique_to_text1": words1 - words2,
        "unique_to_text2": words2 - words1,
        "all_words": all_words,
        # Guard the division: two empty or whitespace-only texts have no
        # words at all, which previously raised ZeroDivisionError.
        "similarity_ratio": len(common_words) / len(all_words) if all_words else 0.0
    }

def find_missing_numbers(numbers, expected_range):
    """Return, in ascending order, the integers of a range absent from ``numbers``.

    ``expected_range`` is a ``(first, last)`` pair; both ends are inclusive.
    """
    first, last = expected_range[0], expected_range[1]
    present = set(numbers)
    # Walking the range in order yields an already-sorted result.
    return [value for value in range(first, last + 1) if value not in present]

def group_anagrams(words):
    """Return the sets of words that are anagrams of one another.

    Words are compared case-insensitively by their sorted letters. Only
    groups holding at least two distinct words are returned, in the order
    in which each group's first word appeared.
    """
    buckets = {}
    for word in words:
        # Anagrams share the same letters, hence the same sorted signature.
        signature = ''.join(sorted(word.lower()))
        buckets.setdefault(signature, set()).add(word)

    return [members for members in buckets.values() if len(members) >= 2]

# Examples
# Compare the vocabulary of two short sentences
text1 = "the quick brown fox jumps over the lazy dog"
text2 = "a quick brown fox runs through the forest"
analysis = word_frequency_analysis(text1, text2)
print("Text Analysis:", analysis)

# Look for the gaps in 1..12 (both ends of the range are inclusive)
numbers = [1, 3, 5, 7, 10, 12]
missing = find_missing_numbers(numbers, (1, 12))
print("Missing numbers:", missing)  # [2, 4, 6, 8, 9, 11]

# "bat" has no anagram partner in the list and is therefore not reported
words = ["eat", "tea", "tan", "ate", "nat", "bat"]
anagrams = group_anagrams(words)
print("Anagram groups:", anagrams)  # [{'eat', 'tea', 'ate'}, {'tan', 'nat'}]


In [None]:
### Permissions and Access Control

class User:
    """A named user holding a private set of permission names."""

    def __init__(self, name, permissions=None):
        """Store ``name`` and copy ``permissions`` (if any) into a new set."""
        self.name = name
        # A fresh set is always built, so the caller's collection is never shared.
        self.permissions = set(permissions or ())

    def add_permission(self, permission):
        """Grant ``permission``; granting it twice has no further effect."""
        self.permissions.add(permission)

    def remove_permission(self, permission):
        """Revoke ``permission``; revoking a missing one is silently ignored."""
        self.permissions.discard(permission)

    def has_permission(self, permission):
        """Return True when ``permission`` is currently granted."""
        return permission in self.permissions

    def has_any_permission(self, required_permissions):
        """Return True when at least one of ``required_permissions`` is granted."""
        return not self.permissions.isdisjoint(required_permissions)

    def has_all_permissions(self, required_permissions):
        """Return True when every one of ``required_permissions`` is granted."""
        return self.permissions.issuperset(required_permissions)

class Role:
    """A named bundle of permissions."""

    def __init__(self, name, permissions):
        """Store ``name`` and the distinct entries of ``permissions`` as a set."""
        self.name = name
        self.permissions = {*permissions}

def check_access(user, required_permissions, require_all=True):
    """Check whether ``user`` satisfies ``required_permissions``.

    With ``require_all`` truthy the answer comes from
    ``user.has_all_permissions``; otherwise from ``user.has_any_permission``.
    """
    checker = user.has_all_permissions if require_all else user.has_any_permission
    return checker(required_permissions)

# Example usage
# Three roles with progressively fewer permissions
admin_role = Role("admin", ["read", "write", "delete", "manage_users"])
editor_role = Role("editor", ["read", "write", "edit"])
viewer_role = Role("viewer", ["read"])

# Create users
# (User copies the permissions into its own set, so later changes to a
# user do not alter the role)
alice = User("Alice", admin_role.permissions)
bob = User("Bob", editor_role.permissions)
charlie = User("Charlie", viewer_role.permissions)

# Check permissions
print(f"Alice can delete: {alice.has_permission('delete')}")        # True
print(f"Bob can delete: {bob.has_permission('delete')}")            # False
print(f"Charlie can write: {charlie.has_permission('write')}")      # False

# Check multiple permissions
# (check_access defaults to require_all=True, so both entries are required)
write_permissions = ["read", "write"]
print(f"Bob has write access: {check_access(bob, write_permissions)}")  # True
print(f"Charlie has write access: {check_access(charlie, write_permissions)}")  # False


In [None]:
## Performance Considerations

### Set vs List Performance

import time
import random

# Create test data
list_data = list(range(10000))
set_data = set(list_data)
# A dedicated, seeded generator keeps the sampled lookups identical on every
# run without touching the global random state.
search_items = random.Random(42).sample(list_data, 1000)

# Timing membership tests
def time_membership_tests(items=None, list_container=None, set_container=None):
    """Time ``in`` lookups against a list and against a set, then print a report.

    Args:
        items: Values to look up. Defaults to the module-level ``search_items``.
        list_container: List to search. Defaults to the module-level ``list_data``.
        set_container: Set to search. Defaults to the module-level ``set_data``.

    Prints both durations and, when the set duration is measurable, how many
    times faster the set lookups were. Returns None.
    """
    if items is None:
        items = search_items
    if list_container is None:
        list_container = list_data
    if set_container is None:
        set_container = set_data

    # perf_counter() is the high-resolution clock meant for short durations;
    # time.time() can report 0.0 for work this brief.

    # List membership
    start = time.perf_counter()
    for item in items:
        item in list_container
    list_time = time.perf_counter() - start

    # Set membership
    start = time.perf_counter()
    for item in items:
        item in set_container
    set_time = time.perf_counter() - start

    print(f"List membership time: {list_time:.4f}s")
    print(f"Set membership time: {set_time:.4f}s")
    if set_time > 0:
        print(f"Set is {list_time/set_time:.1f}x faster")
    else:
        # Avoid ZeroDivisionError when the clock could not resolve the set loop.
        print("Set membership was too fast to measure a speed-up ratio")

# Memory usage comparison
import sys

def compare_memory_usage():
    """Print the ``sys.getsizeof`` figures of a 1000-element list and set.

    The set is built from the list, so both containers hold the same
    values. Three lines are printed: the list size, the set size, and the
    set size expressed as a multiple of the list size.
    """
    sample_list = list(range(1000))
    sample_set = set(sample_list)

    list_size, set_size = sys.getsizeof(sample_list), sys.getsizeof(sample_set)
    ratio = set_size / list_size

    print(f"List memory usage: {list_size} bytes")
    print(f"Set memory usage: {set_size} bytes")
    print(f"Set uses {ratio:.1f}x more memory")

# Run both comparisons; each function prints its own report
time_membership_tests()
compare_memory_usage()


In [None]:
### Optimizing Set Operations

def efficient_unique_elements(*iterables):
    """Return one set holding every distinct element of the given iterables.

    Called without arguments it returns an empty set.
    """
    # A single union call merges all inputs into a fresh set.
    return set().union(*iterables)

def batch_membership_check(items, valid_items):
    """Return the entries of ``items`` that also occur in ``valid_items``.

    The result is a list that keeps the order (and any repeats) of
    ``items``; ``valid_items`` is converted to a set once so each lookup
    is a set membership test.
    """
    allowed = set(valid_items)
    matches = []
    for candidate in items:
        if candidate in allowed:
            matches.append(candidate)
    return matches

def find_duplicates_efficiently(data):
    """Return the set of items that occur more than once in ``data``.

    A single pass is made over ``data``; an item is reported once it is
    met for the second time.
    """
    first_sightings = set()
    repeated = set()

    for element in data:
        # Seen before -> it is a duplicate; otherwise remember the first sighting.
        destination = repeated if element in first_sightings else first_sightings
        destination.add(element)

    return repeated

# Examples
# Three overlapping lists whose distinct values are merged into one set
data1 = [1, 2, 3, 4]
data2 = [3, 4, 5, 6]
data3 = [5, 6, 7, 8]

all_unique = efficient_unique_elements(data1, data2, data3)
print("All unique elements:", all_unique)

# Keep only the candidates that appear in valid_items (order of the
# candidates is preserved in the returned list)
items_to_check = [1, 5, 9, 3, 7, 2]
valid_items = [1, 2, 3, 4, 5]
valid_found = batch_membership_check(items_to_check, valid_items)
print("Valid items found:", valid_found)

# 1, 2 and 3 each occur twice in this list
data_with_duplicates = [1, 2, 3, 2, 4, 3, 5, 1, 6]
duplicates = find_duplicates_efficiently(data_with_duplicates)
print("Duplicates found:", duplicates)


In [None]:
## Advanced Set Techniques

### Custom Hashable Objects in Sets

class Point:
    """A 2-D point that can be stored in sets and used as a dict key.

    Equality and hashing are both derived from ``(x, y)``, so two points
    with the same coordinates count as the same set element.

    Caution: ``x`` and ``y`` are ordinary attributes. Changing them while
    the point is stored in a set or dict changes its hash and makes the
    stored entry unreliable to look up.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __hash__(self):
        # Must agree with __eq__: equal points need equal hashes.
        return hash((self.x, self.y))

    def __eq__(self, other):
        if not isinstance(other, Point):
            # Let Python try the other operand's __eq__ (and finally fall
            # back to "not equal") instead of forcing False ourselves.
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def __repr__(self):
        return f"Point({self.x}, {self.y})"

# Using custom objects in sets
# (Point hashes and compares by its coordinates, so the second Point(1, 2)
# is recognised as already present)
points = {
    Point(1, 2),
    Point(3, 4),
    Point(1, 2),  # Duplicate, will be ignored
    Point(5, 6)
}

print("Unique points:", points)
# A freshly created Point with equal coordinates is found in the set
print("Contains Point(1, 2):", Point(1, 2) in points)


In [4]:
### Set-based Data Processing Pipeline

class DataProcessor:
    """Processes batches of dict items while tracking ids across batches.

    ``processed_ids`` collects the ids handled successfully, ``error_ids``
    the ids that failed, and ``valid_categories`` the accepted values of an
    item's "category" field.
    """

    def __init__(self):
        self.processed_ids = set()
        self.error_ids = set()
        self.valid_categories = {"A", "B", "C", "D"}

    def process_batch(self, data_batch):
        """Process a batch of data items.

        Returns a dict with the id sets "successful", "failed" and
        "skipped" for this batch. Ids already in ``processed_ids`` are
        skipped; every failure is also recorded in ``error_ids``.
        """
        outcome = {"successful": set(), "failed": set(), "skipped": set()}

        for record in data_batch:
            record_id = record.get("id")

            # Ids that already succeeded in an earlier call are not reprocessed.
            if record_id in self.processed_ids:
                outcome["skipped"].add(record_id)
                continue

            if self._is_processable(record):
                outcome["successful"].add(record_id)
                self.processed_ids.add(record_id)
            else:
                outcome["failed"].add(record_id)
                self.error_ids.add(record_id)

        return outcome

    def _is_processable(self, record):
        """Return True when the record has a valid category and processes cleanly."""
        if record.get("category") not in self.valid_categories:
            return False
        try:
            return bool(self._process_item(record))
        except Exception:
            # An exception while processing counts as a failed item.
            return False

    def _process_item(self, item):
        """Simulated processing step: succeeds when the item's "value" is positive."""
        return item.get("value", 0) > 0

    def get_stats(self):
        """Return totals of processed and failed ids plus the resulting error rate.

        The error rate is errors / (processed + errors), or 0 when nothing
        has been recorded yet.
        """
        processed_count = len(self.processed_ids)
        error_count = len(self.error_ids)
        attempts = processed_count + error_count
        return {
            "total_processed": processed_count,
            "total_errors": error_count,
            "error_rate": error_count / attempts if attempts > 0 else 0
        }

# Example usage
processor = DataProcessor()

# First batch: two valid positive items, one non-positive value, one
# category outside the processor's valid_categories
batch1 = [
    {"id": 1, "category": "A", "value": 10},
    {"id": 2, "category": "B", "value": -5},
    {"id": 3, "category": "X", "value": 15},  # Invalid category
    {"id": 4, "category": "C", "value": 20}
]

# Second batch: re-sends id 1 (already processed by then) and includes an
# item whose value of 0 does not pass the "value > 0" check
batch2 = [
    {"id": 1, "category": "A", "value": 12},  # Duplicate ID
    {"id": 5, "category": "D", "value": 8},
    {"id": 6, "category": "A", "value": 0}
]

# The same processor handles both batches, so its id sets carry over
result1 = processor.process_batch(batch1)
result2 = processor.process_batch(batch2)

print("Batch 1 results:", result1)
print("Batch 2 results:", result2)
print("Processing stats:", processor.get_stats())


Batch 1 results: {'successful': {1, 4}, 'failed': {2, 3}, 'skipped': set()}
Batch 2 results: {'successful': {5}, 'failed': {6}, 'skipped': {1}}
Processing stats: {'total_processed': 3, 'total_errors': 3, 'error_rate': 0.5}
