# os module – File & Directory Operations

**Q:1 Create, Read, Rename, Remove files and folder**

**Q:1 Get the current working directory and navigate to a sibling directory**

**Q:2 Recursively list all files in a directory using only os.**

**Q:3 Check if a given path is a file, directory, or doesn't exist.**

**Q:4 Create a deeply nested folder structure like**

**Q:5 Delete all empty directories from a given folder tree.**

**Q:6 Count the number of .txt files in a directory using os.listdir().**

**Q:7 Move files from one folder to another, creating the destination if needed.**

**Q:8 Rename all .log files to .log.bak within a folder.**

**Q:9 Print the total size of all files in a directory in MB.**

**Q:10 Print the directory tree with indentation (like the tree command).**

**Q:11 Write a function that synchronizes the structure of two directory trees (mirror mode).**

**Q:12 Implement a safe folder deletion function that first moves the folder to a Trash directory.**

**Q13: Find and print the most recently modified file in a directory recursively.**

**Q:14 Generate a directory report (file count, total size, subfolders) in JSON format.**

**Q:15 Track changes (additions/removals) in a directory over time using file snapshots.**



In [76]:
import os

base_path = r"C:\ML1 BootCamp\Muhammad Humza\Day 4 Task"

# Q1 - Create, Read, Rename, Remove files and folder
os.mkdir(os.path.join(base_path, 'test_folder'))
with open(os.path.join(base_path, 'test_folder', 'sample.txt'), 'w') as f:
    f.write('Hello')
with open(os.path.join(base_path, 'test_folder', 'sample.txt'), 'r') as f:
    print(f.read())
os.rename(
    os.path.join(base_path, 'test_folder', 'sample.txt'),
    os.path.join(base_path, 'test_folder', 'renamed.txt')
)
os.remove(os.path.join(base_path, 'test_folder', 'renamed.txt'))
os.rmdir(os.path.join(base_path, 'test_folder'))


Hello


In [78]:
#Get current working directory and navigate to sibling directory
def q1b():
    cwd = os.getcwd()
    parent = os.path.dirname(cwd)
    sibling = os.path.join(parent, 'sibling_directory')
    print(cwd)
    print(sibling)

In [80]:
# Q2 - Recursively list all files
for root, dirs, files in os.walk(base_path):
    for file in files:
        print(os.path.join(root, file))


C:\ML1 BootCamp\Muhammad Humza\Day 4 Task\Filing.ipynb
C:\ML1 BootCamp\Muhammad Humza\Day 4 Task\Solved_Filing.ipynb
C:\ML1 BootCamp\Muhammad Humza\Day 4 Task\Custom CNN\test_folder\renamed.txt


In [82]:
# Q3 - Check if path is file, directory, or doesn't exist
path = os.path.join(base_path, 'test_folder')
if os.path.isfile(path):
    print('File')
elif os.path.isdir(path):
    print('Directory')
else:
    print('Does not exist')



Does not exist


In [84]:
# Q4 - Create deeply nested folders
os.makedirs(os.path.join(base_path, 'a/b/c/d/e'))

In [86]:
# Q5 - Delete all empty directories
for root, dirs, files in os.walk(base_path, topdown=False):
    for d in dirs:
        full_path = os.path.join(root, d)
        if not os.listdir(full_path):
            os.rmdir(full_path)

In [88]:
# Q6 - Count .txt files using os.listdir()
count = 0
for file in os.listdir(base_path):
    if file.endswith('.txt'):
        full = os.path.join(base_path, file)
        if os.path.isfile(full):
            count += 1
print(count)

0


In [90]:
# Q7 - Move files to another folder (create destination if needed)
src = os.path.join(base_path, 'move_from')
dst = os.path.join(base_path, 'move_to')
if not os.path.exists(dst):
    os.makedirs(dst)
if os.path.exists(src):
    for file in os.listdir(src):
        s = os.path.join(src, file)
        d = os.path.join(dst, file)
        if os.path.isfile(s):
            os.rename(s, d)

In [92]:
# Q8 - Rename all .log files to .log.bak
for file in os.listdir(base_path):
    if file.endswith('.log'):
        full = os.path.join(base_path, file)
        if os.path.isfile(full):
            os.rename(full, full + '.bak')


In [94]:
# Q9 - Total size of all files in MB
total = 0
for file in os.listdir(base_path):
    full = os.path.join(base_path, file)
    if os.path.isfile(full):
        total += os.path.getsize(full)
print(total / (1024 * 1024))


0.042804718017578125


In [96]:
# Q10 - Print directory tree with indentation
def tree(path, indent=0):
    for item in os.listdir(path):
        full = os.path.join(path, item)
        print(' ' * indent + item)
        if os.path.isdir(full):
            tree(full, indent + 4)
tree(base_path)

Custom CNN
    test_folder
        renamed.txt
Filing.ipynb
move_to
Solved_Filing.ipynb


In [98]:
# Q11 - Sync structure of two directory trees (mirror mode)
src = os.path.join(base_path, 'dir1')
dst = os.path.join(base_path, 'dir2')
if os.path.exists(src):
    for root, dirs, files in os.walk(src):
        rel_path = os.path.relpath(root, src)
        target_root = os.path.join(dst, rel_path)
        if not os.path.exists(target_root):
            os.makedirs(target_root)
        for file in files:
            src_file = os.path.join(root, file)
            dst_file = os.path.join(target_root, file)
            if not os.path.exists(dst_file):
                with open(src_file, 'rb') as f1, open(dst_file, 'wb') as f2:
                    f2.write(f1.read())

In [100]:
# Q12 - Safe delete to Trash folder
trash = os.path.join(os.path.expanduser('~'), 'Trash')
if not os.path.exists(trash):
    os.makedirs(trash)
target = os.path.join(base_path, 'folder_to_delete')
if os.path.exists(target):
    new_path = os.path.join(trash, os.path.basename(target))
    os.rename(target, new_path)

In [102]:
# Q13 - Most recently modified file recursively
latest = ''
latest_time = 0
for root, dirs, files in os.walk(base_path):
    for file in files:
        full = os.path.join(root, file)
        mtime = os.path.getmtime(full)
        if mtime > latest_time:
            latest_time = mtime
            latest = full
print(latest)


C:\ML1 BootCamp\Muhammad Humza\Day 4 Task\Custom CNN\test_folder\renamed.txt


In [104]:
# Q14 - Directory report
files = 0
size = 0
subfolders = 0
for root, dirs, file_list in os.walk(base_path):
    files += len(file_list)
    subfolders += len(dirs)
    for file in file_list:
        size += os.path.getsize(os.path.join(root, file))
report = '{ "files": %d, "size": %d, "subfolders": %d }' % (files, size, subfolders)
print(report)


{ "files": 3, "size": 44889, "subfolders": 3 }


In [126]:
# Q15 - Snapshot changes
import time, pickle
directory = r"C:\ML1 BootCamp\Muhammad Humza\Day 4 Task\Custom CNN"

t1 = time.time()
s1 = {os.path.relpath(os.path.join(r, f), directory): os.path.getmtime(os.path.join(r, f))
      for r, _, files in os.walk(directory) for f in files if os.path.exists(os.path.join(r, f))}
with open('snapshot1.pkl', 'wb') as f: pickle.dump((t1, s1), f)

input()

t2 = time.time()
s2 = {os.path.relpath(os.path.join(r, f), directory): os.path.getmtime(os.path.join(r, f))
      for r, _, files in os.walk(directory) for f in files if os.path.exists(os.path.join(r, f))}
with open('snapshot2.pkl', 'wb') as f: pickle.dump((t2, s2), f)

with open('snapshot1.pkl', 'rb') as f: t1, s1 = pickle.load(f)
with open('snapshot2.pkl', 'rb') as f: t2, s2 = pickle.load(f)

a = set(s2) - set(s1)
r = set(s1) - set(s2)
m = {f for f in s1 if f in s2 and s1[f] != s2[f]}

print("Changes from", time.ctime(t1), "to", time.ctime(t2))
print("Added:", a)
print("Removed:", r)
print("Modified:", m)


 


Changes from Sun Jun 22 06:09:57 2025 to Sun Jun 22 06:09:58 2025
Added: set()
Removed: set()
Modified: set()



# glob module – Pattern Matching

**Q:1 List all .csv and .json files in the current directory.**

**Q:2 Recursively find all .jpg files in nested folders.**

**Q:3 Use glob to count files grouped by extension.**

**Q:4 Find files with names matching pattern report_*.txt.**

**Q:5 Replace spaces with underscores in filenames found via glob.**

**Q:6 Return all files with a date in the format 2025-06-*.log.**

**Q:7 List all files with numeric names only (e.g., 123.txt).**

**Q:8 Use glob to sort files by last modified time.**

**Q:9 Find all .txt files larger than 100KB using glob and os.**

**Q:10 Batch rename files with a custom suffix _archived.**

**Q:11 Create a utility that indexes all media files and stores the paths in a SQLite DB.**

**Q:12 Find duplicate filenames (regardless of path) across a directory tree.**

**Q:13 Generate a file manifest with relative paths and hash (MD5) of contents.**

**Q14: Use glob patterns dynamically to extract weekly reports (e.g., week_01.json, week_02.json).**

**Q:15 Write a recursive file crawler that ignores folders listed in a .ignore file.**


In [2]:
import os
import glob
import hashlib
import sqlite3
from collections import defaultdict

path = r"C:\ML1 BootCamp\Muhammad Humza\Day 4 Task"

q1 = glob.glob(os.path.join(path, "*.csv")) + glob.glob(os.path.join(path, "*.json"))
print("Q1:", q1)

Q1: []


In [4]:
q2 = glob.glob(os.path.join(path, "**", "*.jpg"), recursive=True)
print("Q2:", q2)


Q2: []


In [6]:
q3 = defaultdict(int)
for f in glob.glob(os.path.join(path, "**", "*.*"), recursive=True):
    ext = os.path.splitext(f)[1]
    q3[ext] += 1
print("Q3:", dict(q3))

Q3: {'.ipynb': 2, '.txt': 1}


In [28]:
q4 = glob.glob(os.path.join(path, "**", "new*.txt"), recursive=True)
print("Q4:", q4)


Q4: ['C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Custom CNN\\test_folder\\new.txt']


In [35]:
path = r"C:\ML1 BootCamp\Muhammad Humza\Day 4 Task"
q5 = []

for f in glob.glob(os.path.join(path, "**", "* *"), recursive=True):
    dir_name = os.path.dirname(f)
    base_name = os.path.basename(f)
    new_base = base_name.replace(" ", "_")
    new_path = os.path.join(dir_name, new_base)

    try:
        os.rename(f, new_path)
        q5.append((f, new_path))
    except PermissionError:
        print("Skipped (in use):", f)

print("Q5:", q5)

Skipped (in use): C:\ML1 BootCamp\Muhammad Humza\Day 4 Task\Custom CNN
Q5: []


In [37]:
q6 = glob.glob(os.path.join(path, "2025-06-*.log"))
print("Q6:", q6)

Q6: []


In [39]:
q7 = []
for f in glob.glob(os.path.join(path, "**", "[0-9]*.txt"), recursive=True):
    if os.path.basename(f).split('.')[0].isdigit():
        q7.append(f)
print("Q7:", q7)

Q7: []


In [41]:
q8 = sorted(glob.glob(os.path.join(path, "**", "*"), recursive=True), key=os.path.getmtime)
print("Q8:", q8)


Q8: ['C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Filing.ipynb', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Custom CNN', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\move_to', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Custom CNN\\test_folder\\new.txt', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Custom CNN\\test_folder', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Solved_Filing.ipynb']


In [43]:
q9 = []
for f in glob.glob(os.path.join(path, "**", "*.txt"), recursive=True):
    if os.path.getsize(f) > 100 * 1024:
        q9.append(f)
print("Q9:", q9)

Q9: []


In [45]:
q10 = []
for f in glob.glob(os.path.join(path, "**", "*.*"), recursive=True):
    root, ext = os.path.splitext(f)
    new = root + "_archived" + ext
    os.rename(f, new)
    q10.append((f, new))
print("Q10:", q10)

media_exts = ['*.mp4', '*.mp3', '*.avi', '*.mov', '*.jpg', '*.png']

Q10: [('C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Filing.ipynb', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Filing_archived.ipynb'), ('C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Solved_Filing.ipynb', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Solved_Filing_archived.ipynb'), ('C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Custom CNN\\test_folder\\new.txt', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Custom CNN\\test_folder\\new_archived.txt')]


In [47]:
q11 = []
for ext in media_exts:
    q11 += glob.glob(os.path.join(path, "**", ext), recursive=True)
conn = sqlite3.connect("media_index.db")
c = conn.cursor()
c.execute("CREATE TABLE IF NOT EXISTS media (path TEXT)")
c.executemany("INSERT INTO media (path) VALUES (?)", [(f,) for f in q11])
conn.commit()
conn.close()
print("Q11: Media files indexed in SQLite DB")

Q11: Media files indexed in SQLite DB


In [49]:
q12_all = glob.glob(os.path.join(path, "**", "*.*"), recursive=True)
name_map = defaultdict(list)
for f in q12_all:
    name = os.path.basename(f)
    name_map[name].append(f)
q12 = {k: v for k, v in name_map.items() if len(v) > 1}
print("Q12:", q12)

Q12: {}


In [51]:
q13 = []
for f in q12_all:
    try:
        with open(f, "rb") as file:
            content = file.read()
        rel_path = os.path.relpath(f, path)
        md5_hash = hashlib.md5(content).hexdigest()
        q13.append((rel_path, md5_hash))
    except:
        pass
print("Q13:", q13)


Q13: [('Filing_archived.ipynb', '1ca3d253067a812a3e37f10ac708038b'), ('Solved_Filing_archived.ipynb', 'c7004f74de7cacf5d6b2f83723979487'), ('Custom CNN\\test_folder\\new_archived.txt', 'd41d8cd98f00b204e9800998ecf8427e')]


In [53]:
q14 = glob.glob(os.path.join(path, "week_*.json"))
print("Q14:", q14)

ignore_list = []
if os.path.exists(os.path.join(path, ".ignore")):
    with open(os.path.join(path, ".ignore")) as f:
        ignore_list = [line.strip() for line in f if line.strip()]

Q14: []


In [55]:
q15 = []
for root, dirs, files in os.walk(path):
    if any(ignored in root for ignored in ignore_list):
        continue
    for file in files:
        q15.append(os.path.join(root, file))
print("Q15:", q15)

Q15: ['C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Filing_archived.ipynb', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Solved_Filing_archived.ipynb', 'C:\\ML1 BootCamp\\Muhammad Humza\\Day 4 Task\\Custom CNN\\test_folder\\new_archived.txt']



# File Handling – Text & Binary Files

**Q:1 Count the number of lines in a file without loading it entirely.**

**Q:2 Replace a specific word in a file and save it to a new file.**

**Q:3 Append data to an existing file with a timestamp.**

**Q:4 Read and print the first 10 lines of a file.**

**Q:5 Write a list of dictionaries as CSV manually (without csv module).**

**Q:6 Copy a binary file in chunks (e.g., image or PDF).**

**Q:7 Write a function to compare two files and print the differing lines.**

**Q:8 Safely read a file that may not exist using try-except.**

**Q:9 Read a file using a specific encoding (e.g., UTF-16).**

**Q:10 Detect and skip empty lines when reading a file.**

**Q:11 Implement a log rotation mechanism: create log.txt, log_1.txt, etc. when size exceeds 1MB.**

**Q:12 Build a file-based key-value store using JSON per line.**

**Q:13 Implement version control: on every write, back up the previous version with a timestamp.**

**Q:14 Create a reader that detects encoding using chardet or fallback encoding.**

**Q:15 Convert a large log file into separate files per date based on timestamps in each line.**



# JSON Handling – json module

**Q:1 Load JSON from a file and print a nested field (e.g., data["user"]["name"]).**

**Q:2 Write a Python dict to a file with pretty formatting.**

**Q:3 Merge multiple JSON objects into a single file.**

**Q:4 Convert a JSON array into CSV format.**

**Q:5 Update a nested key inside a loaded JSON.**

**Q:6 Create a function to pretty-print JSON from string input.**

**Q:7 Safely load malformed JSON with exception handling.**

**Q:8 Remove a key from each item in a JSON list and re-save.**

**Q:9 Convert an object with datetime to a JSON string using a custom encoder.**

**Q:10 Search for all values associated with a key in nested JSON.**

**Q:11 Write a function to flatten deeply nested JSON into a flat dictionary.**

**Q:12 Build a recursive JSON validator for required schema keys.**

**Q:13 Convert a nested JSON into a pandas DataFrame with normalized columns.**

**Q:14 Create a diff tool that compares two JSON files and shows key-level changes.**

**Q:15 Handle and fix trailing commas in malformed JSON before parsing.**



    
# Regular Expressions – re module

**Q:1 Extract email addresses from a string using re.findall().**

**Q:2 Validate a US phone number using regex.**

**Q:3 Extract hashtags from a tweet-like string.**

**Q:4 Replace all numbers with # in a paragraph.**

**Q:5 Match filenames with extension .pdf, .docx, or .xlsx.**

**Q:6 Split a paragraph into sentences using regex.**

**Q:7 Match a date in the format DD-MM-YYYY or YYYY/MM/DD.**

**Q:8 Extract quoted strings from text (e.g., "like this").**

**Q:9 Clean a text by removing special characters except alphanumerics and spaces.**

**Q:10 Capture repeated words like the the, is is in a sentence.**

**Q:11 Write a regex that extracts values from key-value pairs (key: value) even if keys contain spaces.**

**Q:12 Extract nested parentheses using recursive regex (advanced feature).**

**Q:13 Create a regex to detect and fix malformed URLs in a text block.**

**Q:14 Build a pattern to extract address-like strings (e.g., 123 Main St, City, ZIP).**

**Q:15 Tokenize a log line into timestamp, level, and message using regex groups.**

In [128]:
import re
text = "Contact us at john.doe@example.com and jane_doe123@test.co.uk"
emails = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
print(emails)


['john.doe@example.com', 'jane_doe123@test.co.uk']


In [130]:
text = "Call me at 123-456-7890 or (123) 456-7890"
phones = re.findall(r'\(?\d{3}\)?[-\s]?\d{3}-\d{4}', text)
print(phones)

['123-456-7890', '(123) 456-7890']


In [132]:
text = "Loving the #weather and #sunshine at the #beach"
hashtags = re.findall(r'#\w+', text)
print(hashtags)


['#weather', '#sunshine', '#beach']


In [134]:
text = "My number is 12345 and zip is 90210"
replaced = re.sub(r'\d', '#', text)
print(replaced)


My number is ##### and zip is #####


In [136]:
text = "Files: report.pdf, data.xlsx, notes.docx, image.png"
files = re.findall(r'\b\w+\.(pdf|docx|xlsx)\b', text)
print(files)


['pdf', 'xlsx', 'docx']


In [138]:
text = "Hello there! How are you? I'm fine. Thanks for asking."
sentences = re.split(r'(?<=[.!?])\s+', text)
print(sentences)


['Hello there!', 'How are you?', "I'm fine.", 'Thanks for asking.']


In [140]:
text = "Dates: 25-12-2024 and 2024/12/25 are important."
dates = re.findall(r'\b\d{2}-\d{2}-\d{4}\b|\b\d{4}/\d{2}/\d{2}\b', text)
print(dates)


['25-12-2024', '2024/12/25']


In [142]:
text = 'She said "hello" and then "goodbye".'
quotes = re.findall(r'"(.*?)"', text)
print(quotes)


['hello', 'goodbye']


In [144]:
text = "Hello! How's it going? $100 #amazing @Python"
cleaned = re.sub(r'[^a-zA-Z0-9 ]+', '', text)
print(cleaned)


Hello Hows it going 100 amazing Python


In [146]:
text = "This is is a test to see the the repeated words."
repeats = re.findall(r'\b(\w+)\s+\1\b', text, re.IGNORECASE)
print(repeats)


['is', 'the']


In [148]:
text = "Name: John Doe, Age: 30, Address: 123 Main St"
values = re.findall(r'\b[\w\s]+:\s*([^,]+)', text)
print(values)


['John Doe', '30', '123 Main St']


In [152]:
text = "Here is (a sample (nested (parentheses)))"
nested = re.findall(r'\([^()]*\)', text)
print(nested)


['(parentheses)']


In [154]:
text = "Visit www.example.com or http:/example.org"
fixed = re.sub(r'\b(www\.[^\s]+)', r'http://\1', text)
fixed = re.sub(r'\bhttp:/([^\s])', r'http://\1', fixed)
print(fixed)


Visit http:///www.example.com or http://example.org


In [156]:
text = "Send it to 123 Main St, Springfield, 12345 now."
addresses = re.findall(r'\d+\s+[A-Za-z ]+,\s*\w+,\s*\d{5}', text)
print(addresses)


['123 Main St, Springfield, 12345']


In [158]:
text = "2025-06-20 14:22:10 [INFO] Application started"
match = re.match(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)', text)
if match:
    print(match.groups())


('2025-06-20 14:22:10', 'INFO', 'Application started')
