# Files Utilities

### Read  Last N lines of a file

When we iterate over a file object, we get each of the file's lines as a string, one at a time.

In [10]:
def get_final_line(filename, N) -> None:
    """Print the last ``N`` lines of the text file ``filename``.

    The file is streamed through a bounded ``deque``, so at most ``N`` lines
    are held in memory at any time instead of the whole file.

    Args:
        filename: Path of the file to read. A falsy value (``None`` or ``''``)
            makes the call a no-op.
        N: Number of trailing lines to print. Zero or a negative value prints
            nothing.

    Returns:
        None. The selected lines are written to standard output.

    Raises:
        OSError: If the file cannot be opened.
    """
    from collections import deque

    if not filename:
        return
    with open(filename, 'r') as f:
        # Clamp N at zero: a slice such as lines[-0:] would select every line,
        # and deque rejects a negative maxlen.
        tail = deque(f, maxlen=max(N, 0))
    for line in tail:
        # Lines read from the file keep their newline; print() adds its own.
        print(line.rstrip('\n'))
            
        
        
    

In [11]:
# Demo: print the last two lines of the system passwd file.
get_final_line('/etc/passwd',2)

_reportmemoryexception:*:269:269:ReportMemoryException:/var/db/reportmemoryexception:/usr/bin/false
_driverkit:*:270:270:DriverKit:/var/empty:/usr/bin/false


### Read  first and last lines of a file

In [12]:
# Read the first line, then iterate over the rest of the file keeping only the
# most recently seen line, so the file is never loaded into memory all at once.
with open("sample.txt", "r") as file:
    first_line = file.readline()
    # Seed last_line: for a file with zero or one line the loop body never
    # runs, which would otherwise leave last_line undefined (NameError below).
    last_line = first_line
    for last_line in file:
        pass
# Both values keep their trailing newline, hence the blank line after each print.
print(f'first line -> {first_line}')
print(f'last line -> {last_line}')

first line -> This is the first line.

last line -> This is the last line.


### Read binary file 

With a binary file we do not read line by line. Instead, we use the `read` method to retrieve a fixed number of bytes at a time. When `read` returns 0 bytes, we know that we have reached the end of the file.

In [13]:
def readbinary_file(filename, CHUNKSIZE):
    """Pass every byte of a binary file to ``process_byte``, reading in chunks.

    Args:
        filename: Path of the file to open in binary mode.
        CHUNKSIZE: Maximum number of bytes requested from each ``read`` call.

    Note:
        ``process_byte`` is not defined in the visible cells of this notebook;
        it must exist in the global namespace before this function is called.
        Each value handed to it is an ``int`` in ``range(256)``, because
        iterating over a ``bytes`` object yields integers.
    """
    # Open the path held in the `filename` parameter; the quoted literal
    # "filename" would look for a file that is literally named "filename".
    with open(filename, "rb") as f:
        # read() returns b'' at end of file, which is the sentinel that stops
        # the iteration.
        for bytes_read in iter(lambda: f.read(CHUNKSIZE), b''):
            for b in bytes_read:
                process_byte(b)

### /etc/passwd to dict

In [14]:
def passwd_to_dict(filename):
    """Map each user name in a passwd-style file to its numeric user ID.

    Every record is expected to be colon-separated, with the user name in the
    first field and the user ID in the third. Blank lines, whitespace-only
    lines and comment lines (``#``, optionally indented) are ignored.

    Args:
        filename: Path of the passwd-style file to parse.

    Returns:
        dict: ``{user_name: uid}`` with ``uid`` converted to ``int``. If a
        name occurs more than once, the last occurrence wins.

    Raises:
        IndexError: If a record has fewer than three fields.
        ValueError: If the third field is not an integer.
    """
    users = {}
    with open(filename) as passwd:
        for line in passwd:
            entry = line.strip()
            # Stripping first means whitespace-only lines and indented
            # comments are skipped instead of failing on user_info[2].
            if not entry or entry.startswith('#'):
                continue
            user_info = entry.split(':')
            users[user_info[0]] = int(user_info[2])
    return users

In [15]:
# Demo: build the user-name -> UID mapping from the system passwd file and print it.
print(passwd_to_dict('/etc/passwd'))

{'nobody': -2, 'root': 0, 'daemon': 1, '_uucp': 4, '_taskgated': 13, '_networkd': 24, '_installassistant': 25, '_lp': 26, '_postfix': 27, '_scsd': 31, '_ces': 32, '_appstore': 33, '_mcxalr': 54, '_appleevents': 55, '_geod': 56, '_devdocs': 59, '_sandbox': 60, '_mdnsresponder': 65, '_ard': 67, '_www': 70, '_eppc': 71, '_cvs': 72, '_svn': 73, '_mysql': 74, '_sshd': 75, '_qtss': 76, '_cyrus': 77, '_mailman': 78, '_appserver': 79, '_clamav': 82, '_amavisd': 83, '_jabber': 84, '_appowner': 87, '_windowserver': 88, '_spotlight': 89, '_tokend': 91, '_securityagent': 92, '_calendar': 93, '_teamsserver': 94, '_update_sharing': 95, '_installer': 96, '_atsserver': 97, '_ftp': 98, '_unknown': 99, '_softwareupdate': 200, '_coreaudiod': 202, '_screensaver': 203, '_locationd': 205, '_trustevaluationagent': 208, '_timezone': 210, '_lda': 211, '_cvmsroot': 212, '_usbmuxd': 213, '_dovecot': 214, '_dpaudio': 215, '_postgres': 216, '_krbtgt': 217, '_kadmin_admin': 218, '_kadmin_changepw': 219, '_devicemgr

### Word Count Function

The function takes a filename as input and prints four lines of output:

 - Number of characters (including whitespace)
 - Number of words (separated by whitespace)
 - Number of lines
 - Number of unique words (case sensitive, so “NO” is different from “no”)

In [17]:
def word_count(file):
    """Print character, word, line and unique-word counts for a text file.

    Four lines are written to standard output, in this order: number of
    characters (whitespace and newlines included), number of
    whitespace-separated words, number of lines, and number of unique words
    (case sensitive, so "NO" and "no" are counted separately).

    Args:
        file: Path of the text file to analyse. A falsy value (``None`` or
            ``''``) prints nothing and yields all-zero counts.

    Returns:
        dict: The counts under the keys ``'char_count'``, ``'no_words'``,
        ``'no_lines'`` and ``'unique_words'``.
    """
    # The keys must be string literals; bare names here raise NameError.
    wordcount = {'char_count': 0, 'no_words': 0, 'no_lines': 0, 'unique_words': 0}
    if not file:
        return wordcount

    seen_words = set()
    with open(file, 'r') as f:
        # Stream line by line so large files are never fully held in memory.
        for line in f:
            words = line.split()
            wordcount['char_count'] += len(line)
            wordcount['no_words'] += len(words)
            wordcount['no_lines'] += 1
            seen_words.update(words)
    wordcount['unique_words'] = len(seen_words)

    print(f"Number of characters: {wordcount['char_count']}")
    print(f"Number of words: {wordcount['no_words']}")
    print(f"Number of lines: {wordcount['no_lines']}")
    print(f"Number of unique words: {wordcount['unique_words']}")
    return wordcount
                
                

### Directory listings

In [11]:
import glob

# Non-recursive match: entries directly under /etc whose names end in ".conf".
filenames = glob.glob('/etc/*.conf')
filenames

['/etc/syslog.conf',
 '/etc/kern_loader.conf',
 '/etc/rtadvd.conf',
 '/etc/pf.conf',
 '/etc/autofs.conf',
 '/etc/ntp_opendirectory.conf',
 '/etc/resolv.conf',
 '/etc/nfs.conf',
 '/etc/asl.conf',
 '/etc/ntp.conf',
 '/etc/man.conf',
 '/etc/newsyslog.conf',
 '/etc/notify.conf']

List only the directories directly under the `misc` directory:

In [12]:
# The trailing slash restricts the matches to directories directly under 'misc'.
# `recursive=True` was dropped: it only changes how a '**' pattern component is
# matched, and this pattern contains none, so the flag had no effect.
filenames = glob.glob('/Users/manishgarg/misc/*/')
filenames



['/Users/manishgarg/misc/dir2/',
 '/Users/manishgarg/misc/scores/',
 '/Users/manishgarg/misc/dir1/']

List both the directories and the files directly under the `misc` directory:

In [14]:
# Every file and directory directly under 'misc'. By default '*' does not match
# names that start with a dot, so hidden entries are left out.
# `recursive=True` was dropped: it only changes how a '**' pattern component is
# matched, and this pattern contains none, so the flag had no effect.
filenames = glob.glob('/Users/manishgarg/misc/*')
filenames

['/Users/manishgarg/misc/dir2',
 '/Users/manishgarg/misc/reverse_passwd.txt',
 '/Users/manishgarg/misc/scores',
 '/Users/manishgarg/misc/dir1']

#### List of files in directory and sub directories

In [28]:
import os
def getListOfFiles(dirName):
    '''
    For the given path, get the list of all files in the directory tree.

    The directory is listed with os.listdir() and every sub-directory is
    descended into recursively. Progress is printed along the way: a banner
    and the raw listing for each directory, then the full path of each entry.

    Args:
        dirName: Path of the directory to start from.

    Returns:
        list: Full paths of every non-directory entry found under dirName.
        Directories themselves are not included.

    Note:
        os.path.isdir() follows symbolic links, so a link that points back to
        one of its own ancestors makes the recursion unbounded.
    '''
    # Names (not paths) of every file and sub-directory directly in dirName.
    listOfFile = os.listdir(dirName)
    print('----------------------------------------')
    print(f'List of all files & dir under {dirName}')
    print('----------------------------------------')
    print(listOfFile)
    allFiles = []
    for entry in listOfFile:
        # os.listdir() returns bare names; rebuild the full path.
        fullPath = os.path.join(dirName, entry)
        print(f'fullPath is {fullPath}')
        if os.path.isdir(fullPath):
            # extend() grows the list in place; `allFiles = allFiles + ...`
            # copied the whole accumulated list for every sub-directory.
            allFiles.extend(getListOfFiles(fullPath))
        else:
            allFiles.append(fullPath)

    return allFiles

In [29]:
# Demo: collect every file under 'misc'; the returned list is the cell's result.
getListOfFiles('/Users/manishgarg/misc')

----------------------------------------
List of all files & dir under /Users/manishgarg/misc
----------------------------------------
['.DS_Store', 'dir2', 'reverse_passwd.txt', 'scores', 'dir1']
fullPath is /Users/manishgarg/misc/.DS_Store
fullPath is /Users/manishgarg/misc/dir2
----------------------------------------
List of all files & dir under /Users/manishgarg/misc/dir2
----------------------------------------
['nums.txt', 'output.txt']
fullPath is /Users/manishgarg/misc/dir2/nums.txt
fullPath is /Users/manishgarg/misc/dir2/output.txt
fullPath is /Users/manishgarg/misc/reverse_passwd.txt
fullPath is /Users/manishgarg/misc/scores
----------------------------------------
List of all files & dir under /Users/manishgarg/misc/scores
----------------------------------------
['9b.json', '9a.json', 'json-files.zip']
fullPath is /Users/manishgarg/misc/scores/9b.json
fullPath is /Users/manishgarg/misc/scores/9a.json
fullPath is /Users/manishgarg/misc/scores/json-files.zip
fullPath is /Us

['/Users/manishgarg/misc/.DS_Store',
 '/Users/manishgarg/misc/dir2/nums.txt',
 '/Users/manishgarg/misc/dir2/output.txt',
 '/Users/manishgarg/misc/reverse_passwd.txt',
 '/Users/manishgarg/misc/scores/9b.json',
 '/Users/manishgarg/misc/scores/9a.json',
 '/Users/manishgarg/misc/scores/json-files.zip',
 '/Users/manishgarg/misc/dir1/passwd.csv',
 '/Users/manishgarg/misc/dir1/output.csv']

#### list of files in directory and sub directories using os.walk()

Python's `os` module provides the function `os.walk()` to iterate over a directory tree.
It walks the tree rooted at the given path and, for each directory or sub-directory, yields a tuple containing `(dirpath, dirnames, filenames)`: the directory's path, the list of its sub-directories and the list of its files.
    
    
Iterate over the directory tree and build a list of all the files under the given path:

In [33]:
import os
def getListOfFilesUsingWalk(dirName):
    '''
    For the given path, get the list of all files in the directory tree.

    The tree is traversed with os.walk(). Each file path found is printed on
    its own line and the complete list is also returned, in line with
    getListOfFiles().

    Args:
        dirName: Path of the directory to start from.

    Returns:
        list: Full paths of every file found under dirName, in the order
        os.walk() produced them.
    '''
    listOfFiles = []
    for dirpath, _dirnames, filenames in os.walk(dirName):
        # os.walk() yields bare file names; prefix them with their directory.
        listOfFiles.extend(os.path.join(dirpath, name) for name in filenames)

    # Print the files
    for elem in listOfFiles:
        print(elem)

    # Previously nothing was returned, despite the function's name and docstring.
    return listOfFiles

In [37]:
# Demo: print every file found under the 'misc' directory tree.
getListOfFilesUsingWalk('/Users/manishgarg/misc')

/Users/manishgarg/misc/.DS_Store
/Users/manishgarg/misc/reverse_passwd.txt
/Users/manishgarg/misc/dir2/nums.txt
/Users/manishgarg/misc/dir2/output.txt
/Users/manishgarg/misc/scores/9b.json
/Users/manishgarg/misc/scores/9a.json
/Users/manishgarg/misc/scores/json-files.zip
/Users/manishgarg/misc/dir1/passwd.csv
/Users/manishgarg/misc/dir1/output.csv


#### list of all files with a specific extension 

We take the path of a directory and list all the files with a specific extension (`.txt` in the example below) in that directory and, recursively, in its sub-directories.

In [41]:
import os

def getListOfFilesWithExtn(path, extension):
    """Print the full path of every file under ``path`` ending in ``extension``.

    The directory tree is traversed with ``os.walk``; each matching file is
    printed on its own line. Nothing is returned.

    Args:
        path: Directory at which the traversal starts.
        extension: Suffix that a file name must end with, e.g. ``'.txt'``.
    """
    for folder, _subdirs, names in os.walk(path):
        # Keep only the names in this folder that carry the wanted suffix.
        matching = (name for name in names if name.endswith(extension))
        for name in matching:
            print(os.path.join(folder, name))

In [42]:
# Demo: print the paths of all '.txt' files under the 'misc' directory tree.
getListOfFilesWithExtn('/Users/manishgarg/misc', '.txt')

/Users/manishgarg/misc/reverse_passwd.txt
/Users/manishgarg/misc/dir2/nums.txt
/Users/manishgarg/misc/dir2/output.txt


#### Reading from CSV

In [49]:
import pandas as pd

In [50]:
# Load 'output.csv' as tab-separated data. header=None treats the first row as
# data, so pandas labels the columns 0, 1, ...
df_csv  = pd.read_csv('output.csv',delimiter='\t',header=None)

In [51]:
df_csv

Unnamed: 0,0,1
0,root,0
1,daemon,1
2,bin,2
3,sys,3
4,sync,4
5,games,5
6,man,6
7,lp,7
8,mail,8
9,atara,1004


### Reading from Json 

In [53]:
# Parse the JSON file into a DataFrame; the path is relative to the current
# working directory.
df_json = pd.read_json('./scores/9a.json')

In [54]:
df_json

Unnamed: 0,math,literature,science
0,90,98,97
1,65,79,85
2,78,83,75
3,92,78,85
4,100,80,90
