# Reading a text file

In [44]:
fn = '/home/dataset/simple_text_file.txt'
f = open(fn)

In [45]:
type(f)

_io.TextIOWrapper

In [46]:
for line in f:
    print(line)
f.close()

A quick brown fox jumps over the lazy dog



In [7]:
for line in f:
    print(line)

ValueError: I/O operation on closed file.

In [48]:
with open(fn) as f:
    for line in f:
        print(line)

A quick brown fox jumps over the lazy dog



In [54]:
fn = '/home/dataset/another_simple_text_file.txt'
my_whole_file = []
with open(fn) as f:
    for line in f:
        my_whole_file.append(line.strip())

In [20]:
my_whole_file

['Lorem ipsum dolor sit amet,',
 'consectetur adipisci elit,',
 'sed do eiusmod tempor incidunt ut labore et dolore magna aliqua.',
 'Ut enim ad minim veniam,',
 'quis nostrum exercitationem ullamco laboriosam,',
 'nisi ut aliquid ex ea commodi consequatur.',
 'Duis aute irure reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.',
 'Excepteur sint obcaecat cupiditat non proident,',
 'sunt in culpa qui officia deserunt mollit anim id est laborum.']

In [49]:
print('Lorem ipsum dolor sit amet ')

Lorem ipsum dolor sit amet 


In [50]:
print('Lorem ipsum dolor sit amet')

Lorem ipsum dolor sit amet


In [52]:
' Lorem ipsum dolor sit amet '.strip() == 'Lorem ipsum dolor sit amet'

True

# Write a file

In [61]:
my_new_file_name = 'test.txt'
with open(my_new_file_name, 'w') as f:
    f.write('hello world!')

# Files and directories

In [67]:
import os

In [68]:
path = '/home/dataset'

os.listdir(path)

['simple_text_file.txt',
 'beatles',
 'complete_recipes_dataframe.tsv',
 'another_simple_text_file.txt',
 'Meta']

In [75]:
for x in os.listdir(path):
    complete_path = os.path.join(path, x)
    if os.path.isfile(complete_path):
        print(complete_path, 'is a file')
    else:
        print(complete_path, 'is a directory')

/home/dataset/simple_text_file.txt is a file
/home/dataset/beatles is a directory
/home/dataset/complete_recipes_dataframe.tsv is a file
/home/dataset/another_simple_text_file.txt is a file
/home/dataset/Meta is a directory


In [78]:
path = '/home/marco/main_folder'

for dirpath, dirnames, filenames in os.walk(path):
    print('DIR', dirpath)
    print('SUBDIRS', dirnames)
    print('FILES', filenames)
    print('****')

DIR /home/marco/main_folder
SUBDIRS ['sub_folder_2', 'sub_folder_1', '.ipynb_checkpoints']
FILES []
****
DIR /home/marco/main_folder/sub_folder_2
SUBDIRS ['.ipynb_checkpoints']
FILES ['my_second_file.txt', 'my_first_file.txt']
****
DIR /home/marco/main_folder/sub_folder_2/.ipynb_checkpoints
SUBDIRS []
FILES []
****
DIR /home/marco/main_folder/sub_folder_1
SUBDIRS []
FILES []
****
DIR /home/marco/main_folder/.ipynb_checkpoints
SUBDIRS []
FILES []
****


In [84]:
import glob

path = '/home/marco/main_folder/sub_folder_2/*.json'
glob.glob(path)

['/home/marco/main_folder/sub_folder_2/my_third_file.json']

In [87]:
import requests

In [88]:
r = requests.get('http://api.plos.org/search?q=title:DNA')

In [91]:
import json

In [92]:
json_text = r.text

In [93]:
j_obj = json.loads(json_text)

In [96]:
type(json_text)

str

In [95]:
type(j_obj)

dict

In [106]:
j_obj['response'].keys()

dict_keys(['numFound', 'start', 'maxScore', 'docs'])

# Functions (as a way to extend the language)

In [9]:
def my_fun(a):
    return a**2

In [14]:
def i_dont_return_anything(a):
    a.append('x')

In [13]:
b = my_fun(4)
print(b)
print('done')

16
done


In [21]:
my_list = [1, 2, 3]
i_dont_return_anything(my_list)

In [22]:
my_list

[1, 2, 3, 'x']

In [23]:
a

NameError: name 'a' is not defined

In [24]:
def my_second_fun(a):
    a += 'world'

In [25]:
a = 'hello'
print(a)
my_second_fun(a)
print(a)

hello
hello


In [27]:
return_of_print = print('hello')

hello


In [28]:
type(return_of_print)

NoneType

# Lambda functions

In [32]:
def my_fun(a):
    return a**2

In [30]:
f = lambda x : x**2

In [31]:
f(4)

16

In [33]:
type(my_fun)

function

In [34]:
type(f)

function

In [36]:
[i**2 for i in range(10)]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [41]:
list(map(lambda x : x**2, range(10)))

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [44]:
import random

a = [(random.randint(0,10), random.randint(0,10)) for i in range(10)]
print(a)

[(6, 7), (9, 10), (1, 9), (2, 2), (10, 10), (5, 5), (6, 4), (9, 2), (10, 6), (3, 6)]


In [48]:
sorted(a, key=lambda x : x[1], reverse=True)

[(9, 10),
 (10, 10),
 (1, 9),
 (6, 7),
 (10, 6),
 (3, 6),
 (5, 5),
 (6, 4),
 (2, 2),
 (9, 2)]

# Function with varying arguments

In [57]:
def my_sum(a, b=0):
    return a + b

In [55]:
my_sum(5, 4)

9

In [56]:
my_sum(5)

5

In [62]:
def my_sum(*args):
    print(type(args))
    #s = 0
    #for x in args:
    #    s += x
    #return s

In [63]:
my_sum(1)

<class 'tuple'>


In [61]:
my_sum(1, 2, 4, 6, 7)

20

In [64]:
print('hello')

hello


In [65]:
print(1, True, 'hello')

1 True hello


### A small detour: * the unpack operator

In [66]:
l = list(range(10))

In [67]:
l

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [68]:
type(l)

list

In [69]:
print(l)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [70]:
print(*l)

0 1 2 3 4 5 6 7 8 9


In [71]:
print(0,1,2,3,4,5,6,7,8,9)

0 1 2 3 4 5 6 7 8 9


In [72]:
a = [1,2,3,4,5]
b = [6,7,8,9,0]

print(a + b)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 0]


In [73]:
list(zip(a, b))

[(1, 6), (2, 7), (3, 8), (4, 9), (5, 0)]

In [74]:
[x+y for x,y in zip(a,b)]

[7, 9, 11, 13, 5]

In [75]:
matrix = [[2, 0, 1, 3], [3, 2, 1, 0], [8, 6, 1, 4], [1, 0, 6, 3]]

In [76]:
matrix

[[2, 0, 1, 3], [3, 2, 1, 0], [8, 6, 1, 4], [1, 0, 6, 3]]

In [77]:
list(zip(*matrix))

[(2, 3, 8, 1), (0, 2, 6, 0), (1, 1, 1, 6), (3, 0, 4, 3)]

In [78]:
list(zip([2, 0, 1, 3], [3, 2, 1, 0], [8, 6, 1, 4], [1, 0, 6, 3]))

[(2, 3, 8, 1), (0, 2, 6, 0), (1, 1, 1, 6), (3, 0, 4, 3)]

In [79]:
def my_array_sum(**kwargs):
    print(kwargs)

In [80]:
my_array_sum(my_first_argument=1, lalala=3, this_is_another_one=True)

{'my_first_argument': 1, 'lalala': 3, 'this_is_another_one': True}


In [None]:
def my_fun(mandatory_arg, default_arg=None, *args, **kwargs):
    print(mandatory_arg)
    print(default_arg)
    print(args)
    print(kwargs)

# Generators

In [82]:
for x in range(10):
    print(x)

0
1
2
3
4
5
6
7
8
9


In [83]:
tuple(range(10))

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)

In [84]:
def my_generator():
    n = 1
    print('This is printed first')
    # Generator function contains yield statements
    yield n

    n += 1
    print('This is printed second')
    yield n

    n += 1
    print('This is printed at last')
    yield n
    

In [86]:
for i in my_generator():
    print(i)

This is printed first
1
This is printed second
2
This is printed at last
3


In [88]:
def fibonacci(num):
    a, b = 0, 1
    for x in range(num):
        a, b = b, b + a
        yield b

In [90]:
for i in fibonacci(10):
    print(i)

1
2
3
5
8
13
21
34
55
89


In [91]:
fib_g = fibonacci(10)

In [92]:
def fibonacci(num):
    a, b = 0, 1
    for x in range(num):
        a, b = b, b + a
        yield bfib_g

<generator object fibonacci at 0x7f39ee0b7b50>

In [93]:
for i in fib_g:
    print(i)

1
2
3
5
8
13
21
34
55
89


In [94]:
for i in fib_g:
    print(i)

### Write a function and call it read_file that takes a filename 
### as argument and return a dictionary. Each line is an item, value and key are tab separated.

In [100]:
filename = '/home/dataset/data_records.txt'

In [101]:
!head /home/dataset/data_records.txt

Mushroom-risotto-352492	00001
Filipino-bbq-pork-skewers-352163	00002
Mushroom-and-Roasted-Garlic-Risotto-525026	00003
Gratin-Dauphinois-_scalloped-Potatoes-With-Cheese_-My-Recipes	00004
Delicious-Grilled-Hamburgers-Allrecipes	00005
Hickory_smoked-Bourbon-Turkey-My-Recipes	00006
Marinated-Beef-Tenderloin-529856	00007
Cherry-Tortoni-MyRecipes-211679	00008
Crock-Pot-Italian-Turkey-Meatballs-and-Homemade-Sauce-900153	00009
Chicken-salad-with-wild-rice-364571	00010


In [128]:
def read_file(filename):
    d = {}
    with open(filename) as f:
        for line in f:
            string_list = line.strip().split('\t')
            if not string_list[1] in d:
                d[string_list[1]] = []
            d[string_list[1]].append(string_list[0])
    return d

In [129]:
recipes_dict = read_file(filename)

# Exercises

This exercise is about writing a complete program that will perform several operations. We will try to use all the things we learned so far to write and structure it. The first priority is to have something that works as expected. Then we will focus on how to refactor the code, that is how to improve the readability and reusability of the code by incapsulating code within functions, giving meaninful names, use a consistent nomeclature and be pythonic.

In [131]:
# read a file that contains a list of file names to read and create an index for file contents
# use the index for fast search of words within files
# use the index to draw some statistics about files
# using the frequency of the words create a new text

# first file is a config file with fixed structure
# ID = 1
# SONG = 'Song name'
# FILENAME = 'file.txt'
# =====
# the other files are in the same directory

In [136]:
import os

song_dir = '/home/dataset/beatles/'
song_list = os.path.join(song_dir, 'song_list.txt')

In [154]:
def list_of_songs(song_list_file):
    d = {}
    with open(song_list_file) as f:
        for l in f:
            if l[0] == "=":
                continue
            else:
                line = l
                if line[0] == 'I':
                    a = line.replace('ID=', '').strip()
                    d[a] = []
                elif line[0] == 'S':
                    b = line.replace('SONG=', '').strip()
                    d[a].append(b)
                elif line[0] == 'F':
                    c = line.replace('FILE=', '').strip()
                    d[a].append(c)
    return d

In [205]:
def read_songlist(file):
    all_songs = {}
    d = {}
    with open(file) as f:
        for l in f:
            if l.strip()=="===":
                #print("***")
                #print(d)
                all_songs[d["ID"]] = [d["SONG"],d["FILE"]]
                d = {}
                continue
            else:
                s=l.strip().split('=')
                #print(s)
                d[s[0]] = s[1]
    return all_songs

#songlist = read_songlist(file)
#print(songlist)

In [200]:
def song_organizer(path):
    ids = []
    songs = []
    file = []
    with open(path) as f:
        for x in f:
            if "ID" in x:
                ids.append(x.replace("ID=", '').replace("\n", ''))
            elif "SONG" in x:
                songs.append(x.replace("SONG=", '').replace("\n", ''))
            elif "FILE" in x:
                file.append(x.replace("FILE=", '').replace("\n", ''))

    return dict((i[0], list(i[1:])) for i in zip(ids, songs, file))

#song_organizer(path)

In [191]:
def get_word_count(songs, song_base_directory):
    words = {}
    for k, v in songs.items():
        song_filename = os.path.join(song_base_directory, v[1])
        with open(song_filename) as f:
            for i in range(9):
                next(f)
            for l in f:
                if l.strip() == 'Correct lyrics':
                    break
                if l.strip() == '':
                    continue
                sanitized_line = l.strip().replace('?', '').replace(',', '')
                for word in sanitized_line.split(' '):
                    lowercase_word = word.lower()
                    if not lowercase_word in words:
                        words[lowercase_word] = 0
                    words[lowercase_word] += 1
    return words

In [206]:
#dict_of_songs = list_of_songs(song_list)
#dict_of_songs = song_organizer(song_list)
dict_of_songs = read_songlist(song_list)
word_count = get_word_count(dict_of_songs, song_dir)    

In [207]:
ten_most_frequent_words = sorted(word_count.items(), key=lambda x : x[1], reverse=True)[:10]

In [208]:
ten_most_frequent_words

[('you', 2434),
 ('i', 1763),
 ('the', 1563),
 ('to', 1249),
 ('and', 1114),
 ('me', 1076),
 ('a', 1073),
 ('love', 599),
 ('in', 596),
 ('my', 593)]

In [203]:
ten_most_frequent_words

[('you', 2434),
 ('i', 1763),
 ('the', 1563),
 ('to', 1249),
 ('and', 1114),
 ('me', 1076),
 ('a', 1073),
 ('love', 599),
 ('in', 596),
 ('my', 593)]

In [199]:
ten_most_frequent_words

[('you', 2434),
 ('i', 1763),
 ('the', 1563),
 ('to', 1249),
 ('and', 1114),
 ('me', 1076),
 ('a', 1073),
 ('love', 599),
 ('in', 596),
 ('my', 593)]