### 10. 行数のカウント

python

In [1]:
with open('data/popular-names.txt') as f:
    print(len(list(f)))

2780


In [2]:
with open('data/popular-names.txt') as f:
    for i, _ in enumerate(f, start=1):
        pass
i

2780

shell

In [3]:
! wc -l < data/popular-names.txt

2780


### 11. タブをスペースに置換

python

In [4]:
with open('data/popular-names.txt') as f:
    for line in list(f)[:10]: # 本当はこんなことしないでfのままでいいんだけど，github上で閲覧するときにスクロールが面倒なので妥協
        line = line.strip()
        line = line.replace('\t', ' ')
        print(line)

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


shell

In [5]:
! awk '{gsub("\t", " ", $0); print $0}' data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


In [6]:
! perl -pe 's/\t/ /g' data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
-p destination: Broken pipe


In [7]:
! sed 's/\t/ /g'  data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
sed: couldn't write 18 items to stdout: Broken pipe


In [8]:
! sed -e 's/[[:space:]]\+/ /g' data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
sed: couldn't write 19 items to stdout: Broken pipe


In [9]:
! expand -t 1 data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


In [10]:
! tr '\t' ' ' < data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


### 12. 1列目をcol1.txtに，2列目をcol2.txtに保存

python

In [11]:
with open('data/popular-names.txt') as f, \
        open('result/col1.txt', 'w') as g, \
        open('result/col2.txt', 'w') as h:
    for line in f:
        line = line.strip()
        pref, city, _, _  = line.split('\t')
        print(pref, file=g)
        print(city, file=h)

In [12]:
! head result/col1.txt

Mary
Anna
Emma
Elizabeth
Minnie
Margaret
Ida
Alice
Bertha
Sarah


In [13]:
! head result/col2.txt

F
F
F
F
F
F
F
F
F
F


shell

In [14]:
! cut -f 1 data/popular-names.txt | head

Mary
Anna
Emma
Elizabeth
Minnie
Margaret
Ida
Alice
Bertha
Sarah


In [15]:
! cut -f 2 data/popular-names.txt | head

F
F
F
F
F
F
F
F
F
F


### 13. col1.txtとcol2.txtをマージ

python

In [16]:
with open('result/col1.txt') as f, open('result/col2.txt') as g:
    for lines in list(zip(f, g))[:10]: # zip(f, g)でいい
        x = [line.strip() for line in lines]
        x = '\t'.join(x)
        print(x)

Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F
Margaret	F
Ida	F
Alice	F
Bertha	F
Sarah	F


In [17]:
from contextlib import ExitStack

In [18]:
files = ['result/col1.txt', 'result/col2.txt']
with ExitStack() as stack:
    files = [
        stack.enter_context(open(filename))
        for filename in files]
    for lines in list(zip(*files))[:10]: # zip(*files)でいい
        x = [line.strip() for line in lines]
        x = '\t'.join(x)
        print(x)

Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F
Margaret	F
Ida	F
Alice	F
Bertha	F
Sarah	F


shell

In [19]:
! paste result/col1.txt result/col2.txt | head

Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F
Margaret	F
Ida	F
Alice	F
Bertha	F
Sarah	F


### 14. 先頭からN行を出力

python

In [20]:
N = 5
with open('data/popular-names.txt') as f:
    lst = range(N)
    for _, line in zip(lst, f):
        print(line, end='')

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880
Elizabeth	F	1939	1880
Minnie	F	1746	1880


shell

In [21]:
! head -n 5 data/popular-names.txt

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880
Elizabeth	F	1939	1880
Minnie	F	1746	1880


### 15. 末尾のN行を出力

python

In [22]:
from collections import deque

In [23]:
N = 5
with open('data/popular-names.txt') as f:
    queue = deque(f, N)
for line in queue:
    line = line.strip()
    print(line)

Benjamin	M	13381	2018
Elijah	M	12886	2018
Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018


shell

In [24]:
! tail -n 5 data/popular-names.txt

Benjamin	M	13381	2018
Elijah	M	12886	2018
Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018


### 16. ファイルをN分割する

shell

In [25]:
! split -d -nl/5 data/popular-names.txt result/shell5.

In [26]:
! wc result/shell*

  2780  11120  55026 result/shell.18
   136    136    943 result/shell.19
   587   2348  11007 result/shell5.00
   554   2216  11010 result/shell5.01
   556   2224  11006 result/shell5.02
   540   2160  11007 result/shell5.03
   543   2172  10996 result/shell5.04
  5696  22376 110995 total


python

https://github.com/coreutils/coreutils/blob/master/src/split.c を再現しています

In [27]:
def get_chunk_ends(N, f):
    chunk_size = sum([len(x) for x in f]) // N
    chunk_ends = [chunk_size * (n + 1) - 1 for n in range(N)]
    return chunk_ends

def get_suffix_length(N):
    # https://github.com/coreutils/coreutils/blob/bb21daa125aeb4e32546309d370918ca47e612db/src/split.c#L165
    return max(2, len(str(N - 1)))

def split_file(N, filepath, outprefix):
    with open(filepath) as f:
        chunk_ends = get_chunk_ends(N, f)

    suffix_length = get_suffix_length(N)
    with open(filepath) as f:
        acc = 0
        for index, chunk_end in enumerate(chunk_ends):
            suffix = str(index).zfill(suffix_length)
            filename = outprefix + suffix
            with open(outprefix + suffix, 'w') as g:
                while acc < chunk_end:
                    line = f.readline()
                    acc += len(line)
                    g.write(line)
                    
split_file(5, 'data/popular-names.txt', 'result/python5.')

In [28]:
! diff result/python5.00 result/shell5.00
! diff result/python5.01 result/shell5.01
! diff result/python5.02 result/shell5.02
! diff result/python5.03 result/shell5.03
! diff result/python5.04 result/shell5.04

### 17. １列目の文字列の異なり

python

In [29]:
with open('data/popular-names.txt') as f:
    names = {line.split('\t')[0] for line in f}
names = list(names)
names.sort()

for name in names[:10]:
    print(name)

Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amelia
Amy
Andrew
Angela


shell

In [30]:
! cut -f 1 data/popular-names.txt | sort -s | uniq | head

Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amelia
Amy
Andrew
Angela


### 18. 各行を3コラム目の数値の降順にソート

python

In [31]:
with open('data/popular-names.txt') as f:
    lst = [line.strip() for line in f]
lst.sort(key = lambda x : -int(x.split('\t')[2]))

with open('result/python.18', 'w') as f:
    for line in lst:
        print(line, file=f)
    
for line in lst[:10]:
    print(line)

Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947
Linda	F	91016	1949
Michael	M	90656	1956
Michael	M	90517	1958
James	M	88584	1948
Michael	M	88528	1954


shell

In [32]:
! sort -nrsk 3 data/popular-names.txt > result/shell.18
! head result/shell.18

Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947
Linda	F	91016	1949
Michael	M	90656	1956
Michael	M	90517	1958
James	M	88584	1948
Michael	M	88528	1954


In [33]:
! diff result/python.18 result/shell.18

### 19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる

python

In [34]:
from collections import Counter

In [35]:
counter = Counter()
with open('data/popular-names.txt') as f:
    for line in f:
        name = line.split('\t')[0]
        counter[name] += 1
        
lst = counter.most_common()
lst.sort(key=lambda x:(-x[1], x[0]))
        
with open('result/python.19', 'w') as f:
    for name, _ in lst:
        print(name, file=f)

In [36]:
! head result/python.19

James
William
John
Robert
Mary
Charles
Michael
Elizabeth
Joseph
Margaret


In [37]:
! tail result/python.19

Julie
Kelly
Laura
Lori
Lucas
Pamela
Rachel
Scott
Tracy
Walter


shell

In [38]:
! cut -f 1 data/popular-names.txt | sort | uniq -c | sort -nrsk1 | awk '{print $2}' > result/shell.19

In [39]:
! head result/shell.19

James
William
John
Robert
Mary
Charles
Michael
Elizabeth
Joseph
Margaret


In [40]:
! tail result/shell.19

Julie
Kelly
Laura
Lori
Lucas
Pamela
Rachel
Scott
Tracy
Walter


In [41]:
! diff result/python.19 result/shell.19