### 10. 行数のカウント

python

In [1]:
# Count the lines by streaming the file; nothing is kept in memory.
with open('data/popular-names.txt') as f:
    print(sum(1 for _ in f))

2780


shell

In [2]:
# Feed the file on stdin so that wc prints only the line count, without a file name.
! wc -l < data/popular-names.txt

2780


### 11. タブをスペースに置換

python

In [3]:
from itertools import islice

with open('data/popular-names.txt') as f:
    # Only the first 10 lines are shown to keep the rendered notebook short
    # (iterate over `f` itself to convert the whole file). islice stops
    # reading after 10 lines instead of loading the entire file into a list.
    for line in islice(f, 10):
        # Remove only the line terminator so that leading or trailing tabs
        # are converted rather than silently stripped.
        line = line.rstrip('\n')
        print(line.replace('\t', ' '))

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


shell

awkかperlを使っておけば大体いいと思っています

In [4]:
# gsub replaces every tab in the record ($0) with a space; head limits the display to 10 lines.
! awk '{gsub("\t", " ", $0); print $0}' data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


In [5]:
# -p runs the substitution on every input line and prints the result.
# head exits after 10 lines, which is why a "Broken pipe" message can follow the output.
! perl -pe 's/\t/ /g' data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
-p destination: Broken pipe


In [6]:
# NOTE(review): relies on sed interpreting \t as a tab (GNU sed does) - confirm on other sed implementations.
# head exits after 10 lines, which is why a "Broken pipe" message can follow the output.
! sed 's/\t/ /g'  data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
sed: couldn't write 17 items to stdout: Broken pipe


In [7]:
# -t 1 places tab stops one column apart, so each tab expands to a single space.
! expand -t 1 data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880
expand: write error: Broken pipe
expand: write error


In [8]:
# tr works on stdin, hence the input redirection; it maps each tab to a space.
! tr '\t' ' ' < data/popular-names.txt | head

Mary F 7065 1880
Anna F 2604 1880
Emma F 2003 1880
Elizabeth F 1939 1880
Minnie F 1746 1880
Margaret F 1578 1880
Ida F 1472 1880
Alice F 1414 1880
Bertha F 1320 1880
Sarah F 1288 1880


### 12. 1列目をcol1.txtに，2列目をcol2.txtに保存

python

In [9]:
import os

# Make sure the output directory exists so the cell also runs on a fresh checkout.
os.makedirs('result', exist_ok=True)

with open('data/popular-names.txt') as f, \
        open('result/col1.txt', 'w') as g, \
        open('result/col2.txt', 'w') as h:
    for line in f:
        # Remove only the newline: a bare strip() would also swallow an empty
        # first or last field. Every record must have exactly four
        # tab-separated fields, otherwise the unpacking raises ValueError.
        col1, col2, _, _ = line.rstrip('\n').split('\t')
        print(col1, file=g)
        print(col2, file=h)

In [10]:
# Check the first 10 lines of the extracted first column.
! head result/col1.txt

Mary
Anna
Emma
Elizabeth
Minnie
Margaret
Ida
Alice
Bertha
Sarah


In [11]:
# Check the first 10 lines of the extracted second column.
! head result/col2.txt

F
F
F
F
F
F
F
F
F
F


shell

In [12]:
# cut splits on tabs by default, so -f 1 selects the first column.
# head exits after 10 lines, which is why a "Broken pipe" message can follow the output.
! cut -f 1 data/popular-names.txt | head

Mary
Anna
Emma
Elizabeth
Minnie
Margaret
Ida
Alice
Bertha
Sarah
cut: write error: Broken pipe


In [13]:
# cut splits on tabs by default, so -f 2 selects the second column.
# head exits after 10 lines, which is why a "Broken pipe" message can follow the output.
! cut -f 2 data/popular-names.txt | head

F
F
F
F
F
F
F
F
F
F
cut: write error: Broken pipe


### 13. col1.txtとcol2.txtをマージ

python

In [14]:
from contextlib import ExitStack

In [15]:
from itertools import islice

files = ['result/col1.txt', 'result/col2.txt']
with ExitStack() as stack:
    # ExitStack closes every opened file on exit, however many files are merged.
    handles = [stack.enter_context(open(filename)) for filename in files]
    # Preview only the first 10 merged rows without reading the rest of the
    # files; iterate over zip(*handles) directly to merge everything.
    for lines in islice(zip(*handles), 10):
        # Drop only the newline so that the field contents are joined untouched.
        print('\t'.join(line.rstrip('\n') for line in lines))

Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F
Margaret	F
Ida	F
Alice	F
Bertha	F
Sarah	F


shell

In [16]:
# paste joins the corresponding lines of the two files with a tab.
# head exits after 10 lines, which is why a "Broken pipe" message can follow the output.
! paste result/col1.txt result/col2.txt | head

Mary	F
Anna	F
Emma	F
Elizabeth	F
Minnie	F
Margaret	F
Ida	F
Alice	F
Bertha	F
Sarah	F
paste: write error: Broken pipe
paste: write error


### 14. 先頭からN行を出力

python

In [17]:
N = 5
# Echo at most N lines from the top of the file, unchanged.
with open('data/popular-names.txt') as f:
    for _ in range(N):
        line = f.readline()
        if not line:  # the file has fewer than N lines
            break
        print(line, end='')

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880
Elizabeth	F	1939	1880
Minnie	F	1746	1880


shell

In [18]:
# Print the first 5 lines of the file.
! head -n 5 data/popular-names.txt

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880
Elizabeth	F	1939	1880
Minnie	F	1746	1880


### 15. 末尾のN行を出力

python

In [19]:
from collections import deque

In [20]:
N = 5
with open('data/popular-names.txt') as f:
    # A deque bounded to N items discards the oldest line whenever a new one
    # arrives, so only the last N lines remain once the file is exhausted.
    # The bound uses N (it was hard-coded to 5 before, so changing N had no effect).
    queue = deque(f, maxlen=N)
for line in queue:
    print(line, end='')

Benjamin	M	13381	2018
Elijah	M	12886	2018
Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018


shell

In [21]:
# Print the last 5 lines of the file.
! tail -n 5 data/popular-names.txt

Benjamin	M	13381	2018
Elijah	M	12886	2018
Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018


### 16. ファイルをN分割する

shell

In [22]:
# -n l/5 splits into 5 chunks without breaking lines; -d uses numeric suffixes (00, 01, ...).
! split -d -nl/5 data/popular-names.txt result/shell5.

In [23]:
# Show the line, word and byte counts of each chunk produced by split.
! wc result/shell*

  587  2348 11007 result/shell5.00
  554  2216 11010 result/shell5.01
  556  2224 11006 result/shell5.02
  540  2160 11007 result/shell5.03
  543  2172 10996 result/shell5.04
 2780 11120 55026 total


python

In [24]:
def split_string_list(N, lst):
    """Partition ``lst`` into N consecutive chunks of roughly equal total length.

    Strings are never split. Chunk ``n`` (0-based, except the last) keeps
    taking strings while the cumulative length consumed so far is below
    ``chunk_size * (n + 1) - 1``, where ``chunk_size`` is the total length
    divided by N (integer division). The last chunk receives every remaining
    string, so no input is lost when the total length is not a multiple of N.

    Args:
        N: Number of chunks to produce.
        lst: Sequence of strings (e.g. the lines of a file).

    Returns:
        A list of N lists of strings whose concatenation, in order, equals
        ``lst``. Chunks may be empty. A negative N yields an empty list.

    Raises:
        ZeroDivisionError: If N is 0.
    """
    total = sum(len(x) for x in lst)
    chunk_size = total // N

    out = []
    start = 0  # index of the first string of the chunk being built
    acc = 0    # combined length of all strings consumed so far
    for n in range(N):
        if n == N - 1:
            # Final chunk: take whatever is left instead of stopping at a
            # computed boundary, which could drop trailing strings.
            out.append(list(lst[start:]))
        else:
            chunk_end = chunk_size * (n + 1) - 1
            stop = start
            # acc < chunk_end < total guarantees that stop stays in range.
            while acc < chunk_end:
                acc += len(lst[stop])
                stop += 1
            out.append(list(lst[start:stop]))
            start = stop
    return out

def split_file(N, filepath, outprefix):
    """Split the text file at ``filepath`` into N line-aligned pieces.

    The lines are distributed by ``split_string_list`` and piece ``i`` is
    written to ``outprefix`` followed by ``i`` zero-padded to two digits.
    """
    with open(filepath) as src:
        all_lines = src.readlines()
    for index, chunk in enumerate(split_string_list(N, all_lines)):
        # Shortcut: the suffix width is fixed at two digits.
        suffix = '{:02d}'.format(index)
        with open(outprefix + suffix, 'w') as dst:
            dst.writelines(chunk)

# Split the names file into 5 pieces: result/python5.00 ... result/python5.04.
split_file(5, 'data/popular-names.txt', 'result/python5.')

In [25]:
# Show the line, word and byte counts of each chunk written by split_file.
! wc result/python*

  587  2348 11007 result/python5.00
  554  2216 11010 result/python5.01
  556  2224 11006 result/python5.02
  540  2160 11007 result/python5.03
  543  2172 10996 result/python5.04
 2780 11120 55026 total


### 17. １列目の文字列の異なり

python

In [26]:
# Collect the distinct values of the first tab-separated column, in sorted order.
with open('data/popular-names.txt') as f:
    names = sorted({row.split('\t', 1)[0] for row in f})

# Display only the first 10 of them.
for name in names[:10]:
    print(name)

Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amelia
Amy
Andrew
Angela


shell

In [27]:
# uniq only collapses adjacent duplicates, so the names are sorted first.
! cut -f 1 data/popular-names.txt | sort | uniq | head

Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amelia
Amy
Andrew
Angela


### 18. 各行を3コラム目の数値の降順にソート

python

In [28]:
def _third_column(row):
    """Return the third tab-separated field of ``row`` as an integer."""
    return int(row.split('\t')[2])

# Sort the stripped lines by the third column, largest first. The sort is
# stable, so rows with equal values keep their order from the file.
with open('data/popular-names.txt') as f:
    lst = sorted((row.strip() for row in f), key=_third_column, reverse=True)
for row in lst[:10]:
    print(row)

Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947
Linda	F	91016	1949
Michael	M	90656	1956
Michael	M	90517	1958
James	M	88584	1948
Michael	M	88528	1954


shell

In [29]:
# -n numeric comparison, -r descending, -k 3 sort key starting at the third field.
# head exits after 10 lines, which is why a "Broken pipe" message can follow the output.
! sort -nrk 3 data/popular-names.txt | head

Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947
Linda	F	91016	1949
Michael	M	90656	1956
Michael	M	90517	1958
James	M	88584	1948
Michael	M	88528	1954
sort: write failed: 'standard output': Broken pipe
sort: write error


### 19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる

python

In [30]:
from collections import Counter

In [31]:
# Tally the first tab-separated column of every line in a single pass.
with open('data/popular-names.txt') as f:
    cnt = Counter(row.split('\t')[0] for row in f)

# Report the ten most frequent names as "<count> <name>".
for name, num in cnt.most_common(10):
    print(num, name)

118 James
111 William
108 John
108 Robert
92 Mary
75 Charles
74 Michael
73 Elizabeth
70 Joseph
60 Margaret


shell

In [32]:
# uniq -c prefixes each distinct name with its count; the final sort orders by that count, descending.
# Names with equal counts may appear in a different order than in the Python output.
! cut -f 1 data/popular-names.txt | sort | uniq -c | sort -nrk1 | head

    118 James
    111 William
    108 Robert
    108 John
     92 Mary
     75 Charles
     74 Michael
     73 Elizabeth
     70 Joseph
     60 Margaret
