# 第 2 章: UNIX コマンド


In [6]:
import functools as ft
import subprocess

def run(*cmds) -> str:
    """Execute a command without a shell and return what it wrote to stdout.

    Args:
        *cmds: The program name followed by its arguments. They are handed to
            ``subprocess.run`` as an argument vector, so shell syntax such as
            redirections or pipes is not interpreted.

    Returns:
        The captured standard output, decoded with the default codec of
        ``bytes.decode()``. Standard error is captured but discarded, and the
        exit status is not checked.
    """
    completed = subprocess.run(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    raw_stdout = completed.stdout
    return raw_stdout.decode()

### 10. 行数のカウント


In [7]:
ret1 = 0
with open("./data/popular-names.txt", "r") as f:
    ret1 = len(f.readlines())
print("python: \t", ret1)

ret1_cm = run("wc", "-l", "<", "data/popular-names.txt")
ret1_cm = ret1_cm.strip().split()[0]
print("UNIX commnad: \t", ret1_cm)
!wc -l data/popular-names.txt

python: 	 2780
UNIX commnad: 	 2780
    2780 data/popular-names.txt


### 11. タブをスペースに置換


In [13]:
# Replace every tab with a single space. str.replace over the whole text does
# this in one pass; accumulating the result with repeated string concatenation
# would re-copy the growing string for every line.
with open("./data/popular-names.txt", "r") as f:
    ret2 = f.read().replace("\t", " ")

# Give sed a literal tab character (Python expands "\t" in this string).
# The "\t" escape inside a sed regular expression is an extension that not
# every sed implementation understands; a literal tab works everywhere.
ret2_cm = run("sed", "s/\t/ /g", "./data/popular-names.txt")

# True when the Python result and the sed result are byte-for-byte equal.
print(ret2 == ret2_cm)

True


### 12. 1 列目を col1.txt に，2 列目を col2.txt に保存


In [71]:
with open("./data/popular-names.txt", "r") as inputfile, \
     open("./data/col1.txt", "w") as col1file, \
     open("./data/col2.txt", "w") as col2file:
     for i, line in enumerate(inputfile.readlines()):
        col1, col2, *_ = line.split("\t")
        col1file.write(col1 + "\n")
        col2file.write(col2 + "\n")

# _ = run("cut", "-f1", "./data/popular-names.txt" , ">", "./data/col1_cm.txt")
!cut -f1 ./data/popular-names.txt > ./data/col1_cm.txt
!cut -f2 ./data/popular-names.txt > ./data/col2_cm.txt

!diff data/col1.txt data/col1_cm.txt
!diff data/col2.txt data/col2_cm.txt

### 13. col1.txt と col2.txt をマージ


In [77]:
with open("./data/merged.txt", "w") as mergedfile, \
     open("./data/col1.txt", "r") as col1file, \
     open("./data/col2.txt", "r") as col2file:
        col1lines = col1file.readlines()
        col2lines = col2file.readlines()
        for col1, col2 in zip(col1lines, col2lines):
            mergedfile.write(f"{col1[:-1]}\t{col2[:-1]}\n")

!paste data/col1_cm.txt data/col2_cm.txt > data/merged_cm.txt
!diff data/merged.txt data/merged_cm.txt

### 14. 先頭から N 行を出力


In [81]:
N = 3
with open("./data/popular-names.txt", "r") as f:
    print("".join(f.readlines()[:N]))
!head -n3 data/popular-names.txt

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880

Mary	F	7065	1880
Anna	F	2604	1880
Emma	F	2003	1880


### 15. 末尾の N 行を出力


In [82]:
N = 3
with open("./data/popular-names.txt", "r") as f:
    print("".join(f.readlines()[-N:]))
!tail -n3 data/popular-names.txt

Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018

Lucas	M	12585	2018
Mason	M	12435	2018
Logan	M	12352	2018


### 16. ファイルを N 分割する


In [20]:
import math
N = 3
chunks = []

with open("./data/popular-names.txt", "r") as f:
    lines = f.readlines()
    nlines = len(lines)
    chunksize = math.ceil(nlines / N)
    chunks = [lines[i:i + chunksize] for i in range(0, nlines, chunksize)]

for i, chunk in enumerate(chunks):
    outputfile = open(f"data/chunk{i+1}.txt", "w")
    try:
        outputfile.writelines(chunk)
    finally:
        outputfile.close()

!split -n 3 data/popular-names.txt "data/chunk"

In [22]:
# Compare the first chunk written by Python with the first file produced by
# split; diff prints nothing when the two files are identical.
!diff data/chunk1.txt data/chunkaa

927a928,957
> Virginia	F	16162	1926
> Mildred	F	13551	1926
> Frances	F	13355	1926
> Robert	M	61130	1926
> John	M	56110	1926
> James	M	53209	1926
> William	M	51920	1926
> Charles	M	29521	1926
> George	M	25904	1926
> Richard	M	25149	1926
> Joseph	M	23759	1926
> Donald	M	20912	1926
> Edward	M	19378	1926
> Mary	F	70639	1927
> Dorothy	F	35987	1927
> Betty	F	35422	1927
> Helen	F	25320	1927
> Margaret	F	21964	1927
> Ruth	F	19414	1927
> Doris	F	16510	1927
> Virginia	F	15725	1927
> Shirley	F	13315	1927
> Barbara	F	13161	1927
> Robert	M	61669	1927
> John	M	55954	1927
> James	M	53674	1927
> William	M	51482	1927
> Charles	M	31936	1927
> Richard	M	26771	1927
> George	M	25949	1927
\ No newline at end of file


### 17. 1 列目の文字列の異なり


In [29]:
with open("./data/popular-names.txt", "r") as f:
    print(
        "\n".join(
            sorted(list(set(line.split("\t")[0] for line in f.readlines())))[:10]
        )
    )

!cut -f1 data/popular-names.txt | sort | uniq | head -n10

Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amelia
Amy
Andrew
Angela
Abigail
Aiden
Alexander
Alexis
Alice
Amanda
Amelia
Amy
Andrew
Angela


### 18. 各行を 3 コラム目の数値の降順にソート


In [40]:
with open("./data/popular-names.txt", "r") as f:
    ret18 = sorted(f.readlines(), key=lambda line: -int(line.split("\t")[2]))
    print("".join(ret18[:5]))

!sort -rk3 data/popular-names.txt | head -n5

Linda	F	99689	1947
Linda	F	96211	1948
James	M	94757	1947
Michael	M	92704	1957
Robert	M	91640	1947

Linda	F	99689	1947
James	M	9951	1911
Mildred	F	9921	1913
Mary	F	9889	1886
Mary	F	9888	1887
sort: Broken pipe


In [30]:
from collections import defaultdict

freqs = defaultdict(lambda: 0)
with open("./data/popular-names.txt", "r") as f:
    names = [line.split("\t")[0] for line in f.readlines()]
    for name in names:
        freqs[name] += 1
    ret19 = [name for name, _ in sorted(freqs.items(), key=lambda item: -item[1])]
    print("\n".join(ret19[:5]))

!cut -f1 data/popular-names.txt | sort | uniq -c | sort -r | head -n5
# !echo $0

James
William
John
Robert
Mary
 118 James
 111 William
 108 Robert
 108 John
  92 Mary
