# 03_02: Loading Text Files

In [40]:
import math
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

%matplotlib inline

In [41]:
# iterating over an open file yields its lines, one by one;
# the with-block closes the file as soon as the loop is done

words = []
with open('words.txt', 'r') as word_file:
    for line in word_file:
        # each line is kept verbatim, trailing newline included
        words.append(line)

In [42]:
# number of entries currently held in words
len(words)

235886

In [43]:
# peek at the first ten entries — each one still ends with its newline character
words[:10]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n']

In [44]:
# str.strip() returns a copy with leading and trailing whitespace (here the newline) removed
'Aaron\n'.strip()

'Aaron'

In [45]:
# words is untouched: the strip() call above worked on a string literal, not on the list
words[:10]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n']

In [46]:
# chain strip() and lower() to drop the newline and normalize the case in one expression
'Aaron\n'.strip().lower()

'aaron'

In [47]:
# rebuild the list, stripping surrounding whitespace and lowercasing every line;
# the with-block guarantees the file is closed once reading is finished
words = []
with open('words.txt', 'r') as word_file:
    for line in word_file:
        words.append(line.strip().lower())

In [48]:
# after lowercasing, 'A' and 'a' become the same string, so the list now contains duplicates
words[:10]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron']

In [49]:
# collect the normalized words in a set so that duplicates are stored only once;
# the with-block closes the file when the loop ends
words = set()
with open('words.txt', 'r') as word_file:
    for line in word_file:
        words.add(line.strip().lower())

In [50]:
# a set comprehension that collects stripped and lowercased lines...
# (run inside a with-block so the file handle is closed afterwards)
with open('words.txt', 'r') as word_file:
    words = {line.strip().lower() for line in word_file}

In [52]:
# display the whole set — sets are unordered, so the words come out in arbitrary order
words

{'vogul',
 'giddyhead',
 'clodhead',
 'impropriation',
 'graphy',
 'reapparition',
 'jo',
 'quinnipiac',
 'icebound',
 'privateersman',
 'mellay',
 'freelovism',
 'microcentrum',
 'philodemic',
 'akrabattine',
 'plenipotentiarily',
 'negligee',
 'germanic',
 'hyponoetic',
 'queasiness',
 'troytown',
 'equinely',
 'clubfellow',
 'maritality',
 'paragraphically',
 'nonbuying',
 'musrol',
 'unisomorphic',
 'tautozonality',
 'centrist',
 'diphysite',
 'disarticulation',
 'treen',
 'benedight',
 'absconce',
 'abobra',
 'knighthead',
 'rehybridize',
 'maskflower',
 'supperless',
 'primulaveroside',
 'misotyranny',
 'flectionless',
 'scintillometer',
 'clitellum',
 'joachim',
 'enophthalmus',
 'underdrawn',
 'gibing',
 'coadjacent',
 'narrater',
 'quinitol',
 'smirch',
 'runnet',
 'discocephalous',
 'proofer',
 'tearably',
 'palmful',
 'smoos',
 'arthrotyphoid',
 'extraserous',
 'extramorainic',
 'bindle',
 'lifelike',
 'interrupted',
 'bipersonal',
 'anaclasis',
 'homogeny',
 'insweeping',
 

In [64]:
# ...turned into a sorted list
# (the with-block closes the file once the set has been built)
with open('words.txt', 'r') as word_file:
    words = sorted({line.strip().lower() for line in word_file})

In [65]:
# display the full sorted list — each word now appears exactly once, in sorted order
words

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru',
 'ab',
 'aba',
 'ababdeh',
 'ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abassin',
 'abastardize',
 'abatable',
 'abate',
 'a

In [69]:
# return a list of all lines from an open file
# (francais.txt is decoded as Latin-1; the with-block closes the file after reading)
with open('francais.txt', 'r', encoding='latin1') as french_file:
    french_lines = french_file.readlines()

# an expression inside a with-block is not echoed, so show the result from here
french_lines

['\n',
 'a\n',
 'ab\n',
 'abaissa\n',
 'abaissai\n',
 'abaissaient\n',
 'abaissais\n',
 'abaissait\n',
 'abaissant\n',
 'abaissas\n',
 'abaissasse\n',
 'abaissassent\n',
 'abaissasses\n',
 'abaissassiez\n',
 'abaissassions\n',
 'abaissâmes\n',
 'abaissât\n',
 'abaissâtes\n',
 'abaisse\n',
 'abaissement\n',
 'abaissements\n',
 'abaissent\n',
 'abaisser\n',
 'abaissera\n',
 'abaisserai\n',
 'abaisseraient\n',
 'abaisserais\n',
 'abaisserait\n',
 'abaisseras\n',
 'abaisserez\n',
 'abaisseriez\n',
 'abaisserions\n',
 'abaisserons\n',
 'abaisseront\n',
 'abaisses\n',
 'abaisseur\n',
 'abaisseurs\n',
 'abaissez\n',
 'abaissé\n',
 'abaissée\n',
 'abaissées\n',
 'abaissés\n',
 'abaissèrent\n',
 'abaissiez\n',
 'abaissions\n',
 'abaissons\n',
 'abandon\n',
 'abandonna\n',
 'abandonnai\n',
 'abandonnaient\n',
 'abandonnais\n',
 'abandonnait\n',
 'abandonnant\n',
 'abandonnas\n',
 'abandonnasse\n',
 'abandonnassent\n',
 'abandonnasses\n',
 'abandonnassiez\n',
 'abandonnassions\n',
 'abandonnâmes\

In [83]:
# unique, whitespace-stripped lines of the French word list;
# the blank first line of the file ends up in the set as the empty string ''
with open('francais.txt', 'r', encoding='latin1') as french_file:
    formatted_words = {line.strip() for line in french_file}

In [84]:
# sorted() compares strings by code point, so accented forms (e.g. 'abaissâmes')
# are listed after every unaccented word sharing the same prefix
sorted(formatted_words)

['',
 'a',
 'ab',
 'abaissa',
 'abaissai',
 'abaissaient',
 'abaissais',
 'abaissait',
 'abaissant',
 'abaissas',
 'abaissasse',
 'abaissassent',
 'abaissasses',
 'abaissassiez',
 'abaissassions',
 'abaisse',
 'abaissement',
 'abaissements',
 'abaissent',
 'abaisser',
 'abaissera',
 'abaisserai',
 'abaisseraient',
 'abaisserais',
 'abaisserait',
 'abaisseras',
 'abaisserez',
 'abaisseriez',
 'abaisserions',
 'abaisserons',
 'abaisseront',
 'abaisses',
 'abaisseur',
 'abaisseurs',
 'abaissez',
 'abaissiez',
 'abaissions',
 'abaissons',
 'abaissâmes',
 'abaissât',
 'abaissâtes',
 'abaissèrent',
 'abaissé',
 'abaissée',
 'abaissées',
 'abaissés',
 'abandon',
 'abandonna',
 'abandonnai',
 'abandonnaient',
 'abandonnais',
 'abandonnait',
 'abandonnant',
 'abandonnas',
 'abandonnasse',
 'abandonnassent',
 'abandonnasses',
 'abandonnassiez',
 'abandonnassions',
 'abandonne',
 'abandonnent',
 'abandonner',
 'abandonnera',
 'abandonnerai',
 'abandonneraient',
 'abandonnerais',
 'abandonnerait',

In [85]:
# number of distinct entries — this count includes the empty string from the blank line
len(formatted_words)

208914