## Loading Text Files

In [1]:
import math
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

%matplotlib inline

In [2]:
# Read the text file line by line, opened in read ("r") mode.
# The `with` block closes the file handle as soon as the loop finishes,
# instead of leaving it open until the garbage collector gets to it.
words = []
with open('words.txt', 'r') as words_file:
    for line in words_file:
        words.append(line)  # each line still carries its trailing "\n"

In [3]:
# Number of entries collected, one per line read from the file
len(words)

235886

In [5]:
# Peek at the first 15 entries rather than displaying the whole list
words[:15]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n',
 'Aaronic\n',
 'Aaronical\n',
 'Aaronite\n',
 'Aaronitic\n',
 'Aaru\n']

#### Notice that each line ends with a newline character, shown as `\n`. To remove it, we use `strip()`

In [6]:
# strip() with no arguments removes leading and trailing whitespace, including "\n"
'Aaron\n'.strip()

'Aaron'

#### Upper and Lower Case

In [7]:
# Chain lower() after strip() to normalize the letter case as well
'Aaron\n'.strip().lower()

'aaron'

Now apply the same trick to the words text file: strip and lowercase every line.

In [10]:
# Rebuild the list, this time stripping the trailing newline and
# lowercasing every entry as it is read.
words = []

# The `with` block guarantees the file is closed once the loop is done.
with open('words.txt', 'r') as words_file:
    for line in words_file:
        words.append(line.strip().lower())

In [11]:
# Check the first 15 cleaned entries; 'a' now appears twice because
# both 'A' and 'a' lowercase to the same string
words[:15]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru']

#### Wrapping the Text Information in a Set Using `add`

In [14]:
# Collect the cleaned words in a set so duplicates (e.g. 'A' and 'a',
# which both become 'a') are stored only once.
words = set()
# The `with` block closes the file handle when the loop finishes.
with open('words.txt', 'r') as words_file:
    for line in words_file:
        words.add(line.strip().lower())

In [13]:
# Display the set; note that a set is unordered, so the entries are
# not shown in the order they appear in the file
words

{'alodian',
 'poppycock',
 'heracleonite',
 'pancreas',
 'unpecuniarily',
 'acquaintancy',
 'episepalous',
 'pontooner',
 'overinclinable',
 'oxalylurea',
 'dartman',
 'epikouros',
 'salvably',
 'downwith',
 'gunnel',
 'odobenidae',
 'morassweed',
 'seafardinger',
 'quizzically',
 'vote',
 'emerald',
 'comatose',
 'freemasonic',
 'stagger',
 'mouthroot',
 'chitinized',
 'pinacolate',
 'dispermous',
 'noilage',
 'orant',
 'culprit',
 'photoaquatint',
 'typholysin',
 'enteromere',
 'decalcomania',
 'spiriform',
 'trigraph',
 'hysterorrhexis',
 'prehensory',
 'echometer',
 'unchosen',
 'ultraeducationist',
 'executable',
 'gephyrocercal',
 'ternal',
 'tripletail',
 'pompilid',
 'chrysocolla',
 'struggling',
 'staphylinidae',
 'naphthalate',
 'browache',
 'underworking',
 'commonplaceness',
 'cicadellidae',
 'makluk',
 'oleosity',
 'archaeopteryx',
 'commotion',
 'cobitis',
 'cymbiform',
 'figurately',
 'unarched',
 'katipuneros',
 'daze',
 'pseudosocialistic',
 'albinism',
 'psorospermial

In [15]:
# Alternatively, build the same set with a set comprehension.
# The `with` block closes the file once the comprehension has consumed it.
with open('words.txt', 'r') as words_file:
    words_comprehsion = {line.strip().lower() for line in words_file}

In [16]:
# Display the set produced by the comprehension
words_comprehsion

{'alodian',
 'poppycock',
 'heracleonite',
 'pancreas',
 'unpecuniarily',
 'acquaintancy',
 'episepalous',
 'pontooner',
 'overinclinable',
 'oxalylurea',
 'dartman',
 'epikouros',
 'salvably',
 'downwith',
 'gunnel',
 'odobenidae',
 'morassweed',
 'seafardinger',
 'quizzically',
 'vote',
 'emerald',
 'comatose',
 'freemasonic',
 'stagger',
 'mouthroot',
 'chitinized',
 'pinacolate',
 'dispermous',
 'noilage',
 'orant',
 'culprit',
 'photoaquatint',
 'typholysin',
 'enteromere',
 'decalcomania',
 'spiriform',
 'trigraph',
 'hysterorrhexis',
 'prehensory',
 'echometer',
 'unchosen',
 'ultraeducationist',
 'executable',
 'gephyrocercal',
 'ternal',
 'tripletail',
 'pompilid',
 'chrysocolla',
 'struggling',
 'staphylinidae',
 'naphthalate',
 'browache',
 'underworking',
 'commonplaceness',
 'cicadellidae',
 'makluk',
 'oleosity',
 'archaeopteryx',
 'commotion',
 'cobitis',
 'cymbiform',
 'figurately',
 'unarched',
 'katipuneros',
 'daze',
 'pseudosocialistic',
 'albinism',
 'psorospermial

In [17]:
# Alphabetical order: sorted() consumes the set and returns a sorted list.
# The `with` block closes the file once the comprehension has consumed it.
with open('words.txt', 'r') as words_file:
    words_comprehsion = sorted({line.strip().lower() for line in words_file})

In [18]:
# Display the result; it is now a list, in alphabetical order
words_comprehsion

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru',
 'ab',
 'aba',
 'ababdeh',
 'ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abassin',
 'abastardize',
 'abatable',
 'abate',
 'a

#### To Read Other Languages

In [21]:
# Say French: pass the encoding explicitly ('latin' is an alias of the
# Latin-1 codec) so accented characters such as 'â' and 'é' are decoded.
# The `with` block closes the file handle after the lines are read.
with open('francais.txt', 'r', encoding='latin') as french_file:
    french_lines = french_file.readlines()

# Last expression of the cell, so the list of lines is displayed as before
french_lines

['\n',
 'a\n',
 'ab\n',
 'abaissa\n',
 'abaissai\n',
 'abaissaient\n',
 'abaissais\n',
 'abaissait\n',
 'abaissant\n',
 'abaissas\n',
 'abaissasse\n',
 'abaissassent\n',
 'abaissasses\n',
 'abaissassiez\n',
 'abaissassions\n',
 'abaissâmes\n',
 'abaissât\n',
 'abaissâtes\n',
 'abaisse\n',
 'abaissement\n',
 'abaissements\n',
 'abaissent\n',
 'abaisser\n',
 'abaissera\n',
 'abaisserai\n',
 'abaisseraient\n',
 'abaisserais\n',
 'abaisserait\n',
 'abaisseras\n',
 'abaisserez\n',
 'abaisseriez\n',
 'abaisserions\n',
 'abaisserons\n',
 'abaisseront\n',
 'abaisses\n',
 'abaisseur\n',
 'abaisseurs\n',
 'abaissez\n',
 'abaissé\n',
 'abaissée\n',
 'abaissées\n',
 'abaissés\n',
 'abaissèrent\n',
 'abaissiez\n',
 'abaissions\n',
 'abaissons\n',
 'abandon\n',
 'abandonna\n',
 'abandonnai\n',
 'abandonnaient\n',
 'abandonnais\n',
 'abandonnait\n',
 'abandonnant\n',
 'abandonnas\n',
 'abandonnasse\n',
 'abandonnassent\n',
 'abandonnasses\n',
 'abandonnassiez\n',
 'abandonnassions\n',
 'abandonnâmes\

and we'll be able to work from here onwards.