In [1]:
# this bit of code uses the os.walk method from Python's os module to generate a list 
# of all the .txt files in the 'txt' folder
# os.walk returns the root directory of a folder, a list of all subfolders, 
# and a list of all files in the directory, including all files in its subdirectories 
# I then loop through the list of files and use the endsith method to verify I'm finding only text files
# I then append each text file name to the list called all_txt_files
# Finally, I return the length of all_txt_files to verify that I've found 366 file names
# This loop-and-append approach is very common in Python. You might even call it Pythonic.

import os
all_txt_files =[]
for root, dirs, files in os.walk("txt"):
    for file in files:
        if file.endswith(".txt"):
            all_txt_files.append(os.path.join(root, file))
n_files = len(all_txt_files)
all_txt_files[365]

'txt/0426.txt'

In [2]:
# the sort method ensures that the list of files is ordered by file name
# I then return all_txt_files[0] to verify that txt/0101.txt is the first item in the list
all_txt_files.sort()
all_txt_files[0]

'txt/0101.txt'

In [3]:
# In this bit of code, I do another loop-and-append 
# this time, I loop my list of file names and open each file. 
# I then use the read method to convert each text file to a string,
# and I append each string, one by one, to a new list called all_docs.
# Crucially, the string objects in the all_docs list implicitly have the order
# as the file names in the all_txt_files list 

all_docs = []
for i in all_txt_files:
    with open(i) as f:
        txt = f.read()
    all_docs.append(txt)

In [4]:
#import the TfidfVectorizer from scikit-learn.  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
vectorizer_counts = CountVectorizer()
X_counts = vectorizer_counts.fit_transform(all_docs)

# TfidfVectorizer is a class, so I instantiate it with specific pararmeters as 'vectorizer'
# I then run the object's fit_transform() method on my list of strings (all_docs)
# The stored variable X is output of the fit_transform() method 
vectorizer = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, use_idf=True, norm=None)
X = vectorizer.fit_transform(all_docs)

In [6]:
a_counts = X_counts.toarray()
# The fit_transform() method converts the list of strings to a sparse matrix of TF-IDF values
# The toarray method converts a numpy array, which makes it easier to indpect every values including the zeros 
myarray = X.toarray()

In [7]:
# this line of code verifies that the numpy array represents the same number of 
# documents that we have in the file list
len(myarray)

366

In [26]:
from scipy import spatial
import numpy as np 

similarities = []
for text in myarray:
    score = 1 - spatial.distance.cosine(list(text), list(myarray[125]))
    if np.isnan(score):
        score = 0.0
    similarities.append(score)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [24]:
import pandas as pd

output_filenames = [i.replace(".txt", ".csv").replace("txt/", "tf_idf_output/") for i in all_txt_files]
df_sim = pd.DataFrame(similarities, columns=["cos_sim"])
df_sim['filenames'] = output_filenames 
df_sim = df_sim.sort_values(by="cos_sim", ascending=False).reset_index(drop=True)
df_sim

Unnamed: 0,cos_sim,filenames
0,1.000000,tf_idf_output/0505.csv
1,0.318625,tf_idf_output/0601.csv
2,0.307766,tf_idf_output/0124.csv
3,0.284789,tf_idf_output/0728.csv
4,0.279819,tf_idf_output/0829.csv
5,0.276453,tf_idf_output/1011.csv
6,0.273109,tf_idf_output/1027.csv
7,0.272643,tf_idf_output/0516.csv
8,0.272082,tf_idf_output/0323.csv
9,0.267439,tf_idf_output/0407.csv


In [29]:
#doc_0_feature_scores = list(zip(vectorizer.get_feature_names(), myarray[0]))

import pandas as pd
import os

# make the output folder if it doesn't already exist
if not os.path.exists("tf_idf_output"):
    os.makedirs("tf_idf_output")

# construct a list of output file paths using the previous list of text files the relative path for tf_idf_output
output_filenames = [i.replace(".txt", ".csv").replace("txt/", "tf_idf_output/") for i in all_txt_files]

# loop each item in myarray, using enumerate to keep track of the current position
for n, doc in enumerate(myarray):
    # construct a dataframe
    data = list(zip(vectorizer.get_feature_names(), doc))
    df = pd.DataFrame.from_records(data, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)

    # output to a csv using the enumerated value for the filename
    df.to_csv(output_filenames[n])

In [27]:
import pandas as pd
terms_count = vectorizer_counts.get_feature_names()

dfs_count = []
for c in range(len(a_counts)):
    data_counts = list(zip(terms_count, a_counts[c,:]))
    df_count = pd.DataFrame.from_records(data_counts, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    df_count = df_count.loc[df_count['score'] > 0]
    dfs_count.append(df_count)
#
#
#
terms = vectorizer.get_feature_names()

dfs = []
for c in range(len(a)):
    data = list(zip(terms, a[c,:]))
    df = pd.DataFrame.from_records(data, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    df = df.loc[df['score'] > 0]
    dfs.append(df)


In [28]:
names = ["dubois", "bly", "sinclair", "tarbell", "cather"]
result = []
indices = [53, 125, 263, 309, 341]
for e,i in enumerate(dfs):
    if e in indices:
        result.append(i)
result_count = []
for e,i in enumerate(dfs_count):
    if e in indices:
        result_count.append(i)

In [24]:
#for e, i in enumerate(result):
#    i.to_csv(names[e]+'.csv')
for e, i in enumerate(result_count):
    i.to_csv(names[e]+'_count.csv')

In [None]:
#0505 bly = 125
#0920 sinclair = 263
#0223 dubois = 53
#1207 cather = 341
#1105 tarbell = 309

In [16]:
dfs[188].to_csv("unknown_obit.csv")

In [25]:
dfs[125].to_csv("bly_no_norm.csv")

In [30]:
# to do tf-df by hand
# terms and counts
# df of each term


In [20]:
list(vectorizer.idf_)

[5.806749559386461,
 1.5582543173371017,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 5.51906748693468,
 6.212214667494625,
 4.959451698999257,
 5.806749559386461,
 6.212214667494625,
 6.212214667494625,
 5.806749559386461,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 5.29592393562047,
 6.212214667494625,
 6.212214667494625,
 6.212214667494625,
 1.775463133131497,
 2.270632859824935,
 5.29592393562047,
 5.113602378826515,
 6.212214667494625,
 5.29592393562047,
 5.806749559386461,
 6.212214667494625,
 5.113602378826515,
 6.212214667494625,
 5.51906748693468,
 6.212214667494625,
 5.29592393562047,
 6.212214667494625,
 5.113602378826515,
 6.212214667494625,
 4.959451698999257,
 2.204881482262154,
 4.602776755060525,
 5.29592393562047,
 5.806749559386461,
 5.806749559386461,
 5.806749559386461,
 6.212214667494625,
 5.51906748693468,
 6.2122

In [21]:
vectorizer.vocabulary_['afternoon']


1480

In [22]:
vectorizer.idf_[1480]

2.700669228663604

In [49]:
bly_terms = list(dfs_count[125]['term'])

In [50]:
bly_doc_counts = []
for t in bly_terms:
    bly_term_count = 0
    for d in dfs_count:
        doc_dict = dict(list(zip(list(d['term']), list(d['score']))))
        try:
            c = doc_dict[t]
            bly_term_count +=1
        except:
            pass
    bly_doc_counts.append(bly_term_count)

In [51]:
df_idf_bly = pd.DataFrame.from_records(list(zip(bly_terms, bly_doc_counts)), columns=['term', 'count'])

In [54]:
df_idf_bly = df_idf_bly.sort_values(by="term")
df_idf_bly.to_csv("bly_idf.csv")

In [60]:
all_docs[0][39:300]

' The brushed-back, gray-brown hair was straight and thin--not the wiry dark curls of a few year ago. He walked stiffly, although his figure was trim and erect. Behind his glasses, his dark brown eyes looked fixed, and he seemed to be daydreaming. At the age of '

In [61]:
a = 'The once ruddy face was puffy and pale'
b = 'The gray hair was straight and thin'
c = 'His dark brown eyes looked fixed, and he seemed to be daydreaming'
d = 'his figure was trim and erect'
e = [a,b,c,d]

vectorizer_test = CountVectorizer()
X_test = vectorizer_test.fit_transform(e)

# TfidfVectorizer is a class, so I instantiate it with specific pararmeters as 'vectorizer'
# I then run the object's fit_transform() method on my list of strings (all_docs)
# The stored variable X is output of the fit_transform() method 
vectorizer_test_tfidf = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, use_idf=True, norm=None)
X_test_tfidf = vectorizer_test_tfidf.fit_transform(e)


# The fit_transform() method converts the list of strings to a sparse matrix of TF-IDF values
# The toarray method converts a numpy array, which makes it easier to indpect every values including the zeros 
a_test = X_test.toarray()
a_test_tfidf = X_test_tfidf.toarray()

In [70]:
list(zip(vectorizer_test_tfidf.get_feature_names(), a_test_tfidf[0]))

[(u'be', 0.0),
 (u'brown', 0.0),
 (u'dark', 0.0),
 (u'daydreaming', 0.0),
 (u'erect', 0.0),
 (u'eyes', 0.0),
 (u'face', 1.916290731874155),
 (u'figure', 0.0),
 (u'fixed', 0.0),
 (u'gray', 0.0),
 (u'hair', 0.0),
 (u'he', 0.0),
 (u'his', 0.0),
 (u'looked', 0.0),
 (u'once', 1.916290731874155),
 (u'pale', 1.916290731874155),
 (u'puffy', 1.916290731874155),
 (u'ruddy', 1.916290731874155),
 (u'seemed', 0.0),
 (u'straight', 0.0),
 (u'the', 1.5108256237659907),
 (u'thin', 0.0),
 (u'to', 0.0),
 (u'trim', 0.0)]

In [71]:
list(zip(vectorizer_test.get_feature_names(), a_test[0]))

[(u'and', 1),
 (u'be', 0),
 (u'brown', 0),
 (u'dark', 0),
 (u'daydreaming', 0),
 (u'erect', 0),
 (u'eyes', 0),
 (u'face', 1),
 (u'figure', 0),
 (u'fixed', 0),
 (u'gray', 0),
 (u'hair', 0),
 (u'he', 0),
 (u'his', 0),
 (u'looked', 0),
 (u'once', 1),
 (u'pale', 1),
 (u'puffy', 1),
 (u'ruddy', 1),
 (u'seemed', 0),
 (u'straight', 0),
 (u'the', 1),
 (u'thin', 0),
 (u'to', 0),
 (u'trim', 0),
 (u'was', 1)]

In [92]:
import math
#n = 4.0
#df = 1.0
#tf = 1.0
n = 366.0
df = 66.0
tf = 1.0

# log [ (1 + n) / (1 + df(d, t)) ] + 1

math.log((1+n)/(1+df)) + 1

2.700669228663604

In [76]:
1+math.log(1)

1.0

In [98]:
t = 367.0/67.0
math.log(t)
t

5.477611940298507

In [27]:
names = ["dubois", "bly", "sinclair", "tarbell", "cather"]
result = []
indices = [53, 125, 263, 309, 341]

for i in indices:
    print(output_filenames[i])
    

tf_idf_output/0223.csv
tf_idf_output/0505.csv
tf_idf_output/0920.csv
tf_idf_output/1105.csv
tf_idf_output/1207.csv
