### term frequency inverse document frequency analysis

In [1]:
# Function to convert/split text to words
def text_to_words(text):
    import re
    # [12], [1], [123] <- Wiki citation char sequences
    # \[\d+\]
    # 's
    text = re.sub("\(|\)|\[\d+\]|('s)", "", text)
    # Match a single space " " OR ", " (comma and a space)
    split_pattern = " |, |-|\n|\."
    text = re.split(split_pattern, text)    
    # Only keep terms that are alphabetic characters
    text = [word for word in text if word.isalpha()]
    print(f"The text has been converted to {len(text)} words.")
    
    return text


In [2]:
def read_all_text(filename):
    with open(filename, encoding="utf-8") as f:
        print(f"\n\nReading contents of file: {filename}")
        return f.read().lower()

spacex_text = read_all_text("spacex.txt")
spacex_words = text_to_words(spacex_text)

# Get all the words for the blue origin and virgin galactic files
blue_origin_text = read_all_text("blue-origin.txt")
blue_origin_words = text_to_words(blue_origin_text)

virgin_galactic_text = read_all_text("virgin-galactic.txt")
virgin_galactic_words = text_to_words(virgin_galactic_text)




Reading contents of file: spacex.txt
The text has been converted to 5014 words.


Reading contents of file: blue-origin.txt
The text has been converted to 4875 words.


Reading contents of file: virgin-galactic.txt
The text has been converted to 2643 words.


# Lets do the TF for all terms in the spacex document

In [3]:
spacex_unique_words = list(set(spacex_words))
len(spacex_unique_words)

1211

In [4]:
spacex_unique_words_freq = {}
for word in spacex_unique_words:
    spacex_unique_words_freq[word] = spacex_words.count(word)

#spacex_unique_words_freq

## document frequency - is the word occuring in all the documents ??


In [5]:
# to make the document frequencies to see if the word appears in all 3 documents we checking 
specex_unique_words_doc_freq = {}

for word in spacex_unique_words:
    doc_count = 0
    if word in spacex_words: # we know it is cos we took it from there so we can change doc_count =1
        doc_count +=1        # and delete these two lines 
    if word in virgin_galactic_words:
        doc_count +=1 
    if word in blue_origin_words:
        doc_count +=1    
     
    specex_unique_words_doc_freq[word]= doc_count
    
# the word space appeared in all 3 documents 
print(f'The word "space" occured in : {specex_unique_words_doc_freq["lower"]} documents')   


The word "space" occured in : 2 documents


In [6]:
import pandas as pd 
pd.set_option('precision',4)

df_tf = pd.DataFrame(spacex_unique_words_freq.items(),columns = ['Word','Frequency'])
#df_tf.sort_values(by= 'Frequency',inplace = True,ascending = False,ignore_index = True)

df_tf.sample(5)

Unnamed: 0,Word,Frequency
948,subsidiary,1
130,based,1
70,covid,2
493,late,6
714,expected,3


In [7]:
# add term frequency column -> occurances over TOTAL num of words (including stop words )
df_tf['TF'] = df_tf['Frequency'] / len (spacex_words)


In [8]:
# add document frequency column -> is the word in all 3 document is it popular therefore not as special 
df_tf['DF'] = [specex_unique_words_doc_freq[x] for x in spacex_unique_words_freq ]
 


In [9]:
df_tf[df_tf['Word'] == 'throat']

Unnamed: 0,Word,Frequency,TF,DF
874,throat,1,0.0002,1


In [10]:
df_tf[df_tf['Word'] == 'the']

Unnamed: 0,Word,Frequency,TF,DF
281,the,321,0.064,3


## adding the last column for TF- IDF

In [14]:
# TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

# IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

import math

In [18]:
# looking at data frame values to calculate TF * DF
tf_idf_values =[]
num_docs_in_corpus =3

for row in df_tf.values:
    tf = row[2]
    df = row[3]
    tf_idf = tf*math.log(num_docs_in_corpus/df)
    
    tf_idf_values.append(tf_idf)
    
    
#add it to a dataframe 
df_tf['TF-IDF'] = tf_idf_values

df_tf.sort_values(by = 'TF-IDF',inplace = True, ascending = False )

df_tf.head(10)
    

Unnamed: 0,Word,Frequency,TF,DF,TF-IDF
830,dragon,41,0.0082,1,0.009
546,spacex,108,0.0215,2,0.0087
456,musk,34,0.0068,1,0.0074
362,falcon,59,0.0118,2,0.0048
1108,mars,19,0.0038,1,0.0042
295,international,16,0.0032,1,0.0035
198,starship,13,0.0026,1,0.0028
241,demo,12,0.0024,1,0.0026
1154,cargo,12,0.0024,1,0.0026
337,stage,30,0.006,2,0.0024


In [20]:
# Because word launch is in all 3 documents its loosing its value even tho it occurs 60times in a document

df_tf[df_tf['Word']=='launch']

Unnamed: 0,Word,Frequency,TF,DF,TF-IDF
39,launch,60,0.012,3,0.0
