In [17]:
import os
import pandas as pd
from itertools import groupby
from operator import itemgetter
import re

from nltk import regexp_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
import nltk
from nltk.corpus import stopwords
from string import punctuation

import datetime

In [None]:
# Download the NLTK resources used further down in this notebook.
# NOTE(review): presumably the tagger model backs `pos_tag` and 'omw-1.4' backs the
# WordNet lemmatizer. The 'stopwords' and 'wordnet' corpora are not downloaded
# here — confirm they are already installed before running on a fresh machine.
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

In [10]:
# helper that loads one text file from disk
def read_file(file):
    """Return the full contents of the UTF-8 encoded text file at path `file`."""
    with open(file, mode='r', encoding="utf-8") as handle:
        return handle.read()

# set path to text files
file_dir = './Data/Text_Files/'
text_list = []


def _natural_sort_key(name):
    """Sort key that orders digit runs numerically, so '2.txt' precedes '10.txt'."""
    # re.split with a capturing group alternates text / digits / text ...,
    # so every odd position is a digit run and can be compared as an int.
    parts = re.split(r'(\d+)', name)
    return [int(part) if idx % 2 else part.lower() for idx, part in enumerate(parts)]


# os.listdir() returns entries in arbitrary order, but a later cell pairs
# text_list with the metadata rows by position, so read the files in a
# deterministic order.
# NOTE(review): this assumes the file names sort in the same order as the rows of
# metadata.csv — confirm against the naming scheme of the text files.
for file in sorted(os.listdir(file_dir), key=_natural_sort_key):
    if file.endswith(".txt"):
        file_path = os.path.join(file_dir, file)
        text_list.append(read_file(file_path))

In [13]:
# Sanity check: report how many text files were loaded into text_list.
print(f"number of texts: {len(text_list)}")

number of texts: 82


In [14]:
# Preview the first 500 characters of the first loaded text.
print(f"sample text: {text_list[0][:500]}")

sample text: 
Mr. President, Mr. Speaker, Members of the 85th Congress:
It is again my high privilege to extend personal greetings to the members of the 85th Congress.
All of us realize that, as this new session begins, many Americans are troubled about recent world developments which they believe may threaten our nation's safety. Honest men differ in their appraisal of America's material and intellectual strength, and the dangers that confront us. But all know these dangers are real.
The purpose of this mes


In [16]:
# Load the per-speech metadata table and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved
# row index — consider `index_col=0` if that column is not needed; confirm first.
df = pd.read_csv("./Data/metadata.csv")
df.head()

Unnamed: 0,Index,President,Categories,Date,Citation,Title
0,0,Harry S. Truman,"['Presidential', 'State of the Union Addresses...","January 06, 1947","Harry S. Truman, Annual Message to the Congres...",Annual Message to the Congress on the State of...
1,1,Harry S. Truman,"['Presidential', 'State of the Union Addresses...","January 07, 1948","Harry S. Truman, Annual Message to the Congres...",Annual Message to the Congress on the State of...
2,2,Harry S. Truman,"['Presidential', 'State of the Union Addresses...","January 05, 1949","Harry S. Truman, Annual Message to the Congres...",Annual Message to the Congress on the State of...
3,3,Harry S. Truman,"['Presidential', 'State of the Union Addresses...","January 04, 1950","Harry S. Truman, Annual Message to the Congres...",Annual Message to the Congress on the State of...
4,4,Harry S. Truman,"['Presidential', 'State of the Union Addresses...","January 08, 1951","Harry S. Truman, Annual Message to the Congres...",Annual Message to the Congress on the State of...


#### Change Date Type to datetime

In [18]:
# Inspect the dtype of the 'Date' column (the column this section converts)
# before turning it into datetimes.
print(df['Date'].dtypes)

object


In [19]:
# Parse strings of the form "<full month name> <day>, <4-digit year>"
# (e.g. "January 06, 1947") into pandas datetimes, overwriting the column,
# then display the converted column.
df['Date'] = pd.to_datetime(df['Date'], format='%B %d, %Y')
df['Date']

0    1947-01-06
1    1948-01-07
2    1949-01-05
3    1950-01-04
4    1951-01-08
        ...    
77   2016-01-12
78   2018-01-30
79   2019-02-05
80   2020-02-04
81   2022-03-01
Name: Date, Length: 82, dtype: datetime64[ns]

#### Group by presidents:
`dict[president_name]` = `[texts]`


In [None]:
president_name = df['President'].tolist()

# Every metadata row must correspond to exactly one loaded text; zip() would
# otherwise silently drop the unmatched tail.
if len(president_name) != len(text_list):
    raise ValueError(
        f"metadata has {len(president_name)} rows but {len(text_list)} texts were loaded")

# Map each president to the list of their texts, keys in first-appearance order.
# itertools.groupby only merges *consecutive* equal keys, so building a dict from
# it would overwrite a president's earlier texts whenever their rows were not
# adjacent; accumulating with setdefault is correct for any row order.
president_text_dict = {}
for _name, _text in zip(president_name, text_list):
    president_text_dict.setdefault(_name, []).append(_text)

In [None]:
# Display the president names that became keys of the grouping.
president_text_dict.keys()

dict_keys(['Harry S. Truman', 'Dwight D. Eisenhower', 'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon', 'Gerald R. Ford', 'Jimmy Carter', 'Ronald Reagan', 'George Bush', 'William J. Clinton', 'George W. Bush', 'Barack Obama', 'Donald J. Trump', 'Joseph R. Biden'])

In [None]:
# Spot-check: number of texts grouped under a single president.
len(president_text_dict['Barack Obama'])

7

In [None]:
# Token grammar used by tokenize_text, adapted from
# http://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
#
# Alternatives are tried left to right at each position, so a bare number such as
# "82" is consumed by the word alternative; the currency alternative only wins
# when the token starts with "$".
#
# The currency alternative has no trailing \s?: matching the following space made
# tokens such as '$12.40 ' carry whitespace, which showed up as double spaces
# once tokens were re-joined with ' '.
pattern = re.compile(r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
      | \w+(?:-\w+)*        # words with optional internal hyphens
      | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
      | \.\.\.              # ellipsis
      | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    ''')

def tokenize_text(text):
    """Split `text` into a list of tokens by applying the module-level `pattern`
    through NLTK's `regexp_tokenize`."""
    tokens = regexp_tokenize(text, pattern)
    return tokens

In [None]:
def lemmatize_all(sentence):
    """Tokenize `sentence`, POS-tag the tokens and return their WordNet lemmas.

    The first letter of each tag selects the part of speech handed to the
    lemmatizer: N -> noun, V -> verb, J -> adjective, R -> adverb. Tokens with
    any other tag are lemmatized as nouns.

    Returns a list with one lemma per token, in token order.
    """
    tag_prefix_to_pos = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(tokenize_text(sentence))
    return [
        lemmatizer.lemmatize(token, pos=tag_prefix_to_pos.get(tag[:1], 'n'))
        for token, tag in tagged_tokens
    ]

In [None]:
# Quick demonstration of lemmatize_all on a hand-written sentence.
sentence_sample = 'The striped bats are  hanging on their feet for best and wrote you are'
print(lemmatize_all(sentence_sample))



['The', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best', 'and', 'write', 'you', 'be']


In [None]:
stop_words = set(stopwords.words('english'))


def normalize_and_cleaning(text, remove_punct, lower, stop_word, remove_number, min_len):
    """Clean `text` and return the result as one space-separated string.

    Steps, each controlled by its flag:
      1. remove_number: delete number-like spans (optional "$", digits, optional
         decimal part and any attached word characters, e.g. "$12.40", "3th").
      2. Tokenize the text with `tokenize_text`.
      3. remove_punct: drop tokens made up only of punctuation characters.
      4. stop_word: drop tokens whose lowercase form is in `stop_words`.
      5. min_len: drop tokens shorter than `min_len` characters
         (a falsy value such as 0 or None keeps every token).
      6. lower: lowercase the surviving tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.
    remove_punct, lower, stop_word, remove_number : bool
        Switches for the corresponding steps above.
    min_len : int or None
        Minimum token length to keep.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    if remove_number:
        # Operate on the function's own argument, not on a notebook global.
        text = re.sub(r'\$?\d+\.?\d*\w*\d*', '', text)

    # Strip each token and discard empties: the tokenizer pattern may attach a
    # trailing space to currency tokens.
    tokens = [tok.strip() for tok in tokenize_text(text)]
    tokens = [tok for tok in tokens if tok]

    if remove_punct:
        # `punctuation` comes from `from string import punctuation`, so this does
        # not depend on the `string` module being imported in a later cell.
        tokens = [tok for tok in tokens if not all(ch in punctuation for ch in tok)]

    if stop_word:
        tokens = [tok for tok in tokens if tok.lower() not in stop_words]

    if min_len:
        tokens = [tok for tok in tokens if len(tok) >= min_len]

    if lower:
        # Lowercase after tokenizing: the tokenizer's abbreviation rule matches
        # capital letters, so lowercasing first would split "U.S.A." apart.
        tokens = [tok.lower() for tok in tokens]

    return ' '.join(tokens)

    
    

In [None]:
# Scratch inputs for the punctuation experiments in this cell:
# `l` is the token list of the sample sentence, `t` is the raw sentence itself.
l = tokenize_text('That U.S.A. poster-print e.g the U.S costs $12.40 amir. hello')
t = 'That U.S.A. poster-print e.g the U.S costs $12.40 amir. hello'
def punct(text):
    """Drop one-character tokens from `text`, re-tokenize the remainder and strip
    leading/trailing punctuation characters from every token.

    Returns the resulting tokens joined by single spaces.
    """
    kept = []
    for token in tokenize_text(text):
        if len(token) > 1:
            kept.append(token)
    rejoined = ' '.join(kept)
    stripped = [token.strip(punctuation) for token in tokenize_text(rejoined)]
    return ' '.join(stripped)
        
#punct(t) 
import re, string

def test(text):
    """Return the tokens of `text` joined by spaces, leaving out every token that
    occurs as a substring of `string.punctuation` (e.g. a single punctuation mark)."""
    kept_tokens = []
    for word in tokenize_text(text):
        if word in string.punctuation:
            continue
        kept_tokens.append(word)
    return ' '.join(kept_tokens)
# Compare both punctuation-removal variants on the same sentence.
print(test(t))
print(punct(t))
#print(l)

That U.S.A. poster-print e.g the U.S costs $12.40  amir hello
That U.S.A poster-print e.g the U.S costs 12.40  amir hello


In [None]:
# Sample sentence mixing prices, ordinals, a year and digit-prefixed words,
# used to exercise remove_num.
l = 'That U.S.A. poster-print e.g the U.S costs $12.40 3th amir. hello 7. 9 2th 2nd2 5-year $56 $4 1890'

def remove_num(t):
    """Delete number-like spans from `t` and collapse runs of spaces.

    A span is an optional "$", one or more digits, an optional "." with further
    digits, and any word characters attached directly after them (so "3th" and
    "2nd2" are removed whole). Runs of spaces are then reduced to one space.
    """
    number_like = re.compile(r'\$?\d+\.?\d*\w*\d*')
    without_numbers = number_like.sub('', t)
    return re.sub(' +', ' ', without_numbers)

# Strip the numbers from the sample, then print it with punctuation tokens removed.
y = remove_num(l)
print(test(y))


That U.S.A. poster-print e.g the U.S costs amir hello year


In [None]:
# Print the complete English stop-word list provided by NLTK.
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '