In [1]:
import numpy as np
import pandas as pd

addresses = pd.read_csv("https://assets.datacamp.com/production/repositories/3752/datasets/cdc15798dd6698003ee33c6af185242faf896187/inaugural_speeches.csv")

In [5]:
speech_df = addresses

# Replace all non letter characters with a whitespace
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ')

# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()

# Print the first 5 rows of the text_clean column
print(speech_df['text_clean'].head())

0    fellow-citizens of the senate and of the house...
1    fellow citizens:  i am again called upon by th...
2    when it was first perceived, in early times, t...
3    friends and fellow-citizens:  called upon to u...
4    proceeding, fellow-citizens, to that qualifica...
Name: text_clean, dtype: object


In [11]:
# Find the length of each text
speech_df['char_cnt'] = speech_df['text_clean'].str.len()

# Count the number of words in each text
speech_df['word_cnt'] = speech_df['text_clean'].str.split().str.len()

# Find the average length of word
speech_df['avg_word_length'] = speech_df['char_cnt'] / speech_df['word_cnt']

# Print the first 5 rows of these columns
print(speech_df[['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']])

                                           text_clean  char_cnt  word_cnt  \
0   fellow-citizens of the senate and of the house...      8616      1427   
1   fellow citizens:  i am again called upon by th...       787       135   
2   when it was first perceived, in early times, t...     13871      2317   
3   friends and fellow-citizens:  called upon to u...     10144      1717   
4   proceeding, fellow-citizens, to that qualifica...     12902      2157   
5   unwilling to depart from examples of the most ...      7003      1173   
6   about to add the solemnity of an oath to the o...      7148      1210   
7   i should be destitute of feeling if i was not ...     19894      3367   
8   fellow-citizens:  i shall not attempt to descr...     26322      4462   
9   in compliance with an usage coeval with the ex...     17753      2907   
10  fellow-citizens:  about to undertake the arduo...      6818      1124   
11  fellow-citizens:  the will of the american peo...      7061      1171   

In [33]:
# Word counts in Python

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df = 0.1, max_df = 0.9)

cv.fit(speech_df['text_clean'])
cv_transformed = cv.transform(speech_df['text_clean'])
cv_transformed.toarray()

# get a list of the features generated
feature_names = cv.get_feature_names_out()
print(feature_names)


['0092' '0097' 'abandon' ... 'your' 'zeal' 'zealously']


In [27]:
# Instantiate CountVectorizer
cv = CountVectorizer()

# Fit the vectorizer
cv.fit(speech_df['text_clean'])

# Print feature names
print(cv.get_feature_names_out())

['0085' '0092' '0093' ... 'zealous' 'zealously' 'zone']


In [35]:
# Apply the vectorizer
cv_transformed = cv.transform(speech_df['text_clean'])

# Print the full array
cv_array = cv_transformed.toarray()
print(cv_array)

[[ 0  1  0 ...  9  0  0]
 [ 0  0  0 ...  1  0  0]
 [ 0  1  0 ...  1  1  0]
 ...
 [12 23  0 ...  3  0  0]
 [12 22  0 ...  0  0  0]
 [10  7  0 ... 11  0  0]]


In [37]:
print(cv_array.shape)

(58, 1932)


In [39]:
# limiting our features

# Specify arguements to limit the number of features generated
cv = CountVectorizer(min_df = 0.2, max_df = 0.8)

# Fit, transform, and convert into array
cv_transformed = cv.fit_transform(speech_df['text_clean'])
cv_array = cv_transformed.toarray()

# Print the array shape
print(cv_array.shape)

(58, 818)


In [41]:
# Creating a dataframe for use in further analysis

# Create a DataFrame with these features
cv_df = pd.DataFrame(cv_array, 
                     columns=cv.get_feature_names_out()).add_prefix('Counts_')

# Add the new columns to the original DataFrame
speech_df_new = pd.concat([speech_df, cv_df], axis=1, sort=False)
print(speech_df_new.head())

                Name         Inaugural Address                      Date  \
0  George Washington   First Inaugural Address  Thursday, April 30, 1789   
1  George Washington  Second Inaugural Address     Monday, March 4, 1793   
2         John Adams         Inaugural Address   Saturday, March 4, 1797   
3   Thomas Jefferson   First Inaugural Address  Wednesday, March 4, 1801   
4   Thomas Jefferson  Second Inaugural Address     Monday, March 4, 1805   

                                                text  \
0  Fellow-Citizens of the Senate and of the House...   
1  Fellow Citizens:  I AM again called upon by th...   
2  WHEN it was first perceived, in early times, t...   
3  Friends and Fellow-Citizens:  CALLED upon to u...   
4  PROCEEDING, fellow-citizens, to that qualifica...   

                                          text_clean  char_cnt  word_cnt  \
0  fellow-citizens of the senate and of the house...      8616      1427   
1  fellow citizens:  i am again called upon by th...  