In [3]:
import numpy as np
import pandas as pd

speech_df = pd.read_csv("https://assets.datacamp.com/production/repositories/3752/datasets/cdc15798dd6698003ee33c6af185242faf896187/inaugural_speeches.csv")


In [5]:
# Replace all non letter characters with a whitespace
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ')

# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()


# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate a trigram vectorizer
cv_trigram_vec = CountVectorizer(max_features=100, 
                                 stop_words='english', 
                                 ngram_range = (3, 3))

# Fit and apply trigram vectorizer
cv_trigram = cv_trigram_vec.fit_transform(speech_df['text_clean'])

# Print the trigram features
print(cv_trigram_vec.get_feature_names_out())

['0092 ideal freedom' 'ability preserve protect'
 'agriculture commerce manufactures' 'america 0092 ideal'
 'best ability preserve' 'best interests country' 'bless god bless'
 'bless united states' 'chief justice mr' 'children children children'
 'citizens united states' 'civil religious liberty'
 'commerce united states' 'concern thank god' 'confidence fellow citizens'
 'congress extraordinary session' 'constitution does expressly'
 'constitution united states' 'coordinate branches government'
 'day task people' 'defend constitution united' 'desire determined work'
 'distinguished guests fellow' 'does expressly say' 'equal exact justice'
 'era good feeling' 'executive branch government'
 'faithfully execute office' 'fellow citizens assembled'
 'fellow citizens called' 'fellow citizens large' 'fellow citizens world'
 'form perfect union' 'general welfare secure' 'god bless america'
 'god bless god' 'good greatest number' 'government united states'
 'granted federal government' 'great b

In [7]:
# Create a DataFrame of the features
cv_tri_df = pd.DataFrame(cv_trigram.toarray(), 
                 columns=cv_trigram_vec.get_feature_names_out()).add_prefix('Counts_')

# Print the top 5 words in the sorted output
print(cv_tri_df.sum().sort_values(ascending=False).head())

Counts_constitution united states    20
Counts_people united states          13
Counts_mr chief justice              10
Counts_preserve protect defend       10
Counts_president united states        8
dtype: int64
