In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

speech_df = pd.read_csv("https://assets.datacamp.com/production/repositories/3752/datasets/cdc15798dd6698003ee33c6af185242faf896187/inaugural_speeches.csv")

In [9]:
# speech_df.head()

# Replace all non letter characters with a whitespace
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ')

# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()

# Instantiate TfidfVectorizer
tv = TfidfVectorizer(max_features = 100,
                    stop_words = 'english')

# Fit the vectroizer and transform the data
tv_transformed = tv.fit_transform(speech_df['text_clean'])

# Create a DataFrame with these features
tv_df = pd.DataFrame(tv_transformed.toarray(), 
                     columns=tv.get_feature_names_out()).add_prefix('TFIDF_')
print(tv_df.head())

   TFIDF_0092  TFIDF_0097  TFIDF_action  TFIDF_administration  TFIDF_america  \
0         0.0    0.047468      0.000000              0.133265       0.000000   
1         0.0    0.000000      0.000000              0.261016       0.266097   
2         0.0    0.021955      0.000000              0.092456       0.157092   
3         0.0    0.131111      0.000000              0.092023       0.000000   
4         0.0    0.028455      0.041523              0.039943       0.000000   

   TFIDF_american  TFIDF_americans  TFIDF_believe  TFIDF_best  TFIDF_better  \
0        0.105269              0.0       0.000000    0.000000      0.000000   
1        0.000000              0.0       0.000000    0.000000      0.000000   
2        0.073033              0.0       0.000000    0.026118      0.060473   
3        0.000000              0.0       0.090286    0.116980      0.045143   
4        0.031552              0.0       0.000000    0.067701      0.039189   

   ...  TFIDF_things  TFIDF_time  TFIDF_toda

In [11]:
# Isolate the row to be examined
sample_row = tv_df.iloc[0]

# Print the top 5 words of the sorted output
print(sample_row.sort_values(ascending=False).head())

TFIDF_government    0.367016
TFIDF_public        0.332862
TFIDF_present       0.314827
TFIDF_duty          0.238368
TFIDF_country       0.229385
Name: 0, dtype: float64


In [19]:
# transforming unseen data
from sklearn.model_selection import train_test_split

# split dataset
train_speech_df, test_speech_df = train_test_split(speech_df, test_size = 0.3)

# Instantiate TfidfVectorizer
tv = TfidfVectorizer(max_features=100, stop_words='english')

# Fit the vectroizer and transform the data
tv_transformed = tv.fit_transform(train_speech_df['text_clean'])

# Transform test data
test_tv_transformed = tv.transform(test_speech_df['text_clean'])

# Create new features for the test set
test_tv_df = pd.DataFrame(test_tv_transformed.toarray(), 
                          columns=tv.get_feature_names_out()).add_prefix('TFIDF_')
print(test_tv_df.head())

   TFIDF_0092  TFIDF_0097  TFIDF_administration  TFIDF_america  \
0         0.0    0.000000              0.000000       0.000000   
1         0.0    0.014693              0.147716       0.084696   
2         0.0    0.160797              0.000000       0.231716   
3         0.0    0.035171              0.110494       0.380124   
4         0.0    0.251716              0.000000       0.077729   

   TFIDF_american  TFIDF_americans  TFIDF_believe  TFIDF_best  TFIDF_better  \
0        0.000000         0.000000       0.000000    0.039514      0.000000   
1        0.113632         0.028273       0.038960    0.052573      0.061751   
2        0.050756         0.088402       0.000000    0.027397      0.160898   
3        0.155426         0.033838       0.023314    0.083895      0.049271   
4        0.019864         0.069194       0.000000    0.000000      0.000000   

   TFIDF_century  ...  TFIDF_strength  TFIDF_support  TFIDF_time  TFIDF_today  \
0       0.000000  ...        0.047743       0.0