In [1]:
import pandas as pd
import numpy as np

%config Completer.use_jedi = False

In [2]:
df = pd.read_csv('./input_data/inaugural_speeches.csv')
df.head()

Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t..."
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica..."


### Cleansing data

In [3]:
df['clean_text'] = df.text.str.replace('[^a-zA-Z]',' ')

In [4]:
df['clean_text'] = df.text.str.lower().str.replace('\s+',' ')

### Aggregate hight level feature from text

In [5]:
df['char_count'] = df.clean_text.str.len()

In [6]:
df['word_count'] = df.clean_text.str.split().str.len()

In [7]:
df['avg_word_len'] = df['char_count']/df['word_count'] 

In [8]:
df.head()

Unnamed: 0,Name,Inaugural Address,Date,text,clean_text,char_count,word_count,avg_word_len
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...,fellow-citizens of the senate and of the house...,8616,1427,6.037842
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...,fellow citizens: i am again called upon by th...,787,135,5.82963
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t...","when it was first perceived, in early times, t...",13871,2317,5.986621
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...,friends and fellow-citizens: called upon to u...,10144,1717,5.907979
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica...","proceeding, fellow-citizens, to that qualifica...",12902,2157,5.981456


In [9]:
df.clean_text[0]

'fellow-citizens of the senate and of the house of representatives:  among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the   th day of the present month. on the one hand, i was summoned by my country, whose voice i can never hear but with veneration and love, from a retreat which i had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years<u+0097>a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. on the other hand, the magnitude and difficulty of the trust to which the voice of my country called me, being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications, could n

In [10]:
df.text[0]

'Fellow-Citizens of the Senate and of the House of Representatives:  AMONG the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the   th day of the present month. On the one hand, I was summoned by my country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years<U+0097>a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude and difficulty of the trust to which the voice of my country called me, being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications, could n

### CountVectorize

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
vectorizer = CountVectorizer(max_features=300)

In [48]:
vectorizer.fit(df['clean_text'])

In [49]:
vectorizer.get_feature_names_out()

array(['0092', '0097', 'act', 'action', 'administration', 'again',
       'against', 'all', 'also', 'always', 'am', 'america', 'american',
       'americans', 'among', 'an', 'and', 'another', 'any', 'are', 'as',
       'at', 'authority', 'be', 'because', 'become', 'been', 'before',
       'being', 'believe', 'best', 'better', 'between', 'beyond', 'both',
       'business', 'but', 'by', 'called', 'can', 'cannot', 'cause',
       'century', 'change', 'character', 'children', 'citizen',
       'citizens', 'civil', 'come', 'commerce', 'common', 'confidence',
       'congress', 'constitution', 'constitutional', 'continue', 'could',
       'country', 'countrymen', 'course', 'day', 'democracy', 'do',
       'done', 'duties', 'duty', 'each', 'earth', 'economy', 'equal',
       'even', 'ever', 'every', 'executive', 'experience', 'faith', 'far',
       'federal', 'fellow', 'find', 'first', 'for', 'force', 'foreign',
       'found', 'free', 'freedom', 'from', 'full', 'future', 'general',
       '

In [50]:
vectorizer.get_feature_names_out()

array(['0092', '0097', 'act', 'action', 'administration', 'again',
       'against', 'all', 'also', 'always', 'am', 'america', 'american',
       'americans', 'among', 'an', 'and', 'another', 'any', 'are', 'as',
       'at', 'authority', 'be', 'because', 'become', 'been', 'before',
       'being', 'believe', 'best', 'better', 'between', 'beyond', 'both',
       'business', 'but', 'by', 'called', 'can', 'cannot', 'cause',
       'century', 'change', 'character', 'children', 'citizen',
       'citizens', 'civil', 'come', 'commerce', 'common', 'confidence',
       'congress', 'constitution', 'constitutional', 'continue', 'could',
       'country', 'countrymen', 'course', 'day', 'democracy', 'do',
       'done', 'duties', 'duty', 'each', 'earth', 'economy', 'equal',
       'even', 'ever', 'every', 'executive', 'experience', 'faith', 'far',
       'federal', 'fellow', 'find', 'first', 'for', 'force', 'foreign',
       'found', 'free', 'freedom', 'from', 'full', 'future', 'general',
       '

In [51]:
vectorized_data = vectorizer.transform(df['clean_text']).toarray()

In [52]:
pd.DataFrame(vectorized_data, columns=vectorizer.get_feature_names_out()).add_prefix('V_').shape

(58, 300)

In [53]:
corpus = ['Hôm qua tôi đi học ở trường',
         'Trường học tôi rất xa nhà',
         'Nhà tôi ở đầu làng',
         'Mẹ tôi đi chợ rất xa']

In [54]:
vectorizer=CountVectorizer(vocabulary=['tôi','mẹ','trường','làng','chợ'])

In [55]:
vectorizer.fit_transform(corpus).toarray()

array([[1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 1, 0, 0, 1]], dtype=int64)

In [56]:
pd.DataFrame(vectorizer.fit_transform(corpus).toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,tôi,mẹ,trường,làng,chợ
0,1,0,1,0,0
1,1,0,1,0,0
2,1,0,0,1,0
3,1,1,0,0,1


### Tf-IDF Vectorizer

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

In [59]:
vectorized_data = vectorizer.fit_transform(df['clean_text']).toarray()

In [60]:
vectorizer.get_feature_names_out()

array(['0092', '0097', 'action', 'administration', 'america', 'american',
       'americans', 'believe', 'best', 'better', 'citizens', 'come',
       'common', 'confidence', 'congress', 'constitution', 'country',
       'day', 'duties', 'duty', 'equal', 'executive', 'faith', 'far',
       'federal', 'fellow', 'force', 'foreign', 'free', 'freedom',
       'future', 'general', 'god', 'good', 'government', 'great', 'high',
       'history', 'home', 'hope', 'human', 'institutions', 'interests',
       'just', 'justice', 'know', 'land', 'law', 'laws', 'let', 'liberty',
       'life', 'long', 'make', 'man', 'means', 'men', 'nation',
       'national', 'nations', 'necessary', 'need', 'new', 'office', 'old',
       'order', 'party', 'peace', 'people', 'place', 'policy',
       'political', 'power', 'powers', 'present', 'president',
       'principles', 'progress', 'prosperity', 'public', 'purpose',
       'right', 'rights', 'service', 'shall', 'spirit', 'state', 'states',
       'strength', 's

In [61]:
vectorized_df = pd.DataFrame(vectorized_data, columns=vectorizer.get_feature_names_out()).add_prefix('V_')

In [84]:
vectorizer_df.iloc[0].sort_values(ascending=False)

V_power       116
V_make         71
V_right        48
V_american     48
V_today        36
             ... 
V_need          0
V_action        0
V_home          0
V_duties        0
V_high          0
Name: 0, Length: 100, dtype: int64

### CountVectorize vs Tf-IDF Vectorizer

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
vectorizer = CountVectorizer(max_features=100, stop_words='english')

In [65]:
vectorized_data = vectorizer.fit_transform(df['clean_text']).toarray()

In [66]:
from sklearn.feature_extraction.text import TfidfTransformer

In [67]:
transfomer = TfidfTransformer()

In [68]:
transfomer.fit_transform(vectorized_data).toarray()

array([[0.        , 0.04746758, 0.        , ..., 0.        , 0.04587701,
        0.05263464],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.02195458, 0.        , ..., 0.        , 0.06365675,
        0.07303331],
       ...,
       [0.59501644, 0.41016363, 0.0260232 , ..., 0.127289  , 0.12064947,
        0.01977443],
       [0.61169903, 0.40333027, 0.02675281, ..., 0.08723855, 0.05315663,
        0.10164424],
       [0.54527906, 0.13727721, 0.05723501, ..., 0.02332978, 0.11372337,
        0.04349157]])

In [70]:
# compare with vectorized_df
vectorized_df.head()

Unnamed: 0,V_0092,V_0097,V_action,V_administration,V_america,V_american,V_americans,V_believe,V_best,V_better,...,V_things,V_time,V_today,V_union,V_united,V_war,V_way,V_work,V_world,V_years
0,0.0,0.047468,0.0,0.133265,0.0,0.105269,0.0,0.0,0.0,0.0,...,0.0,0.045877,0.0,0.135859,0.203364,0.0,0.060687,0.0,0.045877,0.052635
1,0.0,0.0,0.0,0.261016,0.266097,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.199157,0.0,0.0,0.0,0.0,0.0
2,0.0,0.021955,0.0,0.092456,0.157092,0.073033,0.0,0.0,0.026118,0.060473,...,0.032037,0.021219,0.0,0.062837,0.070544,0.024344,0.0,0.0,0.063657,0.073033
3,0.0,0.131111,0.0,0.092023,0.0,0.0,0.0,0.090286,0.11698,0.045143,...,0.047831,0.0,0.0,0.093814,0.0,0.036346,0.0,0.038993,0.095038,0.0
4,0.0,0.028455,0.041523,0.039943,0.0,0.031552,0.0,0.0,0.067701,0.039189,...,0.083046,0.165008,0.0,0.122162,0.030477,0.094657,0.0,0.0,0.055003,0.063105


### N-gram

In [75]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [76]:
vectorized_data = vectorizer.fit_transform(corpus).toarray()

In [77]:
vectorizer.get_feature_names_out()

array(['chợ', 'chợ rất', 'hôm', 'hôm qua', 'học', 'học trường', 'học tôi',
       'làng', 'mẹ', 'mẹ tôi', 'nhà', 'nhà tôi', 'qua', 'qua tôi', 'rất',
       'rất xa', 'trường', 'trường học', 'tôi', 'tôi rất', 'tôi đi',
       'tôi đầu', 'xa', 'xa nhà', 'đi', 'đi chợ', 'đi học', 'đầu',
       'đầu làng'], dtype=object)

In [78]:
vectorized_df = pd.DataFrame(vectorized_data,columns=vectorizer.get_feature_names_out()).add_prefix('V_')

In [79]:
vectorized_df

Unnamed: 0,V_chợ,V_chợ rất,V_hôm,V_hôm qua,V_học,V_học trường,V_học tôi,V_làng,V_mẹ,V_mẹ tôi,...,V_tôi rất,V_tôi đi,V_tôi đầu,V_xa,V_xa nhà,V_đi,V_đi chợ,V_đi học,V_đầu,V_đầu làng
0,0.0,0.0,0.337894,0.337894,0.266399,0.337894,0.0,0.0,0.0,0.0,...,0.0,0.266399,0.0,0.0,0.0,0.266399,0.0,0.337894,0.0,0.0
1,0.0,0.0,0.0,0.0,0.278713,0.0,0.353512,0.0,0.0,0.0,...,0.353512,0.0,0.0,0.278713,0.353512,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.411906,0.0,0.0,...,0.0,0.0,0.411906,0.0,0.0,0.0,0.0,0.0,0.411906,0.411906
3,0.345438,0.345438,0.0,0.0,0.0,0.0,0.0,0.0,0.345438,0.345438,...,0.0,0.272348,0.0,0.272348,0.0,0.272348,0.345438,0.0,0.0,0.0


In [85]:
vectorized_df.iloc[0].sort_values(ascending=False)

V_qua           0.337894
V_hôm           0.337894
V_hôm qua       0.337894
V_học trường    0.337894
V_đi học        0.337894
V_qua tôi       0.337894
V_đi            0.266399
V_học           0.266399
V_trường        0.266399
V_tôi đi        0.266399
V_tôi           0.176327
V_tôi đầu       0.000000
V_chợ           0.000000
V_xa            0.000000
V_xa nhà        0.000000
V_đi chợ        0.000000
V_đầu           0.000000
V_tôi rất       0.000000
V_rất           0.000000
V_trường học    0.000000
V_rất xa        0.000000
V_chợ rất       0.000000
V_nhà tôi       0.000000
V_nhà           0.000000
V_mẹ tôi        0.000000
V_mẹ            0.000000
V_làng          0.000000
V_học tôi       0.000000
V_đầu làng      0.000000
Name: 0, dtype: float64