In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to subsequent matplotlib/seaborn plots
# (seaborn documents `set` as an alias of `set_theme`).
sns.set()

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Inaugural speeches of the presidents of the US

In [2]:
# Load the inaugural-speech corpus from the CSV stored next to the notebook.
speech_df = pd.read_csv(filepath_or_buffer='./inaugural_speeches.csv')

# Report (n_rows, n_columns), then preview the first five records.
print(speech_df.shape)
speech_df.head(5)

(58, 4)


Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t..."
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica..."


## Cleaning up your text

In [3]:
# Replace every character that is not an ASCII letter with a single space,
# then lower-case the result.
#
# regex=True is passed explicitly: pandas 2.0 changed the default of
# Series.str.replace to regex=False, under which '[^a-zA-Z]' would be treated
# as a literal substring and no character would actually be replaced.
speech_df['text_clean'] = (
    speech_df['text']
    .str.replace(r'[^a-zA-Z]', ' ', regex=True)
    .str.lower()
)

speech_df['text_clean'].head()

0    fellow citizens of the senate and of the house...
1    fellow citizens   i am again called upon by th...
2    when it was first perceived  in early times  t...
3    friends and fellow citizens   called upon to u...
4    proceeding  fellow citizens  to that qualifica...
Name: text_clean, dtype: object

## High level text features

In [7]:
# Work from the cleaned text column for all three features.
clean_text = speech_df['text_clean']

# Total number of characters per speech. Note this counts every character of
# the cleaned string, including the spaces between words.
speech_df['char_cnt'] = clean_text.str.len()

# Number of whitespace-separated tokens per speech.
speech_df['word_cnt'] = clean_text.str.split().str.len()

# Characters per word. Because char_cnt includes whitespace, this is the
# average word length *plus* its share of separators, not the pure letter count.
speech_df['avg_word_length'] = speech_df['char_cnt'].div(speech_df['word_cnt'])

# Preview the source text alongside the derived features.
feature_cols = ['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']
speech_df[feature_cols].head()

Unnamed: 0,text_clean,char_cnt,word_cnt,avg_word_length
0,fellow citizens of the senate and of the house...,8616,1432,6.01676
1,fellow citizens i am again called upon by th...,787,135,5.82963
2,when it was first perceived in early times t...,13871,2323,5.971158
3,friends and fellow citizens called upon to u...,10144,1736,5.843318
4,proceeding fellow citizens to that qualifica...,12902,2169,5.948363


## Counting words (I)

In [9]:
# Instantiate a CountVectorizer with its default settings
cv = CountVectorizer()

# Learn the vocabulary from the cleaned speeches
cv.fit(speech_df['text_clean'])

# Print the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so it
# is converted to a plain list to keep the printed form identical to before.
print(cv.get_feature_names_out().tolist())

['abandon', 'abandoned', 'abandonment', 'abate', 'abdicated', 'abeyance', 'abhorring', 'abide', 'abiding', 'abilities', 'ability', 'abject', 'able', 'ably', 'abnormal', 'abode', 'abolish', 'abolished', 'abolishing', 'aboriginal', 'aborigines', 'abound', 'abounding', 'abounds', 'about', 'above', 'abraham', 'abreast', 'abridging', 'abroad', 'absence', 'absent', 'absolute', 'absolutely', 'absolutism', 'absorb', 'absorbed', 'absorbing', 'absorbs', 'abstain', 'abstaining', 'abstract', 'abstractions', 'absurd', 'abundance', 'abundant', 'abundantly', 'abuse', 'abused', 'abuses', 'academies', 'accept', 'acceptance', 'accepted', 'accepting', 'accepts', 'access', 'accessible', 'accession', 'accident', 'accidental', 'accidents', 'acclaim', 'accommodation', 'accommodations', 'accompanied', 'accompany', 'accomplish', 'accomplished', 'accomplishing', 'accomplishment', 'accomplishments', 'accord', 'accordance', 'accorded', 'according', 'accordingly', 'accords', 'account', 'accountability', 'accountab

In [10]:
# Encode every cleaned speech as a row of token counts using the vocabulary
# that `cv` has already been fitted on.
cleaned_speeches = speech_df['text_clean']
cv_transformed = cv.transform(cleaned_speeches)

# The transform result is converted to a dense array so it can be inspected.
cv_array = cv_transformed.toarray()

# (n_speeches, n_vocabulary_terms)
print(cv_array.shape)

(58, 9043)


## Limiting your features

In [11]:
# Restrict the vocabulary by document frequency. Given as floats, scikit-learn
# interprets min_df / max_df as proportions of documents, so terms appearing in
# fewer than 20% or more than 80% of the speeches are dropped.
cv = CountVectorizer(max_df=0.8, min_df=0.2)

# Learn the reduced vocabulary and encode the speeches in one step,
# then densify the result.
cv_transformed = cv.fit_transform(speech_df['text_clean'])
cv_array = cv_transformed.toarray()

# (n_speeches, n_retained_terms)
print(cv_array.shape)

(58, 818)


## Text to DataFrame

In [14]:
# Lift pandas' display limits so wide/long frames are rendered in full.
# None means "no limit" for both options.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [15]:
# Build a DataFrame of the count features, one column per vocabulary term.
# - get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
#   is the supported replacement.
# - The frame reuses speech_df's index so that the column-wise concat below
#   aligns rows correctly even if speech_df does not have a default 0..n-1 index.
cv_df = pd.DataFrame(cv_array,
                     index=speech_df.index,
                     columns=cv.get_feature_names_out()).add_prefix('Counts_')

# Append the count columns to the original frame (rows are matched on index).
speech_df_new = pd.concat([speech_df, cv_df],
                          axis=1,
                          sort=False)
speech_df_new.head()

Unnamed: 0,Name,Inaugural Address,Date,text,text_clean,char_cnt,word_cnt,avg_word_length,Counts_abiding,Counts_ability,Counts_able,Counts_about,Counts_above,Counts_abroad,Counts_accept,Counts_accomplished,Counts_achieve,Counts_across,Counts_act,Counts_action,Counts_acts,Counts_add,Counts_adequate,Counts_administration,Counts_adopted,Counts_advance,Counts_advantage,Counts_affairs,Counts_afford,Counts_after,Counts_again,Counts_against,Counts_age,Counts_ago,Counts_agriculture,Counts_aid,Counts_alike,Counts_almighty,Counts_almost,Counts_alone,Counts_along,Counts_already,Counts_also,Counts_always,Counts_am,Counts_america,Counts_american,Counts_americans,Counts_among,Counts_ancient,Counts_another,Counts_appear,Counts_armed,Counts_arms,Counts_around,Counts_ask,Counts_assume,Counts_attempt,Counts_attention,Counts_authority,Counts_avoid,Counts_away,Counts_back,Counts_balance,Counts_basis,Counts_bear,Counts_because,Counts_become,Counts_beginning,Counts_being,Counts_belief,Counts_believe,Counts_believed,Counts_belongs,Counts_benefit,Counts_benefits,Counts_best,Counts_better,Counts_between,Counts_beyond,Counts_birth,Counts_bless,Counts_blessings,Counts_blood,Counts_body,Counts_bonds,Counts_born,Counts_both,Counts_bound,Counts_branches,Counts_bring,Counts_brought,Counts_build,Counts_burden,Counts_burdens,Counts_business,Counts_call,Counts_called,Counts_came,Counts_cannot,Counts_capacity,Counts_capital,Counts_care,Counts_carry,Counts_cause,Counts_centuries,Counts_century,Counts_certain,Counts_challenge,Counts_chance,Counts_change,Counts_changes,Counts_character,Counts_cherish,Counts_chief,Counts_children,Counts_choice,Counts_chosen,Counts_circumstances,Counts_citizen,Counts_citizenship,Counts_civil,Counts_civilization,Counts_claim,Counts_class,Counts_clear,Counts_clearly,Counts_close,Counts_come,Counts_command,Counts_commerce,Counts_committed,Counts_common,Counts_communities,Counts_community,Counts_complete,Counts_concern,Counts_concerns,Counts_condition,Counts_conditions,Counts_
conduct,Counts_confidence,Counts_conflict,Counts_congress,Counts_conscience,Counts_consequences,Counts_consider,Counts_consideration,Counts_considered,Counts_constant,Counts_constantly,Counts_constitution,Counts_constitutional,Counts_continent,Counts_continue,Counts_continued,Counts_control,Counts_conviction,Counts_cooperation,Counts_cost,Counts_could,Counts_countries,Counts_countrymen,Counts_courage,Counts_course,Counts_create,Counts_created,Counts_credit,Counts_crisis,Counts_danger,Counts_dangers,Counts_day,Counts_days,Counts_debt,Counts_decision,Counts_decisions,Counts_declaration,Counts_declare,Counts_declared,Counts_deep,Counts_deeply,Counts_defend,Counts_defense,Counts_degree,Counts_demand,Counts_demands,Counts_democracy,Counts_departments,Counts_depend,Counts_desire,Counts_destiny,Counts_destroy,Counts_destruction,Counts_determination,Counts_determined,Counts_development,Counts_devotion,Counts_did,Counts_difference,Counts_differences,Counts_different,Counts_difficult,Counts_difficulties,Counts_dignity,Counts_direct,Counts_direction,Counts_discharge,Counts_disposition,Counts_distant,Counts_distinguished,Counts_divine,Counts_does,Counts_doing,Counts_domestic,Counts_done,Counts_doubt,Counts_down,Counts_due,Counts_during,Counts_duties,Counts_duty,Counts_each,Counts_earth,Counts_economic,Counts_economy,Counts_education,Counts_effect,Counts_effective,Counts_effort,Counts_efforts,Counts_either,Counts_election,Counts_employed,Counts_encourage,Counts_encouragement,Counts_end,Counts_endeavor,Counts_enemies,Counts_energy,Counts_engaged,Counts_enjoy,Counts_enjoyed,Counts_enlightened,Counts_enough,Counts_enter,Counts_enterprise,Counts_equal,Counts_equality,Counts_equally,Counts_especially,Counts_essential,Counts_establish,Counts_established,Counts_establishment,Counts_even,Counts_events,Counts_ever,Counts_everywhere,Counts_evil,Counts_example,Counts_except,Counts_execute,Counts_executive,Counts_exercise,Counts_existence,Counts_expect,Counts_expected,Counts_expenditures,Co
unts_experience,Counts_experiment,Counts_express,Counts_expression,Counts_extend,Counts_extended,Counts_extent,Counts_eyes,Counts_face,Counts_fact,Counts_fail,Counts_failure,Counts_fair,Counts_faith,Counts_faithful,Counts_family,Counts_far,Counts_fathers,Counts_favor,Counts_fear,Counts_federal,Counts_feel,Counts_felt,Counts_few,Counts_finally,Counts_find,Counts_firm,Counts_first,Counts_fixed,Counts_follow,Counts_followed,Counts_force,Counts_forces,Counts_foreign,Counts_forever,Counts_form,Counts_formed,Counts_forms,Counts_forth,Counts_forward,Counts_found,Counts_foundations,Counts_founded,Counts_four,Counts_freedom,Counts_friends,Counts_friendship,Counts_full,Counts_fully,Counts_functions,Counts_further,Counts_general,Counts_generation,Counts_generations,Counts_generous,Counts_genius,Counts_give,Counts_given,Counts_go,Counts_god,Counts_govern,Counts_governments,Counts_granted,Counts_grateful,Counts_gratitude,Counts_greater,Counts_greatest,Counts_greatly,Counts_growth,Counts_guidance,Counts_had,Counts_half,Counts_hand,Counts_hands,Counts_happiness,Counts_happy,Counts_hard,Counts_harmony,Counts_having,Counts_he,Counts_health,Counts_heart,Counts_hearts,Counts_held,Counts_help,Counts_her,Counts_here,Counts_high,Counts_highest,Counts_him,Counts_himself,Counts_history,Counts_hold,Counts_home,Counts_honest,Counts_honor,Counts_honorable,Counts_hope,Counts_hopes,Counts_how,Counts_however,Counts_human,Counts_humanity,Counts_humble,Counts_ideals,Counts_immediate,Counts_importance,Counts_important,Counts_impossible,Counts_improvement,Counts_increase,Counts_increasing,Counts_indeed,Counts_independence,Counts_independent,Counts_individual,Counts_individuals,Counts_industrial,Counts_industries,Counts_industry,Counts_influence,Counts_injustice,Counts_instead,Counts_institutions,Counts_instrument,Counts_integrity,Counts_intelligence,Counts_intercourse,Counts_interest,Counts_interests,Counts_internal,Counts_international,Counts_into,Counts_itself,Counts_join,Counts_judgment,Counts_ju
stice,Counts_justly,Counts_keep,Counts_know,Counts_knowledge,Counts_known,Counts_labor,Counts_land,Counts_lands,Counts_large,Counts_last,Counts_lasting,Counts_law,Counts_laws,Counts_lead,Counts_leading,Counts_least,Counts_leave,Counts_led,Counts_left,Counts_legislation,Counts_legislative,Counts_legitimate,Counts_less,Counts_let,Counts_liberty,Counts_lies,Counts_light,Counts_like,Counts_limitations,Counts_limited,Counts_limits,Counts_lines,Counts_little,Counts_live,Counts_lives,Counts_living,Counts_local,Counts_long,Counts_longer,Counts_look,Counts_lost,Counts_love,Counts_made,Counts_maintain,Counts_maintained,Counts_maintenance,Counts_majority,Counts_make,Counts_makes,Counts_making,Counts_man,Counts_mankind,Counts_manner,Counts_many,Counts_material,Counts_matters,Counts_me,Counts_means,Counts_measure,Counts_measures,Counts_meet,Counts_members,Counts_mere,Counts_merely,Counts_might,Counts_military,Counts_millions,Counts_mind,Counts_minds,Counts_modern,Counts_moment,Counts_money,Counts_moral,Counts_move,Counts_mr,Counts_much,Counts_mutual,Counts_myself,Counts_national,Counts_natural,Counts_nature,Counts_navy,Counts_nearly,Counts_necessary,Counts_necessity,Counts_need,Counts_needed,Counts_needs,Counts_neighbors,Counts_neither,Counts_never,Counts_none,Counts_nor,Counts_north,Counts_nothing,Counts_number,Counts_oath,Counts_object,Counts_objects,Counts_obligation,Counts_obligations,Counts_occasion,Counts_ocean,Counts_off,Counts_offer,Counts_office,Counts_officers,Counts_official,Counts_often,Counts_old,Counts_once,Counts_open,Counts_operation,Counts_opinion,Counts_opinions,Counts_opportunity,Counts_order,Counts_organization,Counts_others,Counts_ought,Counts_ours,Counts_ourselves,Counts_out,Counts_over,Counts_part,Counts_parties,Counts_parts,Counts_party,Counts_pass,Counts_passed,Counts_passion,Counts_past,Counts_path,Counts_patriotic,Counts_patriotism,Counts_pay,Counts_peace,Counts_peaceful,Counts_peculiar,Counts_peoples,Counts_perfect,Counts_perform,Counts_perhaps,Counts
_period,Counts_permanent,Counts_person,Counts_personal,Counts_place,Counts_placed,Counts_places,Counts_pledge,Counts_pledged,Counts_point,Counts_policy,Counts_political,Counts_popular,Counts_population,Counts_portion,Counts_position,Counts_possible,Counts_poverty,Counts_powerful,Counts_powers,Counts_practical,Counts_practice,Counts_precious,Counts_prejudice,Counts_presence,Counts_present,Counts_preservation,Counts_preserve,Counts_preserved,Counts_president,Counts_prevent,Counts_price,Counts_pride,Counts_principle,Counts_principles,Counts_private,Counts_problems,Counts_produce,Counts_productive,Counts_products,Counts_progress,Counts_promise,Counts_promote,Counts_proper,Counts_property,Counts_prosperity,Counts_prosperous,Counts_protect,Counts_protected,Counts_protection,Counts_proud,Counts_proved,Counts_provide,Counts_provided,Counts_providence,Counts_provision,Counts_public,Counts_purpose,Counts_purposes,Counts_pursue,Counts_pursued,Counts_put,Counts_question,Counts_questions,Counts_race,Counts_rather,Counts_reach,Counts_ready,Counts_real,Counts_realize,Counts_reason,Counts_reasonable,Counts_receive,Counts_recent,Counts_recognize,Counts_reform,Counts_regard,Counts_relations,Counts_religion,Counts_religious,Counts_remain,Counts_remains,Counts_remember,Counts_renew,Counts_renewed,Counts_representatives,Counts_republic,Counts_republican,Counts_require,Counts_required,Counts_requires,Counts_resolve,Counts_resources,Counts_respect,Counts_responsibilities,Counts_responsibility,Counts_responsible,Counts_rest,Counts_restore,Counts_rests,Counts_result,Counts_results,Counts_return,Counts_revenue,Counts_revolution,Counts_rewards,Counts_rich,Counts_right,Counts_rule,Counts_rules,Counts_sacred,Counts_sacrifice,Counts_safe,Counts_safety,Counts_said,Counts_same,Counts_satisfaction,Counts_say,Counts_schools,Counts_science,Counts_second,Counts_sections,Counts_secure,Counts_secured,Counts_security,Counts_see,Counts_seek,Counts_seeking,Counts_seem,Counts_seen,Counts_self,Counts_sense,C
ounts_sentiment,Counts_serious,Counts_serve,Counts_service,Counts_set,Counts_settled,Counts_several,Counts_share,Counts_short,Counts_should,Counts_show,Counts_shown,Counts_side,Counts_simple,Counts_since,Counts_single,Counts_small,Counts_social,Counts_society,Counts_soil,Counts_solemn,Counts_some,Counts_something,Counts_sometimes,Counts_soon,Counts_sound,Counts_source,Counts_south,Counts_sovereignty,Counts_speak,Counts_special,Counts_spirit,Counts_stand,Counts_standing,Counts_state,Counts_states,Counts_station,Counts_still,Counts_strength,Counts_strengthen,Counts_strengthened,Counts_strict,Counts_strong,Counts_stronger,Counts_struggle,Counts_subject,Counts_subjects,Counts_success,Counts_such,Counts_suffer,Counts_suffering,Counts_sufficient,Counts_support,Counts_supreme,Counts_sure,Counts_surely,Counts_sustained,Counts_system,Counts_take,Counts_taken,Counts_taking,Counts_task,Counts_taxes,Counts_terms,Counts_territory,Counts_themselves,Counts_then,Counts_therefore,Counts_things,Counts_think,Counts_though,Counts_thought,Counts_through,Counts_throughout,Counts_thus,Counts_times,Counts_today,Counts_together,Counts_too,Counts_toward,Counts_trade,Counts_treaty,Counts_true,Counts_trust,Counts_truth,Counts_turn,Counts_two,Counts_under,Counts_understand,Counts_understanding,Counts_union,Counts_unity,Counts_until,Counts_up,Counts_use,Counts_useful,Counts_value,Counts_values,Counts_various,Counts_vast,Counts_very,Counts_vice,Counts_view,Counts_views,Counts_virtue,Counts_vision,Counts_vital,Counts_voice,Counts_want,Counts_war,Counts_wars,Counts_washington,Counts_waste,Counts_way,Counts_ways,Counts_weak,Counts_wealth,Counts_weight,Counts_welfare,Counts_were,Counts_what,Counts_whatever,Counts_where,Counts_wherever,Counts_whether,Counts_while,Counts_whole,Counts_whom,Counts_whose,Counts_willing,Counts_wisdom,Counts_wise,Counts_wisely,Counts_wish,Counts_within,Counts_without,Counts_women,Counts_words,Counts_work,Counts_wrong,Counts_year,Counts_years,Counts_yet,Counts_you,Counts_you
ng,Counts_your
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House...,fellow citizens of the senate and of the house...,8616,1432,6.01676,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,2,0,0,1,1,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,2,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,0,3,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,2,0,0,1,0,0,0,0,0,0,0,1,0,1,0,2,0,0,0,1,0,0,0,2,1,0,0,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,3,0,0,0,1,0,0,0,1,0,0,2,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1,1,4,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,2,0,0,0,0,0,0,0,0,1,2,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,0,1,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,2,0,0,0,1,0,1,1,1,0,0,0,0,1,0,3,1,3,0,0,1,1,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,1,2,0,2,0,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,2,0,1,0,1,0,0,0,0,1,0,0,3,1,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,8,1,0,3,1,0,0,0,2,0,0,1,0,0,0,0,0,0,0,1,0,4,1,0,3,0,0,2,0,0,0,0,0,0,2,1,2,0,0,0,0,0,2,0,0,1,0,0,0,0,0,2,0,0,1,0,0,0,0,0,2,0,0,4,0,0,1,2,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,1,1,0,0,0,0,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,5,1,0,0,1,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,6,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,2,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2,0,1,0,0,1,0,4,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,0,0,0,0,0,0,1,2,0,2,3,0,0,1,0,0,0,0,0,2,1,0,0,1,0,0,0,2,1,1,0,0,0,1,0,0,1,0,0,1,3,0,0,0,0,2,1,0,0,5,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,2,0,0,0,0,0,1,0,5,0,9
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by th...,fellow citizens i am again called upon by th...,787,135,5.82963,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, t...",when it was first perceived in early times t...,13871,2323,5.971158,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,3,1,0,1,0,0,3,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,5,3,0,4,1,0,1,0,0,0,0,0,0,2,3,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,2,2,0,0,0,1,0,1,0,0,3,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0,0,1,0,1,0,0,1,0,0,0,4,0,0,1,0,0,0,0,0,0,0,0,0,0,3,1,2,0,0,0,0,0,1,0,0,1,0,6,0,1,2,2,1,1,0,8,0,0,1,0,0,1,0,0,1,0,0,0,4,0,0,1,1,2,0,1,0,0,0,0,0,0,0,0,1,0,2,2,1,0,0,0,1,1,0,0,1,2,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,2,1,2,0,0,0,0,0,1,0,1,0,0,1,1,1,0,0,2,2,1,0,1,0,1,0,0,0,2,0,0,0,1,2,1,0,0,0,6,0,0,2,0,1,2,0,1,0,1,0,2,1,1,0,0,0,0,0,0,0,0,0,1,3,0,0,1,0,0,0,0,2,1,0,0,0,0,3,1,0,0,0,0,8,0,3,2,2,0,0,0,0,1,0,0,1,2,1,0,0,1,4,0,0,0,1,1,0,1,0,2,2,0,0,2,0,0,0,0,0,7,1,0,0,3,2,0,0,0,2,0,0,2,0,0,0,0,1,1,0,1,1,0,0,1,7,0,1,0,0,1,2,2,1,0,0,0,0,0,0,0,2,1,2,1,1,1,0,0,0,2,0,0,1,0,3,1,0,4,0,1,0,2,1,0,0,6,0,0,0,4,0,0,0,1,1,0,0,0,2,0,0,1,0,0,0,0,0,1,1,0,3,0,0,1,0,0,0,0,3,1,1,0,0,3,0,0,0,5,0,2,0,0,1,1,0,0,0,1,0,1,0,0,5,1,0,2,0,0,1,0,1,0,0,6,0,0,0,0,1,0,0,1,0,2,4,1,0,0,0,3,1,0,0,0,1,0,1,0,2,0,0,1,0,1,1,0,2,1,1,0,0,0,0,0,2,0,0,1,1,1,1,0,4,0,0,0,1,2,0,4,0,1,1,4,0,0,0,0,0,0,1,0,5,0,0,0,1,0,0,0,1,0,2,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1,2,1,1,1,0,0,0,1,1,2,2,0,1,0,0,0,0,1,0,0,2,0,0,1,0,0,0,1,0,2,0,6,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,0,1,0,0,0,0,2,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,3,2,1,1,1,0,0,1,1,0,1,0,0,0,1,0,2,1,2,0,2,0,0,0,0,0,4,0,0,0,0,1,2,0,0,2,0,1,4,1,0,1,0,0,0,0,1,0,6,0,0,3,9,1,1,0,0,0,0,0,0,0,1,0,2,6,0,0,1,4,1,0,0,0,3,0,0,0,0,0,0,0,2,3,0,1,0,0,1,1,2,0,2,0,0,0,3,0,0,0,0,1,0,1,6,0,0,2,0,2,1,0,0,1,0,1,0,1,0,0,0,2,0,0,1,0,1,0,0,0,0,0,0,1,0,1,5,3,2,1,0,0,1,3,0,0,0,0,1,0,1,0,3,0,0,0,0,2,3,0,0,0,1
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CALLED upon to u...,friends and fellow citizens called upon to u...,10144,1736,5.843318,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,0,2,0,1,0,0,0,2,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,2,0,0,0,1,0,0,0,0,0,2,0,1,0,1,0,2,0,0,0,0,3,1,0,1,1,0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,1,2,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,3,1,2,0,0,0,1,1,0,0,0,4,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,1,0,2,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,3,2,0,0,1,0,1,0,1,0,0,1,0,1,0,0,2,0,0,0,1,1,0,0,1,3,1,0,4,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,1,1,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,2,1,0,1,0,2,2,1,1,1,0,0,1,1,4,0,0,0,1,0,0,0,2,0,2,0,0,1,0,0,0,4,1,1,1,0,1,0,2,1,0,0,0,2,0,1,0,1,1,0,1,1,1,2,0,0,1,1,0,0,0,4,1,0,1,1,1,0,1,0,0,0,0,3,3,0,4,1,2,0,1,4,2,0,1,1,1,0,0,0,1,0,1,0,0,0,1,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,0,1,2,0,1,1,1,0,1,0,0,2,2,0,0,0,0,4,0,1,0,0,1,0,1,1,0,0,1,7,3,0,0,0,1,0,0,0,1,0,0,0,0,2,0,2,1,2,0,0,0,0,2,2,0,0,6,1,0,1,0,0,4,0,0,1,1,0,0,0,1,1,0,2,0,0,0,0,0,0,0,2,0,2,0,0,1,0,0,2,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,2,0,0,0,0,4,1,0,1,0,5,1,0,0,0,2,0,0,1,0,0,1,0,1,0,0,0,0,5,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,3,0,0,1,0,0,0,0,1,0,0,0,0,2,0,2,1,0,0,0,0,0,6,3,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,1,1,0,4,0,0,1,0,0,1,0,0,0,2,1,0,0,2,1,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,2,0,1,4,0,2,2,0,1,5,1,1,0,0,0,0,0,0,0,0,0,2,0,1,0,2,0,2,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,1,0,0,0,2,0,2,0,0,0,0,0,1,0,0,1,0,2,0,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,1,0,0,0,1,6,0,1,2,1,1,4,0,0,0,0,0,2,1,0,0,0,2,1,0,0,4,1,0,2,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,2,1,1,1,0,0,0,0,0,0,0,0,0,0,5,1,3,0,0,0,2,1,2,0,2,1,0,1,1,2,0,0,1,2,0,0,2,7,0,7
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualifica...",proceeding fellow citizens to that qualifica...,12902,2169,5.948363,0,0,1,0,0,0,0,0,0,0,3,1,0,1,0,1,0,1,0,3,1,0,2,6,0,0,1,0,0,0,0,2,0,1,1,0,1,0,1,0,7,0,1,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,2,1,0,3,0,0,1,0,1,0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,1,0,0,6,0,0,0,0,1,1,0,0,2,1,0,0,3,0,0,0,1,1,0,1,0,0,1,0,1,0,1,1,1,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,2,3,1,2,4,1,4,4,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,2,1,0,1,0,0,0,0,0,1,2,0,0,0,0,0,1,2,1,0,0,0,0,3,1,0,2,0,1,0,0,1,0,0,1,0,1,1,0,1,2,2,0,1,0,0,0,2,0,2,0,1,0,0,0,4,0,0,0,0,0,0,2,1,0,0,2,1,2,2,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,1,2,0,0,3,0,3,0,0,0,0,0,0,1,0,0,0,0,2,0,1,0,0,1,0,0,0,0,2,1,0,0,0,0,2,0,2,0,1,1,1,1,0,2,0,0,2,2,1,1,0,0,0,0,2,4,4,1,0,1,3,1,2,4,0,1,2,1,0,0,2,0,2,0,0,3,3,1,0,0,0,1,3,0,1,0,2,2,3,0,0,0,0,0,5,0,0,1,0,0,1,0,0,0,0,1,3,2,2,0,0,0,0,0,2,0,0,0,0,1,8,0,0,4,1,0,0,0,2,0,0,2,2,0,0,0,2,0,0,0,1,2,0,0,2,0,0,0,1,3,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,3,0,0,1,0,2,0,0,1,2,0,0,3,2,0,3,0,4,0,0,2,0,2,0,1,0,0,0,0,1,1,1,0,1,1,4,0,0,0,0,0,0,0,0,0,0,2,2,1,0,0,1,0,1,0,2,0,0,0,0,1,1,0,0,0,0,0,3,0,0,1,0,0,0,2,1,3,0,0,3,0,0,1,0,0,0,2,0,0,0,0,1,0,0,0,2,1,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,1,1,3,1,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,1,2,3,0,0,0,1,1,0,0,0,0,2,1,2,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,2,1,0,0,1,0,1,2,0,2,6,0,0,0,0,1,0,0,0,1,1,0,2,1,2,0,1,1,0,0,0,0,2,0,0,8,4,1,1,2,1,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0,0,0,0,0,0,1,0,2,1,1,2,1,4,2,2,0,0,2,0,1,0,0,0,4,0,0,0,1,0,4,0,0,2,0,2,3,0,0,1,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,3,1,0,0,0,0,0,0,1,0,0,4,0,1,0,3,0,1,2,3,0,2,0,0,1,4,2,0,0,0,0,2,2,2,4,0,4


## Tf-idf

In [18]:
# Instantiate a TF-IDF vectorizer limited to 100 features, using
# scikit-learn's built-in English stop-word list.
tv = TfidfVectorizer(max_features=100,
                     stop_words='english')

# Learn the vocabulary / idf weights and encode the speeches in one step
tv_transformed = tv.fit_transform(speech_df['text_clean'])

# Create a DataFrame with these features.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
# NOTE(review): the prefix is spelled 'TDIDF_' (sic) while the test-set cell
# further down uses 'TFIDF_'. It is kept unchanged here so the column names
# match the stored output; consider renaming and re-running the notebook.
tv_df = pd.DataFrame(tv_transformed.toarray(),
                     columns=tv.get_feature_names_out()).add_prefix('TDIDF_')
tv_df.head()

Unnamed: 0,TDIDF_action,TDIDF_administration,TDIDF_america,TDIDF_american,TDIDF_americans,TDIDF_believe,TDIDF_best,TDIDF_better,TDIDF_change,TDIDF_citizens,TDIDF_come,TDIDF_common,TDIDF_confidence,TDIDF_congress,TDIDF_constitution,TDIDF_country,TDIDF_day,TDIDF_duties,TDIDF_duty,TDIDF_equal,TDIDF_executive,TDIDF_faith,TDIDF_far,TDIDF_federal,TDIDF_fellow,TDIDF_force,TDIDF_foreign,TDIDF_free,TDIDF_freedom,TDIDF_future,TDIDF_general,TDIDF_god,TDIDF_good,TDIDF_government,TDIDF_great,TDIDF_high,TDIDF_history,TDIDF_home,TDIDF_hope,TDIDF_human,TDIDF_institutions,TDIDF_interests,TDIDF_just,TDIDF_justice,TDIDF_know,TDIDF_land,TDIDF_law,TDIDF_laws,TDIDF_let,TDIDF_liberty,TDIDF_life,TDIDF_long,TDIDF_make,TDIDF_man,TDIDF_means,TDIDF_men,TDIDF_nation,TDIDF_national,TDIDF_nations,TDIDF_necessary,TDIDF_need,TDIDF_new,TDIDF_office,TDIDF_old,TDIDF_order,TDIDF_party,TDIDF_peace,TDIDF_people,TDIDF_place,TDIDF_policy,TDIDF_political,TDIDF_power,TDIDF_powers,TDIDF_present,TDIDF_president,TDIDF_principles,TDIDF_progress,TDIDF_prosperity,TDIDF_public,TDIDF_purpose,TDIDF_right,TDIDF_rights,TDIDF_secure,TDIDF_service,TDIDF_shall,TDIDF_spirit,TDIDF_state,TDIDF_states,TDIDF_strength,TDIDF_support,TDIDF_things,TDIDF_time,TDIDF_today,TDIDF_union,TDIDF_united,TDIDF_war,TDIDF_way,TDIDF_work,TDIDF_world,TDIDF_years
0,0.0,0.133415,0.0,0.105388,0.0,0.0,0.0,0.0,0.0,0.229644,0.0,0.0,0.111079,0.0,0.060755,0.229644,0.115098,0.064225,0.238637,0.063036,0.14728,0.0,0.178978,0.0,0.147528,0.0,0.0,0.098352,0.0,0.101797,0.0,0.0,0.147528,0.36743,0.133183,0.0,0.0,0.0,0.051787,0.126073,0.0,0.063036,0.098352,0.0,0.0,0.0,0.0,0.0,0.0,0.05554,0.050028,0.0,0.0,0.0,0.064225,0.050898,0.091858,0.052694,0.047521,0.14728,0.0,0.049176,0.0,0.0,0.141458,0.070729,0.0,0.17459,0.056532,0.138691,0.0,0.050898,0.065448,0.315182,0.06188,0.063036,0.0,0.064225,0.333237,0.0,0.05554,0.050898,0.0,0.063036,0.145021,0.0,0.0,0.103573,0.0,0.0,0.0,0.045929,0.0,0.136012,0.203593,0.0,0.060755,0.0,0.045929,0.052694
1,0.0,0.261016,0.266097,0.0,0.0,0.0,0.0,0.0,0.0,0.179712,0.0,0.0,0.217318,0.0,0.237725,0.179712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179712,0.0,0.233437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242128,0.0,0.0,0.0,0.0,0.170786,0.0,0.0,0.0,0.0,0.0,0.246652,0.242128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.567446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199157,0.0,0.0,0.0,0.0,0.0
2,0.0,0.092436,0.157058,0.073018,0.0,0.0,0.026112,0.06046,0.0,0.106072,0.0,0.056125,0.025654,0.196017,0.224501,0.212143,0.026582,0.029665,0.055113,0.058233,0.068028,0.082669,0.027556,0.0,0.068143,0.0,0.246496,0.045428,0.0,0.02351,0.133321,0.0,0.136285,0.339429,0.102528,0.027556,0.029116,0.0,0.02392,0.058233,0.034014,0.0,0.022714,0.14352,0.0,0.0,0.0,0.061624,0.0,0.076961,0.046216,0.073018,0.024339,0.0,0.029665,0.094039,0.190929,0.097357,0.241448,0.102042,0.0,0.022714,0.0,0.0,0.130678,0.130678,0.121696,0.403213,0.026112,0.0,0.027556,0.117549,0.03023,0.058233,0.0,0.058233,0.0,0.059331,0.153921,0.0,0.025654,0.02351,0.03023,0.058233,0.089313,0.153921,0.090691,0.21528,0.0,0.116465,0.03203,0.021214,0.0,0.062823,0.070529,0.024339,0.0,0.0,0.063643,0.073018
3,0.0,0.092693,0.0,0.0,0.0,0.090942,0.117831,0.045471,0.053335,0.223369,0.0,0.084421,0.154348,0.0,0.084421,0.127639,0.039983,0.089243,0.0,0.175183,0.051163,0.082899,0.041449,0.059596,0.239161,0.048179,0.0,0.102498,0.17197,0.035363,0.100268,0.0,0.170829,0.382918,0.030844,0.124348,0.087591,0.045471,0.03598,0.0,0.0,0.0,0.0,0.03598,0.044621,0.085985,0.162828,0.0,0.295475,0.115761,0.034758,0.07322,0.07322,0.262774,0.0,0.106088,0.06382,0.0,0.066032,0.102325,0.0,0.0,0.042992,0.0,0.04914,0.0,0.183051,0.06065,0.039277,0.0,0.124348,0.14145,0.045471,0.0,0.0,0.131387,0.0,0.044621,0.154348,0.0,0.154348,0.070725,0.0,0.0,0.201512,0.0,0.090942,0.0,0.0,0.131387,0.048179,0.0,0.0,0.094497,0.0,0.03661,0.0,0.039277,0.095729,0.0
4,0.041334,0.039761,0.0,0.031408,0.0,0.0,0.067393,0.039011,0.091514,0.27376,0.0,0.0,0.033105,0.0,0.21728,0.109504,0.034302,0.153126,0.14224,0.075146,0.043893,0.0,0.0,0.0,0.234492,0.0,0.159045,0.029311,0.073768,0.060676,0.043011,0.0,0.087934,0.082128,0.026461,0.0,0.075146,0.039011,0.0,0.075146,0.0,0.150292,0.087934,0.12347,0.076563,0.073768,0.10477,0.119284,0.072427,0.099314,0.059639,0.0,0.0,0.075146,0.0,0.030338,0.054752,0.0,0.113301,0.0,0.10864,0.029311,0.0,0.040535,0.126475,0.0,0.125634,0.0,0.067393,0.0,0.03556,0.091014,0.039011,0.112719,0.0,0.112719,0.036884,0.0,0.463464,0.0,0.033105,0.091014,0.039011,0.037573,0.201694,0.066209,0.312084,0.12347,0.078021,0.075146,0.082667,0.164256,0.0,0.121605,0.030338,0.094225,0.0,0.0,0.054752,0.062817


## Inspecting tf-idf values

In [19]:
# isolate first row to examine
sample_row = tv_df.iloc[0, :]

# top 5 words
sample_row.sort_values(ascending=False).head()

TDIDF_government    0.367430
TDIDF_public        0.333237
TDIDF_present       0.315182
TDIDF_duty          0.238637
TDIDF_citizens      0.229644
Name: 0, dtype: float64

## Transforming unseen data

In [21]:
# Positional hold-out split: the first 45 speeches are used for fitting,
# the remaining ones are treated as unseen data.
train_speech_df = speech_df.iloc[:45]
test_speech_df = speech_df.iloc[45:]

In [24]:
# Instantiate a fresh TF-IDF vectorizer (100 features, English stop words)
tv = TfidfVectorizer(max_features=100,
                     stop_words='english')

# Fit on the training speeches only, so the vocabulary and idf weights are
# learned without looking at the held-out speeches.
tv_transformed = tv.fit_transform(train_speech_df['text_clean'])

# Encode the test speeches with the already-fitted vectorizer (no refit)
test_tv_transformed = tv.transform(test_speech_df['text_clean'])

# Create the feature DataFrame for the test set.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement.
test_tv_df = pd.DataFrame(test_tv_transformed.toarray(),
                          columns=tv.get_feature_names_out()).add_prefix('TFIDF_')
test_tv_df.head()

Unnamed: 0,TFIDF_action,TFIDF_administration,TFIDF_america,TFIDF_american,TFIDF_authority,TFIDF_best,TFIDF_business,TFIDF_citizens,TFIDF_commerce,TFIDF_common,TFIDF_confidence,TFIDF_congress,TFIDF_constitution,TFIDF_constitutional,TFIDF_country,TFIDF_day,TFIDF_duties,TFIDF_duty,TFIDF_equal,TFIDF_executive,TFIDF_faith,TFIDF_far,TFIDF_federal,TFIDF_fellow,TFIDF_force,TFIDF_foreign,TFIDF_free,TFIDF_freedom,TFIDF_future,TFIDF_general,TFIDF_given,TFIDF_god,TFIDF_good,TFIDF_government,TFIDF_great,TFIDF_high,TFIDF_hope,TFIDF_human,TFIDF_important,TFIDF_institutions,TFIDF_interests,TFIDF_just,TFIDF_justice,TFIDF_know,TFIDF_law,TFIDF_laws,TFIDF_let,TFIDF_liberty,TFIDF_life,TFIDF_long,TFIDF_make,TFIDF_man,TFIDF_means,TFIDF_men,TFIDF_nation,TFIDF_national,TFIDF_nations,TFIDF_necessary,TFIDF_new,TFIDF_office,TFIDF_order,TFIDF_ought,TFIDF_party,TFIDF_peace,TFIDF_people,TFIDF_policy,TFIDF_political,TFIDF_power,TFIDF_powers,TFIDF_present,TFIDF_principle,TFIDF_principles,TFIDF_progress,TFIDF_proper,TFIDF_prosperity,TFIDF_protection,TFIDF_public,TFIDF_purpose,TFIDF_question,TFIDF_republic,TFIDF_revenue,TFIDF_right,TFIDF_rights,TFIDF_secure,TFIDF_self,TFIDF_service,TFIDF_shall,TFIDF_spirit,TFIDF_state,TFIDF_states,TFIDF_subject,TFIDF_support,TFIDF_time,TFIDF_union,TFIDF_united,TFIDF_war,TFIDF_way,TFIDF_work,TFIDF_world,TFIDF_years
0,0.0,0.02954,0.233954,0.082703,0.0,0.0,0.0,0.022577,0.0,0.0,0.02635,0.0,0.02695,0.0,0.022577,0.02954,0.0,0.0,0.065003,0.0,0.03172,0.056409,0.0,0.049296,0.0,0.0,0.049296,0.066626,0.02635,0.0,0.030968,0.195008,0.024111,0.115378,0.11045,0.055135,0.07905,0.0,0.0,0.0,0.0,0.0252,0.025767,0.341578,0.02954,0.05908,0.348924,0.0,0.128835,0.055135,0.242549,0.309678,0.03172,0.024111,0.115378,0.0252,0.07076,0.0,0.2016,0.028861,0.034158,0.0,0.0,0.3162,0.3026,0.0,0.0,0.025767,0.0,0.0,0.0,0.0,0.030968,0.0,0.0,0.0,0.0,0.02954,0.0,0.0,0.0,0.0,0.0,0.030242,0.0,0.0,0.086457,0.165406,0.0,0.024648,0.0,0.0,0.115378,0.0,0.024648,0.07905,0.033313,0.0,0.299983,0.134749
1,0.0,0.0,0.547457,0.036862,0.0,0.036036,0.0,0.015094,0.0,0.0,0.017617,0.0,0.0,0.0,0.045283,0.01975,0.0,0.0,0.02173,0.0,0.08483,0.037714,0.0,0.016479,0.043459,0.0,0.0,0.089089,0.052851,0.0,0.020704,0.086919,0.01612,0.154278,0.13292,0.018431,0.035234,0.040438,0.043459,0.0,0.0,0.101089,0.017227,0.0,0.01975,0.0,0.466565,0.0,0.017227,0.073724,0.126126,0.0,0.0,0.0,0.169706,0.016848,0.047308,0.0,0.252722,0.0,0.022837,0.0,0.0,0.334722,0.086705,0.0,0.0,0.017227,0.018857,0.0,0.024041,0.0,0.103522,0.0,0.0,0.0,0.0,0.01975,0.024685,0.0,0.0,0.108108,0.01612,0.020219,0.0,0.0,0.101155,0.036862,0.0,0.0,0.0,0.019296,0.092567,0.0,0.0,0.052851,0.066817,0.078999,0.277701,0.126126
2,0.0,0.0,0.126987,0.134669,0.0,0.131652,0.0,0.0,0.0,0.046997,0.042907,0.0,0.0,0.0,0.036763,0.048102,0.045927,0.0,0.052924,0.0,0.103304,0.0,0.0,0.0,0.0,0.049244,0.040136,0.216981,0.085814,0.0,0.100853,0.052924,0.078521,0.150301,0.071941,0.08978,0.085814,0.24622,0.0,0.0,0.050426,0.164138,0.041958,0.166863,0.096203,0.048102,0.206608,0.08978,0.083915,0.04489,0.0,0.050426,0.0,0.0,0.488478,0.041034,0.038407,0.057055,0.36931,0.046997,0.0,0.0,0.0,0.042907,0.211174,0.0,0.0,0.0,0.0,0.0,0.0,0.093993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043884,0.117781,0.0,0.054245,0.0,0.0,0.269339,0.0,0.040136,0.0,0.0,0.075151,0.0,0.080272,0.042907,0.054245,0.096203,0.225452,0.043884
3,0.037094,0.067428,0.267012,0.031463,0.03999,0.061516,0.050085,0.077301,0.0,0.0,0.0,0.03999,0.030758,0.0,0.077301,0.134856,0.0,0.0,0.074188,0.0,0.108607,0.03219,0.183116,0.084393,0.0,0.0,0.056262,0.304162,0.09022,0.0,0.0,0.185469,0.027517,0.42138,0.100845,0.031463,0.060146,0.034515,0.037094,0.0,0.0,0.086282,0.029408,0.077969,0.0,0.0,0.217215,0.094389,0.058816,0.062926,0.092274,0.212061,0.14481,0.055035,0.158017,0.143804,0.026919,0.0,0.086282,0.0,0.038984,0.0,0.038984,0.060146,0.222015,0.0,0.092274,0.029408,0.03219,0.094389,0.0,0.03294,0.070687,0.0,0.0,0.0,0.029408,0.0,0.0,0.03999,0.0,0.030758,0.0,0.0,0.07604,0.0,0.024668,0.0,0.0,0.112524,0.0,0.098819,0.21069,0.0,0.056262,0.030073,0.03802,0.235998,0.237026,0.061516
4,0.0,0.0,0.221561,0.156644,0.028442,0.087505,0.0,0.109959,0.0,0.023428,0.021389,0.028442,0.0,0.0,0.018327,0.143872,0.0,0.0,0.026383,0.0,0.077246,0.0,0.162799,0.060023,0.0,0.0,0.060023,0.37858,0.042778,0.025138,0.025138,0.211061,0.019571,0.337164,0.089656,0.0,0.042778,0.220934,0.026383,0.0,0.0,0.020456,0.020916,0.027727,0.023979,0.0,0.18024,0.022378,0.041832,0.022378,0.065629,0.100551,0.0,0.058714,0.074925,0.122735,0.019146,0.0,0.184102,0.0,0.0,0.0,0.027727,0.171114,0.298266,0.0,0.043752,0.041832,0.0,0.022378,0.0,0.0,0.150826,0.0,0.0,0.0,0.0,0.023979,0.059941,0.028442,0.0,0.087505,0.019571,0.024548,0.108166,0.0,0.03509,0.044755,0.023428,0.060023,0.0,0.023428,0.187313,0.131913,0.040016,0.021389,0.081124,0.119894,0.299701,0.153133


## Using longer n-grams

In [25]:
# instantiate a trigram vectorizer: ngram_range=(3, 3) keeps only
# three-word sequences, capped at 100 features
cv_trigram_vec = CountVectorizer(max_features=100,
                                 stop_words='english',
                                 ngram_range=(3, 3))

# fit and apply trigram vec
cv_trigram = cv_trigram_vec.fit_transform(speech_df['text_clean'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement (converted to a plain list so the printed output
# keeps the same shape) and fall back to the old name on older installs
trigram_feature_names = (cv_trigram_vec.get_feature_names_out().tolist()
                         if hasattr(cv_trigram_vec, 'get_feature_names_out')
                         else cv_trigram_vec.get_feature_names())

# print the trigram features
print(trigram_feature_names)

['ability preserve protect', 'agriculture commerce manufactures', 'america ideal freedom', 'believe men right', 'best ability preserve', 'best interests country', 'bless god bless', 'bless united states', 'chief justice mr', 'children children children', 'citizens united states', 'civil religious liberty', 'civil service reform', 'commerce united states', 'confidence fellow citizens', 'congress extraordinary session', 'constitution does expressly', 'constitution united states', 'coordinate branches government', 'cultivate friendship nations', 'day task people', 'defend constitution united', 'distinguished guests fellow', 'does expressly say', 'dreams hopes goals', 'economy public expenditure', 'equal exact justice', 'era good feeling', 'executive branch government', 'faithfully execute office', 'fellow citizens assembled', 'fellow citizens called', 'fellow citizens large', 'fellow citizens world', 'form perfect union', 'general welfare secure', 'god bless america', 'god bless god', 'go

## Finding the most common words

In [26]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back to the old name on older installs
cv_tri_feature_names = (cv_trigram_vec.get_feature_names_out()
                        if hasattr(cv_trigram_vec, 'get_feature_names_out')
                        else cv_trigram_vec.get_feature_names())

# create a df of the features: one row per speech, one column per trigram
cv_tri_df = pd.DataFrame(cv_trigram.toarray(),
                         columns=cv_tri_feature_names).add_prefix('Counts_')

# top 5 trigrams by total count across all speeches
cv_tri_df.sum().sort_values(ascending=False).head()

Counts_constitution united states    20
Counts_people united states          13
Counts_preserve protect defend       10
Counts_mr chief justice              10
Counts_president united states        8
dtype: int64