# Bonus Chronicling America Analysis

In [None]:
# imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import matplotlib.pyplot as plt

In [None]:
# load the dataset from its CSV file into a DataFrame
csv_path = '../data/socalism1945-1963.csv'
df = pd.read_csv(csv_path)

In [None]:
# parse the 'date' column into pandas datetime values (replaces the column)
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# add a 'year' column holding the year component of each parsed date
year_of_each_row = df['date'].dt.year
df['year'] = year_of_each_row

In [None]:
# keep only the two columns the analysis below needs
columns_for_analysis = ['year', 'lemmas']
small_df = df[columns_for_analysis]

In [None]:
# group the dataframe by year and combine each year's lemmas into one string
# Rows are joined with a space rather than a comma: the conditional frequency
# section later tokenizes these strings on whitespace, so a ',' separator
# would glue the last lemma of one row to the first lemma of the next
# (e.g. 'war,labor') and that fused token could never match a key word.
grouped = small_df.groupby('year')['lemmas'].apply(' '.join).reset_index()
print(grouped.head())

## TF-IDF 

In [None]:
# reshape data for sklearn: two parallel lists, one entry per year
# years are stored as strings; lemmas are the combined text for that year
years_list = [str(year_value) for year_value in grouped['year']]
lemmas_list = list(grouped['lemmas'])

In [None]:
# create the tf-idf model and score each year's combined lemmas
tfidf_options = {
    'max_df': .65,       # upper document-frequency cutoff for terms
    'min_df': 1,         # lower document-frequency cutoff for terms
    'stop_words': None,  # no stop-word list is applied
    'use_idf': True,
    'norm': None,        # leave the scores un-normalized
}
vectorizer = TfidfVectorizer(**tfidf_options)

# fit on the per-year documents, then convert the sparse result to a dense
# array with one row per year and one column per vocabulary term
transformed_lemmas = vectorizer.fit_transform(lemmas_list)
transformed_lemmas_as_array = transformed_lemmas.toarray()

In [None]:
# organize results: map each year to its ten highest-scoring (term, score) pairs
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement get_feature_names_out() and fall back only on older versions.
# The vocabulary is the same for every year, so look it up once, not per row.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

key_terms_by_year = {}
for lemmas, year in zip(transformed_lemmas_as_array, years_list):
    # pair every vocabulary term with its tf-idf score for this year
    tf_idf_tuples = zip(feature_names, lemmas)
    sorted_tf_idf_tuples = sorted(tf_idf_tuples, key=lambda x: x[1], reverse=True)
    key_terms_by_year[year] = sorted_tf_idf_tuples[:10]  # only getting the top ten

In [None]:
# display results: one line per year in the form "<year> => term1, term2, ..."
# Joining whatever terms are present (at most ten) avoids the IndexError that
# fixed v[0]..v[9] indexing raises when a year has fewer than ten terms.
for k, v in key_terms_by_year.items():
    top_terms = ', '.join(term for term, _score in v)
    result = k + ' => ' + top_terms
    print(result)

## Conditional frequency distribution

In [None]:
# reshape data: map each year to the list of its individual lemma tokens
# split() with no argument splits on any run of whitespace, so repeated
# spaces do not produce empty-string tokens and tabs/newlines also separate
# tokens (split(' ') handled neither case).
terms_by_year = {
    year: terms.split()
    for year, terms in zip(years_list, lemmas_list)
}

In [None]:
# conditional frequency distribution
# Note: these lines are adapted from NLTK
# Each key word is a condition; the years in which it occurs are the samples,
# so cfd[key_word][year] is the number of occurrences of key_word in that year.
key_words = ['proletariat', 'bourgeois']  # <-- insert token(s) to explore (lowercase)
cfd = nltk.ConditionalFreqDist()
for year, year_terms in terms_by_year.items():
    for lemma in year_terms:
        lowered_lemma = lemma.lower()  # compare case-insensitively
        for key_word in key_words:
            if lowered_lemma == key_word:
                cfd[key_word][year] += 1

In [None]:
# display plot
# open a wide figure first so the plot is more readable
figure_size = (20, 8)
plt.figure(figsize=figure_size)
cfd.plot()