In [5]:
import re
import os
import nltk
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords  # Make sure to import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [33]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.0
Note: you may need to restart the kernel to use updated packages.


In [6]:

def feature_importance(file_path,N):
    # Read the content of the text file
    with open(file_path, "r", encoding="utf-8") as file:
        corpus = [file.read()]

    # Assuming 'corpus' is a list of text documents for a particular year
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(corpus)

    # Get feature names and corresponding TF-IDF scores
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = X.sum(axis=0).A1

    # Create a dictionary of terms and their TF-IDF scores
    term_tfidf = dict(zip(feature_names, tfidf_scores))
    top_keywords = sorted(term_tfidf.items(), key=lambda x: x[1], reverse=True)[:N]

    #print(top_keywords)

    return top_keywords

In [25]:
file_path = '/Users/praharshita/Desktop/Capstone/word-embedding-files/1918-1927.txt'
features=feature_importance(file_path,200)
df = pd.DataFrame(features, columns=['Feature', 'Importance'])
print(df)
df.to_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1918-1927).csv', index=False)

        Feature  Importance
0           hon    0.300230
1    government    0.230630
2         house    0.203093
3         think    0.176115
4      question    0.167270
..          ...         ...
195    military    0.023670
196      debate    0.023540
197        coal    0.023531
198     figures    0.023480
199     dealing    0.023455

[200 rows x 2 columns]


In [26]:
# Example usage
file_path = '/Users/praharshita/Desktop/Capstone/word-embedding-files/1928-1938.txt'
features=feature_importance(file_path,200)
#print(feature_importance(file_path,40))
df = pd.DataFrame(features, columns=['Feature', 'Importance'])
print(df)
df.to_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1928-1938).csv', index=False)

        Feature  Importance
0           hon    0.402699
1    government    0.218142
2        member    0.185402
3         right    0.178004
4         house    0.172600
..          ...         ...
195   interests    0.023242
196        sure    0.023095
197     support    0.023061
198   insurance    0.023005
199       prime    0.022999

[200 rows x 2 columns]


In [27]:
# Example usage
file_path = '/Users/praharshita/Desktop/Capstone/word-embedding-files/2000-2006.txt'
features=feature_importance(file_path,200)
#print(feature_importance(file_path,40))
df = pd.DataFrame(features, columns=['Feature', 'Importance'])
print(df)
df.to_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(2000-2006).csv', index=False)

        Feature  Importance
0    government    0.272106
1         state    0.218094
2     secretary    0.217014
3           hon    0.200842
4           ask    0.196474
..          ...         ...
195       child    0.026746
196  understand    0.026390
197    baroness    0.026084
198     housing    0.025966
199      months    0.025946

[200 rows x 2 columns]


In [28]:
# Example usage
file_path = '/Users/praharshita/Desktop/Capstone/word-embedding-files/1800-1899.txt'
features=feature_importance(file_path,200)
#print(feature_importance(file_path,40))
df = pd.DataFrame(features, columns=['Feature', 'Importance'])
print(df)
df.to_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1800-1899).csv', index=False)

        Feature  Importance
0           hon    0.361738
1         house    0.299669
2    government    0.209759
3          said    0.199012
4         right    0.177512
..          ...         ...
195      ground    0.022453
196  impossible    0.022410
197       small    0.022358
198        read    0.022276
199      little    0.022032

[200 rows x 2 columns]


In [29]:
# Example usage
file_path = '/Users/praharshita/Desktop/Capstone/word-embedding-files/1900-1999.txt'
features=feature_importance(file_path,200)
#print(feature_importance(file_path,40))
df = pd.DataFrame(features, columns=['Feature', 'Importance'])
print(df)
df.to_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1900-1999).csv', index=False)

        Feature  Importance
0           hon    0.354247
1    government    0.224782
2     secretary    0.196639
3         state    0.196035
4         right    0.157945
..          ...         ...
195       terms    0.024343
196      making    0.024305
197        kind    0.024195
198      reason    0.024044
199   following    0.023998

[200 rows x 2 columns]


In [49]:
df_1800_1899 = pd.read_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1800-1899).csv')
df_1900_1999 = pd.read_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1900-1999).csv')

In [51]:
merged_df = pd.merge(df_1800_1899, df_1900_1999, on='Feature', how='inner', suffixes=('1800_1899', '1900_1999'))
merged_df['importance_change'] = abs(merged_df['Importance1800_1899'] - merged_df['Importance1900_1999'])

# Sort the DataFrame by the importance change in descending order
merged_df.sort_values(by='importance_change', ascending=False, inplace=True)

# Display the top N words with the greatest change in importance scores
top_n_words = 100  # You can adjust this based on your preference
print(merged_df.head(top_n_words)[['Feature','Importance1800_1899', 'Importance1900_1999']])
merged_df.to_csv('/Users/praharshita/Desktop/Capstone/results/language_change(1800s-1900s).csv', index=False)


       Feature  Importance1800_1899  Importance1900_1999
1        house             0.299669             0.140958
46   secretary             0.049492             0.196639
3         said             0.199012             0.089871
19       state             0.094296             0.196035
70       asked             0.037151             0.134066
..         ...                  ...                  ...
90         pay             0.028505             0.034005
81        long             0.030225             0.035422
66     certain             0.039595             0.034536
101     period             0.025927             0.030468
117      small             0.022358             0.026833

[100 rows x 3 columns]


In [52]:
df_1900_1999 = pd.read_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1900-1999).csv')
df_2000_2006 = pd.read_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(2000-2006).csv')

In [53]:
merged_df = pd.merge(df_1900_1999, df_2000_2006, on='Feature', how='inner', suffixes=('1900_1999', '2000_2006'))
merged_df['importance_change'] = abs(merged_df['Importance1900_1999'] - merged_df['Importance2000_2006'])

# Sort the DataFrame by the importance change in descending order
merged_df.sort_values(by='importance_change', ascending=False, inplace=True)

# Display the top N words with the greatest change in importance scores
top_n_words = 100  # You can adjust this based on your preference
print(merged_df.head(top_n_words)[['Feature','Importance1900_1999', 'Importance2000_2006']])
merged_df.to_csv('/Users/praharshita/Desktop/Capstone/results/language_change(1900s-2000s).csv', index=False)


         Feature  Importance1900_1999  Importance2000_2006
0            hon             0.354247             0.200842
18           ask             0.092588             0.196474
8          asked             0.134066             0.031036
16         think             0.095592             0.033943
5         member             0.146170             0.092620
..           ...                  ...                  ...
105      ireland             0.034369             0.039463
17         years             0.092830             0.087749
102          tax             0.034784             0.039681
49   authorities             0.060403             0.055685
92         total             0.037364             0.032663

[100 rows x 3 columns]


In [54]:
df_1918_1927 = pd.read_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1918-1927).csv')
df_1928_1938 = pd.read_csv('/Users/praharshita/Desktop/Capstone/results/feature_importance(1928-1938).csv')

In [56]:
merged_df = pd.merge(df_1900_1999, df_2000_2006, on='Feature', how='inner', suffixes=('1918_1927', '1928_1938'))
merged_df['importance_change'] = abs(merged_df['Importance1918_1927'] - merged_df['Importance1928_1938'])

# Sort the DataFrame by the importance change in descending order
merged_df.sort_values(by='importance_change', ascending=False, inplace=True)

# Display the top N words with the greatest change in importance scores
top_n_words = 100  # You can adjust this based on your preference
print(merged_df.head(top_n_words)[['Feature','Importance1918_1927', 'Importance1928_1938']])
merged_df.to_csv('/Users/praharshita/Desktop/Capstone/results/language_change_before_after_suffrage).csv', index=False)


         Feature  Importance1918_1927  Importance1928_1938
0            hon             0.354247             0.200842
18           ask             0.092588             0.196474
8          asked             0.134066             0.031036
16         think             0.095592             0.033943
5         member             0.146170             0.092620
..           ...                  ...                  ...
105      ireland             0.034369             0.039463
17         years             0.092830             0.087749
102          tax             0.034784             0.039681
49   authorities             0.060403             0.055685
92         total             0.037364             0.032663

[100 rows x 3 columns]
