In [2]:
import re

def extract_sentences_with_keywords(text, keywords):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    
    filtered_sentences = [
        sentence for sentence in sentences 
        if any(re.search(rf'\b{re.escape(keyword)}\b', sentence, re.IGNORECASE) for keyword in keywords)
    ]
    
    return '\n'.join(filtered_sentences)


In [3]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from concurrent.futures import ThreadPoolExecutor
import nltk

nltk.download('stopwords')

keywords_to_match = ['she', 'her', 'hers', 'lady', 'ladies', 'woman', 'women', 'girl', 'girls']

stop_words = set(stopwords.words('english'))

# Specify the directory containing your files
directory = '/Users/praharshita/Desktop/Capstone/cleaned_files_by_year'

# List of words to be excluded
exclude_words = ['hon', 'lord', 'said', "'s", 'would', 'could', 'house', 'mr', 'government', 'state', 'noble',
                 'secretary', 'people', "''", 'upon', 'whether', 'made', 'many', 'make', 'ask', 'one', 'may', "mr."]
exclude_words.extend(stop_words)

# Initialize an empty DataFrame to store results
df_results = pd.DataFrame(columns=['Year', 'Top Words'])

def process_file(filename):
    # Initialize a Counter to store word counts
    word_counts = Counter()

    # Read the content of the file
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        print(file_path)
        content = file.read()
        pronoun_content = extract_sentences_with_keywords(content, keywords_to_match)

    # Tokenize and apply any necessary preprocessing
    # You can add more preprocessing steps based on your specific needs
    tokens = pronoun_content.lower().split()

    # Exclude specified words
    tokens = [token for token in tokens if token not in exclude_words]

    # Update word counts for the current year
    word_counts.update(tokens)

    # Get the top 5 words for the current year
    top_words_year = [word for word, count in word_counts.most_common(10)]

    return {'Year': filename[:-4], 'Top Words': top_words_year}

# Use ThreadPoolExecutor to parallelize file processing
with ThreadPoolExecutor() as executor:
    # Iterate through files in the directory
    results = list(executor.map(process_file, [filename for filename in os.listdir(directory) if filename.endswith('.txt')]))

# Append the results to the DataFrame
df_results = df_results.append(results, ignore_index=True)
df_results.to_csv('//Users/praharshita/Desktop/Capstone/results/top_words_woman_results.csv', index=False)

# Print the results
#print(df_results)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/praharshita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1906.txt/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1912.txt

/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/2002.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1866.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1899.txt/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1873.txt

/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1872.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1898.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1867.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/2003.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1913.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1907.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1939.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1905.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1911

/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1955.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1941.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1969.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1821.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1835.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1809.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1808.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1834.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1820.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1968.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1940.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1954.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1983.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1997.txt
/Users/praharshita/Desktop/Capstone/cleaned_files_by_year/1933

  df_results = df_results.append(results, ignore_index=True)


In [6]:
df_results = df_results.sort_values(by='Year')
df_results.to_csv('//Users/praharshita/Desktop/Capstone/results/top_words_woman_results.csv', index=False)


In [7]:
len(pronoun_content)

NameError: name 'pronoun_content' is not defined