In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import boto3
import json
import nltk
# Fetch the NLTK resources this notebook relies on: the 'punkt' package and
# the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stopwords, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /home/madhavbpanicker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/madhavbpanicker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Where the scraped pages live and which column stores each page's raw HTML
file_path = 'web_pages.csv'  # Replace with the path to your CSV file
html_column = 'page_content'  # Replace with the actual column name containing HTML data

# Load the CSV file with scraped pages
data = pd.read_csv(file_path)

# Fail fast when the expected HTML column is absent
if html_column not in data.columns:
    missing_column_message = f"The specified column '{html_column}' does not exist in the CSV file."
    raise ValueError(missing_column_message)

# Replace missing cells with empty strings; fillna only touches missing
# values, so any other cell content is passed through unchanged.
data[html_column] = data[html_column].fillna('')
html_data = data[html_column]


In [5]:
# Extract paragraphs containing reviews
# Accumulator for the paragraph texts returned by extract_reviews for each
# scraped page; it is filled by the loop at the end of this cell.
all_reviews = []
def extract_reviews(html):
    """Return the text of every non-empty <p> element found in ``html``.

    The markup is parsed with BeautifulSoup's 'html.parser' backend. Each
    paragraph's text is stripped of surrounding whitespace, and paragraphs
    whose stripped text is empty are dropped. Document order is preserved.
    """
    parsed_page = BeautifulSoup(html, 'html.parser')
    stripped_texts = (tag.get_text().strip() for tag in parsed_page.find_all('p'))
    return [paragraph_text for paragraph_text in stripped_texts if paragraph_text]

# Walk every scraped page and append its paragraphs to the shared list
for html in html_data:
    all_reviews += extract_reviews(html)


In [6]:
# Perform sentiment analysis
# One TextBlob polarity score per extracted paragraph, in the same order as all_reviews
sentiments = [TextBlob(review).sentiment.polarity for review in all_reviews]

# Calculate the average sentiment; fall back to 0 when nothing was extracted
if sentiments:
    average_sentiment = sum(sentiments) / len(sentiments)
else:
    average_sentiment = 0
print(f"Average Sentiment: {average_sentiment}")

### Extract Keywords
# Accumulator for the keywords returned by extract_keywords for each review;
# it is filled by the loop further down in this cell.
keywords = []
def extract_keywords(text):
    """Return the lowercase keywords of ``text`` as a list.

    ``text`` is tokenized with TextBlob. A token is kept only when its
    lowercase form is not in ``stop_words`` and the token consists solely of
    alphabetic characters. Kept tokens are returned lowercased, in their
    original order, with duplicates preserved.
    """
    kept = []
    for token in TextBlob(text).words:
        lowered = token.lower()
        # Skip stopwords first, then anything containing non-letters
        if lowered in stop_words or not token.isalpha():
            continue
        kept.append(lowered)
    return kept

# Gather the keywords of every review into the shared list
for review in all_reviews:
    keywords += extract_keywords(review)

# Tally how many times each keyword occurs
keyword_counts = Counter()
keyword_counts.update(keywords)

Average Sentiment: 0.17844992397832807


In [7]:
# Word cloud generation
# Build an 800x400 cloud on a white background, sizing each keyword by its
# count in keyword_counts.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(keyword_counts)

# Optional inline preview; uncomment to display the word cloud in the notebook
#plt.figure(figsize=(10, 5))
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis('off')
#plt.title('Word Cloud for Hyundai Reviews')
#plt.show()

# Save word cloud image. to_file() hands back the WordCloud object, so the
# trailing semicolon keeps the notebook from echoing its repr as cell output.
wordcloud_image_path = 'wordcloud.png'
wordcloud.to_file(wordcloud_image_path);


<wordcloud.wordcloud.WordCloud at 0x7e49af9a8c80>

In [8]:
# Destination bucket and key prefix for the uploaded artifacts
s3_bucket = 'madhavbpanicker'  # Replace with your S3 bucket name
results_path = 'results/'

# Initialize S3 client
s3 = boto3.client('s3')

# Bundle the average sentiment and the keyword counts, then serialize to JSON
results = {'average_sentiment': average_sentiment, 'keywords': keyword_counts}
results_json = json.dumps(results)

# Write the JSON to a local file so it can be uploaded alongside the image
results_json_path = 'sentiment_results.json'
with open(results_json_path, 'w') as f:
    f.write(results_json)

# Upload the results JSON first, then the word cloud image; each object key
# is the results prefix followed by the local file name.
for local_file in (results_json_path, wordcloud_image_path):
    s3.upload_file(local_file, s3_bucket, f'{results_path}{local_file}')

print("Results uploaded to S3.")

Results uploaded to S3.
