In [None]:
import pandas as pd

In [None]:
# Step 1: Data Acquisition
# Read the review data from a CSV file into the working DataFrame `df`.
# NOTE(review): 'your_dataset.csv' looks like a placeholder path — confirm the real location.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Step 2: Text Cleaning
# Needs the NLTK stop-word corpus and the tokenizer data behind word_tokenize
# to be available locally.

# Drop rows that repeat an earlier (title, body) pair, then rows where either
# field is missing.
df = df.drop_duplicates(subset=['title', 'body']).dropna(subset=['title', 'body'])

# Removing stop words, punctuation, and special characters
stop_words = set(stopwords.words('english'))


def _clean_text(raw):
    """Return `raw` as space-joined tokens without stop words or non-alphanumeric tokens.

    The value goes through str() first, so a non-string cell is tokenized by
    its string form. Stop words are matched case-insensitively; tokens that
    survive keep their original casing.
    """
    kept = (
        token
        for token in word_tokenize(str(raw))
        if token.isalnum() and token.lower() not in stop_words
    )
    return ' '.join(kept)


df['cleaned_title'] = df['title'].apply(_clean_text)
df['cleaned_body'] = df['body'].apply(_clean_text)

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Step 3: Preprocessing
# Every stage stores one list of tokens per row in a new column.

# Tokenization: split the cleaned strings back into token lists.
df['tokenized_title'] = df['cleaned_title'].apply(word_tokenize)
df['tokenized_body'] = df['cleaned_body'].apply(word_tokenize)

# Stemming: run the Porter stemmer over each token.
ps = PorterStemmer()
df['stemmed_title'] = df['tokenized_title'].apply(lambda tokens: list(map(ps.stem, tokens)))
df['stemmed_body'] = df['tokenized_body'].apply(lambda tokens: list(map(ps.stem, tokens)))

# Lemmatization: applied to the tokenized columns (not the stemmed ones), and
# called with the word only, i.e. without a part-of-speech argument.
lemmatizer = WordNetLemmatizer()
df['lemmatized_title'] = df['tokenized_title'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
df['lemmatized_body'] = df['tokenized_body'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Mapping ratings to sentiments (1-2: Negative, 3: Neutral, 4-5: Positive)
# An explicit lookup is used instead of a trailing `else 'positive'`, so a
# missing, non-numeric or out-of-range rating is never silently labelled
# 'positive'.
rating_to_sentiment = {
    1: 'negative',
    2: 'negative',
    3: 'neutral',
    4: 'positive',
    5: 'positive',
}
# The rating is coerced to a number only for the lookup (so '5' and 5.0 both
# resolve); the 'rating' column itself is left untouched. Anything that does
# not resolve to exactly 1-5 becomes NaN.
df['sentiment'] = pd.to_numeric(df['rating'], errors='coerce').map(rating_to_sentiment)

# Rows without a usable rating cannot be labelled. Drop them (and report how
# many) rather than feeding NaN labels into the encoder.
unlabelled = df['sentiment'].isna()
if unlabelled.any():
    print('Dropping {} row(s) with a missing or invalid rating'.format(int(unlabelled.sum())))
    df = df.loc[~unlabelled].copy()

# Label encoding: turn the sentiment strings into integer class ids.
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])
# Concatenating title and body into a single text field
df['text'] = df['cleaned_title'] + ' ' + df['cleaned_body']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Step 4: Exploratory Data Analysis (EDA)
# Distribution of Ratings: one bar per distinct 'rating' value, height = row count.
plt.figure(figsize=(8, 6))
# countplot returns the Axes it drew on; title and labels are set through it.
sns.countplot(data=df, x='rating').set(
    title='Distribution of Ratings',
    xlabel='Rating',
    ylabel='Count',
)
plt.show()

In [None]:
# Sentiment Distribution: one bar per distinct 'sentiment' label, height = row count.
plt.figure(figsize=(8, 6))
# countplot returns the Axes it drew on; title and labels are set through it.
sns.countplot(data=df, x='sentiment').set(
    title='Sentiment Distribution',
    xlabel='Sentiment',
    ylabel='Count',
)
plt.show()

In [None]:
from wordcloud import WordCloud
# Word Cloud for Title
# Merge every non-null cleaned title into one space-separated string.
title_text = ' '.join(df['cleaned_title'].dropna().tolist())

# Build the cloud from that text on a white 800x400 canvas.
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(title_text)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Title')
plt.show()

In [None]:
# Word Cloud for Body
# Merge every non-null cleaned body into one space-separated string.
body_text = ' '.join(df['cleaned_body'].dropna().tolist())

# Build the cloud from that text on a white 800x400 canvas.
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(body_text)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Body')
plt.show()

In [None]:
from collections import Counter
# Top 10 Most Common Words in Title
# Flatten the cleaned titles into a single list of whitespace-separated words.
title_words = [
    word
    for title in df['cleaned_title'].dropna()
    for word in title.split()
]
# Count occurrences and keep the ten most frequent (word, count) pairs.
title_word_counts = Counter(title_words)
top_10_title_words = title_word_counts.most_common(10)
top_10_title_words

In [None]:
# Top 10 Most Common Words in Body
# Flatten the cleaned bodies into a single list of whitespace-separated words.
body_words = [
    word
    for body in df['cleaned_body'].dropna()
    for word in body.split()
]
# Count occurrences and keep the ten most frequent (word, count) pairs.
body_word_counts = Counter(body_words)
top_10_body_words = body_word_counts.most_common(10)
top_10_body_words

In [None]:
# Distribution of Helpful Votes: histogram of the 'helpfulVotes' column with a
# kernel density estimate overlaid.
plt.figure(figsize=(8, 6))
# histplot returns the Axes it drew on; title and labels are set through it.
sns.histplot(df['helpfulVotes'], kde=True).set(
    title='Distribution of Helpful Votes',
    xlabel='Helpful Votes',
    ylabel='Count',
)
plt.show()

In [None]:
# Distribution of Text Lengths
# Store len() of each 'text' value (its character count for strings) in 'text_length'.
df['text_length'] = df['text'].map(len)