In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Read the preprocessed review sample and keep only the rows that actually
# carry text in the 'corrected_review' column.
csv_file_path = 'preprocessing/preprocessed_sample.csv'
data = pd.read_csv(csv_file_path).dropna(subset=['corrected_review'])

# One string per review, in DataFrame row order.
documents = list(data['corrected_review'])

# TextBlob polarity for every review, aligned index-for-index with `documents`.
sentiment_scores = [TextBlob(text).sentiment.polarity for text in documents]

# Split on the sign of the polarity; reviews scoring exactly 0 land in
# neither list.
positive_reviews = [
    text
    for text, polarity in zip(documents, sentiment_scores)
    if polarity > 0
]
negative_reviews = [
    text
    for text, polarity in zip(documents, sentiment_scores)
    if polarity < 0
]

# Each sentiment group gets its own bag-of-words model: vocabulary capped at
# 100 terms, English stop words removed, and terms that occur in more than
# 80% of the group's documents ignored.
positive_vectorizer = CountVectorizer(
    max_features=100,
    max_df=0.8,
    stop_words='english',
)
negative_vectorizer = CountVectorizer(
    max_features=100,
    max_df=0.8,
    stop_words='english',
)

# Learn the vocabularies and build the document-term matrices.
positive_X = positive_vectorizer.fit_transform(positive_reviews)
negative_X = negative_vectorizer.fit_transform(negative_reviews)

# Vocabulary terms retained by each vectorizer.
positive_feature_names = positive_vectorizer.get_feature_names_out()
negative_feature_names = negative_vectorizer.get_feature_names_out()

# Report both vocabularies, separated by a 40-character rule.
print("Most Frequent Positive Features:")
print(positive_feature_names)
print("=" * 40)
print("Most Frequent Negative Features:")
print(negative_feature_names)


Most Frequent Positive Features:
['12' '13' '14' 'alway' 'amaze' 'android' 'apply' 'awesome' 'backup'
 'battery' 'best' 'better' 'build' 'buy' 'camera' 'charge' 'color'
 'colour' 'come' 'compare' 'conduit' 'day' 'deal' 'deliver' 'design'
 'devil' 'differ' 'display' 'dream' 'everyth' 'excel' 'expect' 'expert'
 'fast' 'feature' 'feel' 'fine' 'flippant' 'genuine' 'good' 'got' 'great'
 'hands' 'happy' 'heat' 'improv' 'io' 'issue' 'lag' 'life' 'light' 'like'
 'look' 'lot' 'love' 'low' 'mobil' 'money' 'need' 'new' 'nice' 'ok'
 'order' 'overall' 'perfect' 'perform' 'phone' 'photo' 'pictur' 'power'
 'premium' 'price' 'pro' 'product' 'purchase' 'quality' 'ram' 'rang'
 'really' 'say' 'service' 'siphon' 'smooth' 'sound' 'super' 'superb'
 'switch' 'thank' 'thing' 'time' 'upgrade' 'use' 'value' 'video' 'want'
 'word' 'work' 'worth' 'wow' 'yellow']
Most Frequent Negative Features:
['10' '14' 'android' 'apply' 'average' 'bad' 'battery' 'build' 'camera'
 'compare' 'complaint' 'crazy' 'custom' 'damage'