In [None]:
# Install dependencies
!pip install datasets
!pip install pandas
!pip install seaborn
!pip install datasets pandas seaborn nltk
!pip install nltk



In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Download the NLTK resources used later in the notebook:
#   - 'stopwords': English stopword list used to filter tokens
#   - 'punkt' / 'punkt_tab': sentence tokenizer data used by word_tokenize
# Recent NLTK releases load 'punkt_tab' instead of 'punkt', so both are
# requested to keep the notebook working regardless of the installed version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the IMDB dataset
imdb = load_dataset('imdb')

# Convert the training split into a pandas DataFrame for easier manipulation.
# The dataset's built-in export is used instead of handing the Dataset object
# to the DataFrame constructor, which has to walk it one row at a time.
df = imdb['train'].to_pandas()

# Add a column for text length (number of characters per review).
# .str.len() is the vectorized string accessor and yields NaN for a missing
# text instead of raising like apply(len) would.
df['text_length'] = df['text'].str.len()

# Map labels to 'negative' and 'positive' for easier interpretation
# (labels outside the mapping become NaN)
df['sentiment'] = df['label'].map({0: 'negative', 1: 'positive'})





In [None]:
# Build a fresh pandas DataFrame from the training split.
# Note: this rebinds `df`, so any columns added to a previous `df` are not
# carried over and must be derived again afterwards.
df = pd.DataFrame(data=imdb["train"])


In [None]:
# Display the first few rows of the DataFrame.
# Leaving the frame as the cell's last expression lets Jupyter render it as a
# rich table instead of the plain-text dump produced by print().
df.head()


In [None]:
# Display basic statistical descriptions of the numeric columns.
# The bare last expression is rendered by Jupyter as a rich table, which is
# easier to read than the text produced by print().
df.describe()


In [None]:
# Tally how many reviews carry each label value; the result is indexed by the
# raw integer label and reused by the bar plot that follows.
sentiment_counts = df.loc[:, 'label'].value_counts()
print(sentiment_counts)


In [None]:
# Plot the distribution of sentiments.
# `sentiment_counts` is indexed by the raw integer label, so the tick labels
# are translated to readable names (0 -> 'negative', 1 -> 'positive'); any
# label missing from the mapping is shown as-is. Bars are drawn in sorted
# label order so the layout does not depend on the order value_counts returns.
label_names = {0: 'negative', 1: 'positive'}
ordered_labels = sorted(sentiment_counts.index)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=[label_names.get(label, label) for label in ordered_labels],
    y=sentiment_counts.loc[ordered_labels].values,
    ax=ax,
)
ax.set_title('Distribution of Sentiments')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Reviews')
plt.show()


In [None]:
# Add a column for text length (number of characters per review).
# The vectorized .str accessor replaces a per-row Python call and returns NaN
# for a missing text instead of raising a TypeError.
df['text_length'] = df['text'].str.len()


In [None]:
# Histogram (50 bins) of review lengths with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=df, x='text_length', bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Text Lengths')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Count the missing entries per column and report them
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)


In [None]:
# Remove rows that exactly repeat an earlier row (the first occurrence is
# kept), then report the resulting shape
df = df.drop_duplicates(keep='first')
print("Dataframe shape after dropping duplicates:", df.shape)


In [None]:
# Derive a human-readable 'sentiment' column from the integer label
# (0 -> 'negative', 1 -> 'positive'); labels outside the mapping become NaN.
df['sentiment'] = df['label'].map({0: 'negative', 1: 'positive'})

# Verify that the column exists: the preview of the first few rows should now
# include 'sentiment'. The bare expression is rendered as a rich table.
df.head()



In [None]:
# Compare the spread of review lengths between the sentiment classes
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='sentiment', y='text_length', ax=ax)
ax.set_title('Boxplot of Text Lengths by Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Text Length')
plt.show()



In [None]:
# Ensure nltk resources are downloaded before tokenizing.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the tokenizer data from 'punkt_tab'; without it word_tokenize fails
# with a LookupError on those versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Function to plot word frequency
def plot_word_frequency(texts, title, top_n=20):
    """Plot the most frequent non-stopword words in a collection of texts.

    The texts are joined with spaces, lower-cased and tokenized with NLTK's
    ``word_tokenize``. Tokens that are not purely alphabetic, or that appear
    in NLTK's English stopword list, are discarded before counting.

    Parameters
    ----------
    texts : iterable of str
        Documents to analyse (for example a pandas Series of review texts).
    title : str
        Description of the corpus, inserted into the figure title.
    top_n : int, default 20
        Number of most frequent words to show.

    Returns
    -------
    None
        The horizontal bar chart is rendered with ``plt.show()``.
    """
    # Evaluate the stopword list once and keep it as a set. Calling
    # stopwords.words('english') inside the filter re-evaluated it and scanned
    # the resulting list linearly for every single token.
    english_stopwords = set(stopwords.words('english'))

    tokens = word_tokenize(' '.join(texts).lower())
    words = [
        token for token in tokens
        if token.isalpha() and token not in english_stopwords
    ]

    top_words = pd.DataFrame(
        nltk.FreqDist(words).most_common(top_n),
        columns=['Word', 'Frequency'],
    )

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Frequency', y='Word', data=top_words)
    plt.title(f'Top {top_n} Words in {title}')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.show()

# Example usage: one chart per class, negative reviews (label 0) first and
# positive reviews (label 1) second.
# Requires `df` and `plot_word_frequency` to be defined in the current session.
for label_id, corpus_title in ((0, 'Negative Reviews'), (1, 'Positive Reviews')):
    plot_word_frequency(df.loc[df['label'] == label_id, 'text'], corpus_title)

