In [None]:
# Install dependencies
!pip install datasets
!pip install pandas
!pip install seaborn
!pip install datasets pandas seaborn nltk
!pip install nltk
!pip install wordcloud




In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud



# Download the NLTK resources used by this notebook: the stopword lists
# (filtered out of the word clouds) and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# Load the IMDB dataset
imdb = load_dataset('imdb')

# The pandas DataFrame and its derived columns ('text_length', 'sentiment')
# are intentionally NOT built here: the cells below create `df` from
# imdb['train'] and add those columns step by step, so building them in this
# cell as well only repeated work whose result was immediately overwritten.





In [None]:
# Materialise the training split of the IMDB dataset as a pandas DataFrame
# so the remaining cells can use the regular DataFrame API on it.
df = pd.DataFrame(data=imdb['train'])


In [None]:
# Display the first few rows of the DataFrame.
# Leaving the frame as the last expression of the cell lets the notebook
# render it as a table instead of the plain-text dump produced by print().
df.head()


In [None]:
# Display basic statistical descriptions of the numeric columns.
# Returned as the cell's last expression so it is rendered as a table
# rather than printed as plain text.
df.describe()


In [None]:
# Tally how many reviews carry each label value (0 = negative, 1 = positive),
# most frequent first, and print the tally.
sentiment_counts = df['label'].value_counts(sort=True, ascending=False)
print(sentiment_counts)


In [None]:
# Store the character count of every review in a new 'text_length' column
df['text_length'] = df['text'].map(len)


In [None]:
# Histogram (50 bins, with a KDE overlay) of the review lengths
plt.figure(figsize=(10, 8))
length_axes = sns.histplot(df['text_length'], bins=50, kde=True)
length_axes.set_title('Distribution of Text Lengths')
length_axes.set_xlabel('Text Length')
length_axes.set_ylabel('Frequency')
plt.show()


In [None]:
# Count the missing (NaN/None) entries per column and report them
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)


In [None]:
# Remove rows that are exact duplicates of an earlier row (the first
# occurrence is kept), then report the resulting shape.
df = df.drop_duplicates(keep='first')
print("Dataframe shape after dropping duplicates:", df.shape)


In [None]:
# Add a human-readable 'sentiment' column derived from the numeric label
# (0 -> 'negative', 1 -> 'positive').
df['sentiment'] = df['label'].map({0: 'negative', 1: 'positive'})

# Verify that the column exists: show the first few rows, including
# 'sentiment'. Left as the last expression so the notebook renders it as a
# table instead of the plain-text output of print().
df.head()



In [None]:
# Box plot comparing the review-length distribution of each sentiment class
plt.figure(figsize=(10, 6))
sentiment_axes = sns.boxplot(data=df, x='sentiment', y='text_length')
sentiment_axes.set_title('Boxplot of Text Lengths by Sentiment')
sentiment_axes.set_xlabel('Sentiment')
sentiment_axes.set_ylabel('Text Length')
plt.show()



In [None]:
# This cell performs an analysis that may take some time to complete.
# You may choose not to run this cell if you prefer.

# Probe for the optional wordcloud package and record the outcome in
# `wordcloud_installed`, which the word cloud code below consults.
try:
    import wordcloud
    wordcloud_installed = True
except ImportError:
    wordcloud_installed = False
    print("The wordcloud library is not installed. The word cloud analysis will not be executed.")

# Function to generate word cloud (optional)
def generate_wordcloud(texts, title):
    """Render a word cloud built from a collection of texts.

    The texts are joined with spaces, lower-cased, split on whitespace and
    stripped of NLTK English stopwords; the remaining words are passed to
    ``WordCloud`` and the resulting image is shown with matplotlib.

    Args:
        texts: Iterable of strings (for example a pandas Series of reviews).
        title: Text appended to the figure title ("Word Cloud of <title>").

    Returns:
        None. Nothing is drawn when the module-level ``wordcloud_installed``
        flag is false; when no words survive the stopword filter a short
        notice is printed instead of plotting.
    """
    if not wordcloud_installed:
        return

    # Build the stopword set once. Previously stopwords.words('english') was
    # evaluated inside the comprehension, i.e. the list was rebuilt and
    # scanned linearly for every single token of the corpus.
    english_stopwords = set(stopwords.words('english'))
    tokens = ' '.join(texts).lower().split()
    kept_words = ' '.join(token for token in tokens if token not in english_stopwords)

    # Nothing left to draw (empty input or stopwords only): skip the plot
    # rather than handing an empty string to WordCloud.
    if not kept_words:
        print(f'No words left to plot for {title}.')
        return

    # Named `cloud` so the local does not shadow the imported `wordcloud` module.
    cloud = WordCloud(width=800, height=400, background_color='white').generate(kept_words)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(f'Word Cloud of {title}')
    plt.axis('off')
    plt.show()

# Optional demo: draw one word cloud per sentiment class, negative reviews
# (label 0) first, then positive reviews (label 1).
if wordcloud_installed:
    for label_value, cloud_title in ((0, 'Negative Reviews'), (1, 'Positive Reviews')):
        generate_wordcloud(df.loc[df['label'] == label_value, 'text'], cloud_title)




