In [None]:
from datasets import load_dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the IMDB dataset; the result is indexed by split name below.
imdb = load_dataset('imdb')

# Convert the training split to a pandas DataFrame for easier manipulation.
# Dataset.to_pandas() is the library's own conversion, so pandas does not have
# to assemble the frame by iterating over the split one example at a time.
df = imdb['train'].to_pandas()

# Display the first few rows of the DataFrame
print(df.head())


In [None]:
# Summary statistics for the DataFrame's numeric columns
print(df.describe())

# How many reviews carry each label value (positive/negative)
sentiment_counts = df['label'].value_counts()
print(sentiment_counts)

# Bar chart of the label counts, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, ax=ax)
ax.set_title('Distribution of Sentiments')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Reviews')
plt.show()


In [None]:
# Add a column holding the character count of each review.
# The vectorized .str.len() accessor yields NaN for a missing review instead of
# raising TypeError the way apply(len) does, so this cell does not rely on the
# text column being free of nulls.
df['text_length'] = df['text'].str.len()

# Plot the distribution of text lengths: 50-bin histogram with a KDE overlay
plt.figure(figsize=(10, 8))
sns.histplot(df['text_length'], bins=50, kde=True)
plt.title('Distribution of Text Lengths')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Tally the null entries found in each column
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)

# Remove rows that exactly repeat an earlier row, keeping the first occurrence
df = df.drop_duplicates(keep='first')
print("Dataframe shape after dropping duplicates:", df.shape)
