In [None]:
from datasets import load_dataset

# Fetch the IMDB dataset and pull out its two splits
dataset = load_dataset("imdb")
train_data, test_data = dataset['train'], dataset['test']

# Display the row counts of the training and test splits as a tuple
(len(train_data), len(test_data))

In [None]:
import sqlite3

# Connect to the SQLite database file (created on first use) and get a cursor
conn = sqlite3.connect('imdb_reviews.db')
cursor = conn.cursor()

# Create the table.
# IF NOT EXISTS keeps this cell re-runnable: the database lives in a file on
# disk, so after the first run the table is still there and a bare
# CREATE TABLE would raise "table imdb_reviews already exists".
cursor.execute('''
    CREATE TABLE IF NOT EXISTS imdb_reviews (
        id INTEGER PRIMARY KEY,
        review_text TEXT,
        sentiment TEXT
    )
''')
conn.commit()

In [None]:
# Prepare data for insertion: one (id, review_text, sentiment) tuple per
# training review, using the review's position in train_data as its id
reviews = [(i, review['text'], review['label']) for i, review in enumerate(train_data)]

# Insert data into the table.
# INSERT OR REPLACE keeps this cell re-runnable: the ids are explicit primary
# keys, so a plain INSERT fails with a UNIQUE-constraint IntegrityError the
# second time the cell is executed against the same database.
# `with conn` commits when the batch succeeds and rolls back if it raises.
with conn:
    cursor.executemany(
        'INSERT OR REPLACE INTO imdb_reviews (id, review_text, sentiment) VALUES (?, ?, ?)',
        reviews,
    )

In [None]:
import pandas as pd
import re

# Read every row of the imdb_reviews table back into a DataFrame for cleanup
select_all_reviews = "SELECT * FROM imdb_reviews"
df = pd.read_sql_query(select_all_reviews, conn)

# Clean the review text
def clean_text(text):
    """Normalize a raw review string.

    Lowercases the text, strips HTML tags, removes punctuation and collapses
    runs of whitespace into single spaces.

    Args:
        text: The raw review text (str).

    Returns:
        The cleaned text (str), with no leading or trailing whitespace.
    """
    text = text.lower()  # Lowercase
    # Replace tags with a space instead of deleting them: a tag such as
    # "<br />" sitting directly between two sentences would otherwise glue
    # the neighbouring words together ("movie.<br />loved" -> "movieloved").
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # Collapse the extra whitespace left behind by the removed tags
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner over each raw review and keep the result in a new column
df['cleaned_review_text'] = df['review_text'].map(clean_text)

In [None]:
# Length of each cleaned review, in characters
df['review_length'] = df['cleaned_review_text'].map(len)

# How many reviews carry each sentiment value
sentiment_counts = df['sentiment'].value_counts()

# Mean cleaned-review length for each sentiment value
avg_length_by_sentiment = df['review_length'].groupby(df['sentiment']).mean()

# Display both summaries together
(sentiment_counts, avg_length_by_sentiment)