# In [None]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Print the full path of every file found under the Kaggle input directory
input_root = '/kaggle/input'
for folder, _, files in os.walk(input_root):
    full_paths = (os.path.join(folder, entry) for entry in files)
    for full_path in full_paths:
        print(full_path)

# Load dataset
file_path = "/kaggle/input/imdb-dataset-of-top-1000-movies-and-tv-shows/imdb_top_1000.csv"
data = pd.read_csv(file_path)

# Clean data
# Non-numeric release years are coerced to NaN and the affected rows are dropped.
data['Released_Year'] = pd.to_numeric(data['Released_Year'], errors='coerce')
data = data.dropna(subset=['Released_Year'])
# Strip the comma separators and cast Gross to float ($).
# regex=False makes the comma a literal match regardless of the pandas version.
data['Gross'] = data['Gross'].str.replace(',', '', regex=False).astype(float)
# Keep the first run of digits in Runtime and cast it to int (min).
# The raw string avoids the invalid "\d" escape warning; expand=False yields a
# Series rather than a one-column DataFrame.
data['Runtime'] = data['Runtime'].str.extract(r'(\d+)', expand=False).astype(int)

# Summarise the cleaned dataset: row count, feature names, numeric statistics
numerical_stats = data.describe()  # computed up front; reused below and by later plots

print(f"\nNumber of observations: {len(data)}")

# One feature name per line
print("\nFeatures:")
print(*data.columns, sep="\n")

print("\nDescriptive statistics on numerical features:")
print(numerical_stats)

# Create a grid of boxplots, one per numerical feature, three per row.
# The grid is sized from the number of numerical columns instead of being fixed
# at 2x3, so a different column count neither raises IndexError nor leaves
# empty axes on screen. With six columns the layout is the same 2x3, 15x10 figure.
plot_columns = list(numerical_stats.columns)
n_grid_cols = 3
# Ceiling division; at least one row so plt.subplots() always gets a valid shape
n_grid_rows = max(1, -(-len(plot_columns) // n_grid_cols))
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(5 * n_grid_cols, 5 * n_grid_rows),
    squeeze=False,  # keep `axes` two-dimensional even for a single row
)

flat_axes = axes.ravel()
for ax, col in zip(flat_axes, plot_columns):
    # Boxplot of the non-missing values of this feature
    ax.boxplot(data[col].dropna().values)
    ax.set_title(str(col))
    ax.set_ylabel(col)
    ax.set_xticks([])

# Hide any grid cells left over when the column count is not a multiple of three
for unused_ax in flat_axes[len(plot_columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Pairwise correlations between the numerical features
numeric_features = data[numerical_stats.columns]
correlation_matrix = numeric_features.corr()

# Render the matrix as an annotated heatmap
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'cbar': True,
    'square': True,
    'fmt': '.2f',
}
plt.figure(figsize=(10, 7))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()