# Movie Data Analysis

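This notebook builds a small four-movie sample dataset, saves it to `data/movies.csv`, and walks through a basic analysis: summary statistics, the average rating, the most common genre, a derived decade column, and a bar chart of ratings.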

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import os

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Create sample dataset
data = {
    'title': ['The Shawshank Redemption', 'The Godfather', 'The Dark Knight', 'Pulp Fiction'],
    'year': [1994, 1972, 2008, 1994],
    'genre': ['Drama', 'Crime', 'Action', 'Crime'],
    'rating': [9.3, 9.2, 9.0, 8.9],
    'votes': [2600000, 1800000, 2500000, 2000000]
}

# Create DataFrame and save to CSV
df = pd.DataFrame(data)
df.to_csv('data/movies.csv', index=False)

# Display the first few rows
print("Dataset preview:")
display(df.head())

In [None]:
# Task 1: Basic information
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called bare; wrapping it in print() would add a stray "None" line.
print("\nDataset info:")
df.info()
# describe() returns a DataFrame of summary statistics, shown as a rich table
print("\nDataset description:")
display(df.describe())

In [None]:
# Task 2: Average rating
# Arithmetic mean of the 'rating' column, reported to two decimal places
average_rating = df.loc[:, 'rating'].mean()
print(f"Average rating: {average_rating:.2f}")

In [None]:
# Task 3: Most common genre
# mode() returns a Series of the most frequent value(s); [0] picks the first entry
most_common_genre = df.loc[:, 'genre'].mode()[0]
print(f"Most common genre: {most_common_genre}")

In [None]:
# Task 4: Add decade column
# Drop the last digit of each year to get the decade's first year (1994 -> 1990)
df['decade'] = df['year'] - df['year'] % 10
print("\nDataset with decade column:")
display(df)

In [None]:
# Task 5: Simple visualization
# Bar chart with one bar per movie; bar height is the movie's rating
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(df['title'], df['rating'], color='skyblue')
ax.set_title('Movie Ratings')
ax.set_xlabel('Movie')
ax.set_ylabel('Rating')
# Rotate the movie titles and right-align them so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Next Steps
1. Try loading a larger dataset from Kaggle
2. Explore more pandas functions
3. Create more complex visualizations
4. Add more analysis (e.g., correlation between votes and ratings)