# Day4_Pandas_Exploration.ipynb
# Content: Pandas Data Exploration & Handling Missing Data

---

## Step 1: Import Libraries

In [1]:
import pandas as pd
import numpy as np

## Step 2: Load IMDB Dataset

In [2]:
# Read the IMDB top-1000 CSV into a DataFrame. The path is absolute, so the
# file must exist at exactly this location for the cell to run.
imdb_df = pd.read_csv(filepath_or_buffer="/content/imdb_top_1000.csv")

## Step 3: Inspect the Dataset

In [3]:
# Quick structural overview of the freshly loaded frame: both ends of the
# table, its dimensions, the column index, dtypes/null counts, and the
# summary statistics produced by describe().
print("First 5 rows:\n", imdb_df.head())
print("\nLast 5 rows:\n", imdb_df.tail())
print("\nShape of dataset:", imdb_df.shape)
print("\nColumn names:", imdb_df.columns)
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
imdb_df.info()
print("\nSummary statistics:\n", imdb_df.describe())

First 5 rows:
                                          Poster_Link  \
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   

               Series_Title Released_Year Certificate  Runtime  \
0  The Shawshank Redemption          1994           A  142 min   
1             The Godfather          1972           A  175 min   
2           The Dark Knight          2008          UA  152 min   
3    The Godfather: Part II          1974           A  202 min   
4              12 Angry Men          1957           U   96 min   

                  Genre  IMDB_Rating  \
0                 Drama          9.3   
1          Crime, Drama          9.2   
2  Action, Crime, Drama          9.0   
3          Crime, Drama          9.0   
4          Crime, Drama          9.0   

          

## Step 4: Handling Missing Data

In [6]:
# Tally the NaN entries column by column (isna is the same check as isnull).
missing_values = imdb_df.isna().sum()
print("\nMissing values per column:\n", missing_values)

# Strategy A: discard every row that has at least one missing field.
imdb_cleaned_drop = imdb_df.dropna(axis=0, how="any")
print("\nAfter dropping missing rows, shape:", imdb_cleaned_drop.shape)

# Strategy B: keep all rows and substitute a placeholder instead
# (demonstrated on 'IMDB_Rating' with 0). assign() returns a new frame,
# so imdb_df itself is left untouched.
imdb_filled = imdb_df.assign(IMDB_Rating=imdb_df["IMDB_Rating"].fillna(0))
print("\nAfter filling missing ratings with 0:\n", imdb_filled["IMDB_Rating"].head())


Missing values per column:
 Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

After dropping missing rows, shape: (714, 16)

After filling missing ratings with 0:
 0    9.3
1    9.2
2    9.0
3    9.0
4    9.0
Name: IMDB_Rating, dtype: float64


## Step 5: Selecting and Filtering Data

In [9]:
# Pull out a single column as a Series.
ratings = imdb_df.loc[:, "IMDB_Rating"]
print("\nRatings column:\n", ratings.head())

# Pull out several columns at once as a narrower DataFrame.
subset = imdb_df.loc[:, ["Series_Title", "Genre", "IMDB_Rating"]]
print("\nSubset of columns:\n", subset.head())

# Boolean-mask filter: keep rows whose rating is strictly above 8.
high_rated = imdb_df.loc[imdb_df["IMDB_Rating"].gt(8)]
print("\nMovies with rating > 8:\n", high_rated.head())

# Year filter: 'Released_Year' is converted with errors='coerce', which turns
# any value that cannot be parsed as a number into NaN; rows must have a
# parsed year that is strictly greater than 2015.
numeric_years = pd.to_numeric(imdb_df["Released_Year"], errors="coerce")
recent_movies = imdb_df.loc[numeric_years.notna() & numeric_years.gt(2015)]
print("\nMovies released after 2015:\n", recent_movies.head())


Ratings column:
 0    9.3
1    9.2
2    9.0
3    9.0
4    9.0
Name: IMDB_Rating, dtype: float64

Subset of columns:
                Series_Title                 Genre  IMDB_Rating
0  The Shawshank Redemption                 Drama          9.3
1             The Godfather          Crime, Drama          9.2
2           The Dark Knight  Action, Crime, Drama          9.0
3    The Godfather: Part II          Crime, Drama          9.0
4              12 Angry Men          Crime, Drama          9.0

Movies with rating > 8:
                                          Poster_Link  \
0  https://m.media-amazon.com/images/M/MV5BMDFkYT...   
1  https://m.media-amazon.com/images/M/MV5BM2MyNj...   
2  https://m.media-amazon.com/images/M/MV5BMTMxNT...   
3  https://m.media-amazon.com/images/M/MV5BMWMwMG...   
4  https://m.media-amazon.com/images/M/MV5BMWU4N2...   

               Series_Title Released_Year Certificate  Runtime  \
0  The Shawshank Redemption          1994           A  142 min   
1        

## Step 6: Grouping and Aggregation

In [12]:
# Mean rating for each distinct 'Genre' value, best first.
# Note: grouping is on the full 'Genre' string, so a multi-genre entry such
# as "Action, Crime, Drama" forms its own group rather than contributing to
# each individual genre.
avg_rating_genre = (
    imdb_df["IMDB_Rating"]
    .groupby(imdb_df["Genre"])
    .mean()
    .sort_values(ascending=False)
)
print("\nAverage rating per genre:\n", avg_rating_genre)

# How many movies carry each distinct 'Genre' value (most frequent first).
count_per_genre = imdb_df.loc[:, "Genre"].value_counts()
print("\nNumber of movies per genre:\n", count_per_genre)


Average rating per genre:
 Genre
Animation, Drama, War         8.50
Action, Sci-Fi                8.40
Drama, Musical                8.40
Drama, Mystery, War           8.35
Western                       8.35
                              ... 
Action, Adventure, Mystery    7.60
Action, Adventure, Family     7.60
Action, Adventure, Crime      7.60
Animation, Drama, Romance     7.60
Drama, War, Western           7.60
Name: IMDB_Rating, Length: 202, dtype: float64

Number of movies per genre:
 Genre
Drama                        85
Drama, Romance               37
Comedy, Drama                35
Comedy, Drama, Romance       31
Action, Crime, Drama         30
                             ..
Action, Adventure, Family     1
Action, Crime, Mystery        1
Animation, Drama, Romance     1
Drama, War, Western           1
Adventure, Comedy, War        1
Name: count, Length: 202, dtype: int64


## Step 7: Practice / Exercises

In [13]:
# 1. Find top 10 highest rated movies: sort by rating descending and keep
#    the first ten rows.
top_10 = imdb_df.sort_values('IMDB_Rating', ascending=False).head(10)
print("\nTop 10 highest rated movies:\n", top_10[['Series_Title', 'IMDB_Rating', 'Genre']])

# 2. Replace any missing 'Genre' with 'Unknown'.
#    The filled column is assigned back explicitly instead of calling
#    fillna(..., inplace=True) on the column selection: that chained in-place
#    call raises a FutureWarning on pandas 2.x and does not update imdb_df
#    when Copy-on-Write is enabled. The assignment form is also safe to
#    re-run.
imdb_df['Genre'] = imdb_df['Genre'].fillna('Unknown')
print("\nMovies with filled missing Genre:\n", imdb_df['Genre'].head())

# 3. Filter movies with Rating > 7 and Year after 2010.
#    'Released_Year' is converted with errors='coerce', so values that cannot
#    be parsed become NaN; NaN > 2010 evaluates to False, which drops them.
numeric_years = pd.to_numeric(imdb_df['Released_Year'], errors='coerce')
filtered_movies = imdb_df[(imdb_df['IMDB_Rating'] > 7) & (numeric_years > 2010)]
print("\nMovies with Rating > 7 and Year > 2010:\n", filtered_movies.head())


Top 10 highest rated movies:
                                      Series_Title  IMDB_Rating  \
0                        The Shawshank Redemption          9.3   
1                                   The Godfather          9.2   
4                                    12 Angry Men          9.0   
2                                 The Dark Knight          9.0   
3                          The Godfather: Part II          9.0   
5   The Lord of the Rings: The Return of the King          8.9   
7                                Schindler's List          8.9   
6                                    Pulp Fiction          8.9   
8                                       Inception          8.8   
12                Il buono, il brutto, il cattivo          8.8   

                        Genre  
0                       Drama  
1                Crime, Drama  
4                Crime, Drama  
2        Action, Crime, Drama  
3                Crime, Drama  
5    Action, Adventure, Drama  
7   Biography, Dra