# Movie Data Cleaning

Clean a movie dataset by handling missing values and converting data
types.

Import libraries

In [3]:
import pandas as pd
import numpy as np

# 1. Load Data: Create a sample DataFrame with missing and incorrectly typed values

In [7]:
# Assemble the sample movie table one column at a time.
# 'Release_Year' mixes strings with a single integer, so pandas stores the
# column with the generic object dtype until it is converted later on.
titles = ['Movie A', 'Movie B', 'Movie C', 'Movie D', 'Movie E']
ratings = [8.5, 7.2, 5.6, 9.1, 6.8]
release_years = ['2015', '2018', '2020', '2019', 2022]
genres = ['Action', 'Comedy', 'Drama', 'Action', 'Comedy']

data = dict(
    Title=titles,
    Rating=ratings,
    Release_Year=release_years,
    Genre=genres,
)
movies_df = pd.DataFrame(data)

# Show the schema (column names, non-null counts, dtypes) before any cleaning.
print("Original DataFrame Info:")
movies_df.info()


Original DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         5 non-null      object 
 1   Rating        5 non-null      float64
 2   Release_Year  5 non-null      object 
 3   Genre         5 non-null      object 
dtypes: float64(1), object(3)
memory usage: 292.0+ bytes


# 2. Check for Missing Values

In [8]:
# Tally the null entries in every column; the result is a Series indexed by
# column name.
missing_per_column = movies_df.isnull().sum()

print("\n2:")
print(missing_per_column)


2:
Title           0
Rating          0
Release_Year    0
Genre           0
dtype: int64


# 3. Handle Missing Data

# Handle 'Rating': Replace NaN values with the median rating

In [6]:
# Impute missing ratings with the median of the ratings that are present
# (Series.median skips NaN by default).
median_rating = movies_df['Rating'].median()

# Assign the filled column back to the frame rather than calling
# fillna(..., inplace=True) on the selected column. That chained in-place form
# operates on an intermediate object, raises a FutureWarning on current pandas,
# and will no longer modify movies_df at all from pandas 3.0 onwards.
movies_df['Rating'] = movies_df['Rating'].fillna(median_rating)

print(f"\nMissing 'Rating' values replaced with median: {median_rating}")


Missing 'Rating' values replaced with median: 7.85


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_df['Rating'].fillna(median_rating, inplace=True)


# Handle 'Release_Year': Remove rows where 'Release_Year' is missing

In [9]:
# Discard every row that has no release year. The frame is modified in place,
# so movies_df keeps referring to the same object afterwards.
required_columns = ['Release_Year']
movies_df.dropna(subset=required_columns, inplace=True)

print("\nRows with missing 'Release_Year' removed.")


Rows with missing 'Release_Year' removed.


# 4. Correct Data Type: Convert 'Release_Year' to an integer

In [10]:
# Cast the year column to an integer dtype, then store the converted Series
# back under the same column name.
release_year_as_int = movies_df['Release_Year'].astype(int)
movies_df['Release_Year'] = release_year_as_int

# 5. Output: Print the final DataFrame info to show the changes

In [11]:
# Final report: the schema of the cleaned frame first, then its rows.
final_info_heading = "\nFinal DataFrame Info after Cleaning:"
cleaned_frame_heading = "\nCleaned DataFrame:"

print(final_info_heading)
movies_df.info()  # info() writes its summary to stdout itself

print(cleaned_frame_heading)
print(movies_df)



Final DataFrame Info after Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         5 non-null      object 
 1   Rating        5 non-null      float64
 2   Release_Year  5 non-null      int64  
 3   Genre         5 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 292.0+ bytes

Cleaned DataFrame:
     Title  Rating  Release_Year   Genre
0  Movie A     8.5          2015  Action
1  Movie B     7.2          2018  Comedy
2  Movie C     5.6          2020   Drama
3  Movie D     9.1          2019  Action
4  Movie E     6.8          2022  Comedy
