# Movie Data Cleaning

 # Clean a movie dataset that contains duplicate entries and inconsistent values for a genre column

# Step 1: Import Libraries

In [1]:
import pandas as pd


# Step 2: Create or Load the Dataset

In [2]:
# Sample movie dataset with duplicates and inconsistent genres
data = {
    'Movie_ID': [1, 2, 3, 4, 5, 2, 6, 7, 8, 9],
    'Title': ['kaththi', 'Asuran', 'Arasan', 'pizza', 'psycho', 'Mersal', 
              'Eleven', 'Bigil', 'sullan', 'Kodi'],
    'Genre': ['Action', 'Romance', 'action', 'Sci-Fi', 'Drama', 'ROMANCE', 
              'Animation', 'animation', 'Action', 'ACTION']
}

movies = pd.DataFrame(data)


# Step 3: Inspect the Data

In [3]:
print("Initial Dataset:")
print(movies)

print("\nDataset Info:")
print(movies.info())

print("\nCheck for Duplicates:")
print(movies.duplicated().sum())


Initial Dataset:
   Movie_ID    Title      Genre
0         1  kaththi     Action
1         2   Asuran    Romance
2         3   Arasan     action
3         4    pizza     Sci-Fi
4         5   psycho      Drama
5         2   Mersal    ROMANCE
6         6   Eleven  Animation
7         7    Bigil  animation
8         8   sullan     Action
9         9     Kodi     ACTION

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Movie_ID  10 non-null     int64 
 1   Title     10 non-null     object
 2   Genre     10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes
None

Check for Duplicates:
0


# Step 4: Remove Duplicate Entries

In [4]:
# Remove complete duplicates
movies = movies.drop_duplicates()

# If you only want to keep unique titles
movies = movies.drop_duplicates(subset='Title', keep='first')

print("\nAfter Removing Duplicates:")
print(movies)



After Removing Duplicates:
   Movie_ID    Title      Genre
0         1  kaththi     Action
1         2   Asuran    Romance
2         3   Arasan     action
3         4    pizza     Sci-Fi
4         5   psycho      Drama
5         2   Mersal    ROMANCE
6         6   Eleven  Animation
7         7    Bigil  animation
8         8   sullan     Action
9         9     Kodi     ACTION


# Step 5: Handle Inconsistent Genre Values

In [5]:
# Convert all genres to lowercase
movies['Genre'] = movies['Genre'].str.lower()

# Optionally, title-case them for neatness
movies['Genre'] = movies['Genre'].str.title()

print("\nAfter Standardizing Genre Names:")
print(movies['Genre'].unique())



After Standardizing Genre Names:
['Action' 'Romance' 'Sci-Fi' 'Drama' 'Animation']


# Step 6: Verify Cleaned Data

In [6]:
print("\nCleaned Dataset:")
print(movies)

print("\nUnique Genres After Cleaning:")
print(movies['Genre'].unique())



Cleaned Dataset:
   Movie_ID    Title      Genre
0         1  kaththi     Action
1         2   Asuran    Romance
2         3   Arasan     Action
3         4    pizza     Sci-Fi
4         5   psycho      Drama
5         2   Mersal    Romance
6         6   Eleven  Animation
7         7    Bigil  Animation
8         8   sullan     Action
9         9     Kodi     Action

Unique Genres After Cleaning:
['Action' 'Romance' 'Sci-Fi' 'Drama' 'Animation']
