In [18]:
import pandas as pd

# 1. Load dataset
df = pd.read_csv('imdb_top_1000.csv')

# 2. Select relevant features
selected_features = ['Series_Title', 'Genre', 'Director', 
                    'Overview', 'IMDB_Rating']

movies_df = df[selected_features].copy()

# 3. Handle missing values
# Drop rows with missing values in critical fields
movies_df.dropna(subset=['Series_Title', 'Genre', 'Director', 
                         'Overview', 'IMDB_Rating'], 
                inplace=True)

# 4. Clean text data
def clean_text(text):
    if isinstance(text, str):
        return text.lower().strip()
    return text

# Apply cleaning to text columns
text_columns = ['Series_Title', 'Genre', 'Director','Overview']

for col in text_columns:
    movies_df[col] = movies_df[col].apply(clean_text)

# Select final columns
final_columns = ['Series_Title', 'Genre', 'Director', 
                'Overview', 'IMDB_Rating']

movies_df = movies_df[final_columns]



In [6]:
def movies_you_might_like(title) -> dict:
    if title and len(title) >= 4:
        title_filter =movies_df['Series_Title'].str.contains(title, case=False,na=False)
        matched_movies = movies_df[title_filter]
        if matched_movies.empty:
            return {'error': 'No movies found with that title try to use the movie genre'}
        #getting genres from the first matching movie
        input_movie = matched_movies.iloc[0]
        input_genre = set(input_movie['Genre'].split(', '))
        
        other_movies = movies_df[~title_filter].copy()
        other_movies['genres_overlap']= other_movies['Genre'].apply(
            lambda x:len(set(x.split(', ')) & input_genre))
        recommendations = other_movies[other_movies['genres_overlap'] >0]
        # recommendations = recommendations.sort_values(
        #     by=['genres_overlap', 'IMDB_Rating'],
        #     ascending=[False, False]
        # )
        return{
            'movies you might like': recommendations[['Series_Title', 'Genre',]].to_dict("records")
        }
movies_you_might_like('coco')       

{'movies you might like': [{'Series_Title': 'the lord of the rings: the return of the king',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'inception', 'Genre': 'action, adventure, sci-fi'},
  {'Series_Title': 'the lord of the rings: the fellowship of the ring',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'the lord of the rings: the two towers',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': 'star wars: episode v - the empire strikes back',
   'Genre': 'action, adventure, fantasy'},
  {'Series_Title': 'interstellar', 'Genre': 'adventure, drama, sci-fi'},
  {'Series_Title': 'sen to chihiro no kamikakushi',
   'Genre': 'animation, adventure, family'},
  {'Series_Title': 'star wars', 'Genre': 'action, adventure, fantasy'},
  {'Series_Title': 'shichinin no samurai',
   'Genre': 'action, adventure, drama'},
  {'Series_Title': "it's a wonderful life", 'Genre': 'drama, family, fantasy'},
  {'Series_Title': 'gladiator', 'Genre': 'action, adventure, dr

# Data Preparation Practice Tasks  

## 1. **Data Inspection & Profiling**  
- Load the dataset and display the first 10 rows  
- Check data types of all columns using `.dtypes`  
- Generate summary statistics with `.describe(include='all')`  
- Identify columns with missing values using `.isnull().sum()`  

## 2. **Handling Duplicates**  
- Find duplicate rows using `.duplicated().sum()`  
- Remove duplicates while keeping the first occurrence  
- Verify the dataset size before/after deduplication  

## 3. **Missing Value Treatment**  
- Fill missing directors with "Unknown"  
- Drop rows with missing runtime values  
- Impute missing genres using the mode  
- Fill missing Meta_scores with column median  

## 4. **Outlier Detection & Correction**  
- Identify movies with IMDB_Rating > 10 or < 0  
- Handle invalid runtimes (e.g., "999 min", "10hrs")  
- Use boxplots to detect outliers in `No_of_Votes`  
- Correct negative/invalid Gross values  

## 5. **Format Standardization**  
### Runtime:  
- Extract numeric values (e.g., "100 mins" → 100)  
- Convert to integer and standardize to minutes  

### Year:  
- Remove "(re-release)" and other suffixes  
- Convert to 4-digit integer (e.g., "1998 Original" → 1998)  

### Votes:  
- Remove commas and "K" abbreviations (e.g., "1.5K" → 1500)  
- Convert to integer  

### Meta_score:  
- Remove "/10" suffix and convert to numeric  

### Gross:  
- Remove "$", "USD", and commas  
- Convert to numeric (handle "million" as 1,000,000)  

## 6. **Categorical Data Handling**  
- Normalize genre capitalization (e.g., "drama" → "Drama")  
- Correct typos in genres (e.g., "Dramaa" → "Drama")  
- Standardize certificates (e.g., "PG13" → "PG-13")  
- One-hot encode certificate categories  

## 7. **Data Type Conversion**  
- Convert Runtime, Year, Votes, Meta_score, and Gross to numeric types  
- Ensure Genre and Certificate are categorical types  

## 8. **Complex Pattern Handling**  
- Parse mixed-format runtimes (e.g., "2h30m" → 150)  
- Handle currency conversions (e.g., "€2.5M" → 2,500,000)  

## 9. **Final Validation**  
- Check for remaining missing values  
- Verify no invalid ratings (0 ≤ IMDB_Rating ≤ 10)  
- Ensure all runtime values are reasonable (30 ≤ Runtime ≤ 300)  
- Confirm standardized date formats  
 

data=pd.read_csv("")


In [47]:
#selecting 10 rows
modified_csv = pd.read_csv('modified_imdb_top_1000.csv')
#below here type your code...
specific_rows = df.iloc[[0,1,2,3,4,5,6,7,8,9,]]
specific_rows

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
5,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905
6,https://m.media-amazon.com/images/M/MV5BNGNhMD...,Pulp Fiction,1994,A,154 min,"Crime, Drama",8.9,"The lives of two mob hitmen, a boxer, a gangst...",94.0,Quentin Tarantino,John Travolta,Uma Thurman,Samuel L. Jackson,Bruce Willis,1826188,107928762
7,https://m.media-amazon.com/images/M/MV5BNDE4OT...,Schindler's List,1993,A,195 min,"Biography, Drama, History",8.9,"In German-occupied Poland during World War II,...",94.0,Steven Spielberg,Liam Neeson,Ralph Fiennes,Ben Kingsley,Caroline Goodall,1213505,96898818
8,https://m.media-amazon.com/images/M/MV5BMjAxMz...,Inception,2010,UA,148 min,"Action, Adventure, Sci-Fi",8.8,A thief who steals corporate secrets through t...,74.0,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2067042,292576195
9,https://m.media-amazon.com/images/M/MV5BMmEzNT...,Fight Club,1999,A,139 min,Drama,8.8,An insomniac office worker and a devil-may-car...,66.0,David Fincher,Brad Pitt,Edward Norton,Meat Loaf,Zach Grenier,1854740,37030102


In [34]:
#checking the data type
modified_csv = pd.read_csv('modified_imdb_top_1000.csv')
modified_csv.dtypes

Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score        object
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes       object
Gross             object
dtype: object

In [48]:
modified_csv.describe(include='all')

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
count,1010,1010,1010.0,908,1006,1003,1010.0,1010,845,1000,1010,1010,1010,1010,1010,840
unique,1000,999,303.0,19,142,204,,1000,201,542,660,841,891,939,999,835
top,https://m.media-amazon.com/images/M/MV5BYmJhZm...,Drishyam,2014.0,U,101 min,Drama,,Desperate measures are taken by a man who trie...,800.0%,Alfred Hitchcock,Tom Hanks,Emma Watson,Samuel L. Jackson,Michael Caine,31K,"$739,478"
freq,2,3,23.0,235,25,84,,2,13,14,12,7,5,5,3,2
mean,,,,,,,7.98703,,,,,,,,,
std,,,,,,,0.893041,,,,,,,,,
min,,,,,,,-1.0,,,,,,,,,
25%,,,,,,,7.7,,,,,,,,,
50%,,,,,,,7.9,,,,,,,,,
75%,,,,,,,8.1,,,,,,,,,


In [49]:
#modified_csv.dropna(inplace=True)
modified_csv.isnull().sum()
#modified_csv.size

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      102
Runtime            4
Genre              7
IMDB_Rating        0
Overview           0
Meta_score       165
Director          10
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            170
dtype: int64

In [46]:
modified_csv.duplicated().sum()
modified_csv.drop_duplicates(subset='Series_Title', keep='first')
modified_csv.size
# display(len(modified_csv))

11376

Fill missing directors with "Unknown"
Drop rows with missing runtime values
Impute missing genres using the mode
Fill missing Meta_scores with column median

In [54]:
 modified_csv['Runtime'].isnull().sum()

np.int64(0)

In [51]:
# Fill missing values in the 'directors' column with 'Unknown'
modified_csv['Director']=modified_csv['Director'].fillna("Unknown")

# Save the updated CSV file
modified_csv.to_csv("modified_imdb_top_1000.csv", index=False)
#modified_csv.query("Director=='Unknown'",inplace=True)
#modified_csv

In [57]:
modified_csv['Genre'].isnull().sum()

np.int64(0)

In [53]:
modified_csv=modified_csv.dropna(subset=['Runtime'])
modified_csv.to_csv("modified_imdb_top_1000.csv", index=False)

In [58]:
#Impute missing genres using the mode
mode_genre = modified_csv['Genre'].mode()[0]  # mode() returns a list, so we take the first value

# Fill missing values with the mode
modified_csv['Genre'].fillna(mode_genre, inplace=True)


In [63]:
modified_csv['Meta_score'].isnull().sum()

np.int64(0)

In [62]:

# Step 1: Clean the Meta_score column (removing non-numeric characters)
modified_csv['Meta_score'] = modified_csv['Meta_score'].replace(r'[^0-9.]', '', regex=True)

# Step 2: Convert to numeric type
modified_csv['Meta_score'] = pd.to_numeric(modified_csv['Meta_score'], errors='coerce')

# Step 3: Find the median
median_meta_score = modified_csv['Meta_score'].median()

# Step 4: Fill missing values with the median
modified_csv['Meta_score'] = modified_csv['Meta_score'].fillna(median_meta_score)


4. Outlier Detection & Correction
   1. Identify movies with IMDB_Rating > 10 or < 0
   2. Handle invalid runtimes (e.g., "999 min", "10hrs")
   3. Use boxplots to detect outliers in No_of_Votes
   4. Correct negative/invalid Gross values

In [None]:
modified_csv