## <span style="color:pink">Import libraries</span>

In [None]:
import pandas as pd
import numpy as np 
import re

## <span style="color:pink">Reading the datasets</span>
### Parisa

In [None]:
meta_clean = pd.read_excel('/Users/parisa/Desktop/AI/Final_Assignment/metaClean.xlsx')
meta_clean.info()

In [None]:
meta_clean.head(10)

## <span style="color:pink">Cleaning the datasets</span>
### Parisa

### The title column is cleaned by extracting the movie title from the url column. 
### This improves the dataset's readability and ensures the title column is clean for analysis.

In [None]:
# Extract title from url
def extract_title(url):
    # Get the part after the last '/'
    raw_title = url.split('/')[-1]
    # Replace hyphens with spaces
    title = raw_title.replace('-', ' ')
    # Capitalize the first letter of each word and retain special formatting
    return re.sub(r'(\d+:\d+)|\w+', lambda m: m.group().title(), title)

# Apply the improved function to the 'url' column
meta_clean['title'] = meta_clean['url'].apply(extract_title)

# Save the updated dataset
meta_clean.to_excel('/Users/parisa/Desktop/AI/Final_Assignment/metaClean.xlsx', index=False)

print(meta_clean[['url', 'title']].head(10))


### The rating column is cleaned by:

#### 1-Removing extra symbols like | and unnecessary spaces.
#### 2-Replacing missing values with "Not Rated" to ensure consistency.
#### 3-Standardizing the format for easier analysis.

In [17]:
# Remove unwanted symbols and clean formatting
meta_clean['rating'] = meta_clean['rating'].str.replace('|', '', regex=False).str.strip()

# Handle missing values
meta_clean['rating'] = meta_clean['rating'].fillna('Not Rated')

# Verify the cleaned column
print(meta_clean['rating'].value_counts())

# Save the updated dataset
meta_clean.to_excel('/Users/parisa/Desktop/AI/Final_Assignment/metaClean.xlsx', index=False)


rating
Not Rated    4027
R            3515
PG-13        2025
PG            816
Unrated       384
TV-MA         200
NR            128
G             124
TV-14          56
NC-17          40
TV-PG          23
TV-G            7
PG--13          5
Open            5
Approved        4
M               2
MA-17           1
PG-13`          1
M/PG            1
Name: count, dtype: int64


### The runtime column is cleaned by:

#### Filling missing values with the median 

In [18]:
# Replace missing valuse with median
median_runtime = meta_clean['runtime'].median()
meta_clean['runtime'] = meta_clean['runtime'].fillna(median_runtime)

meta_clean.to_excel('/Users/parisa/Desktop/AI/Final_Assignment/metaClean.xlsx', index=False)

print(meta_clean.head())

                                                 url                  title  \
0  https://www.metacritic.com/movie/!women-art-re...  !Women Art Revolution   
1  https://www.metacritic.com/movie/10-cloverfiel...    10 Cloverfield Lane   
2  https://www.metacritic.com/movie/10-items-or-less       10 Items Or Less   
3          https://www.metacritic.com/movie/10-years               10 Years   
4  https://www.metacritic.com/movie/100-bloody-acres       100 Bloody Acres   

                     studio     rating  runtime  \
0       Hotwire Productions  Not Rated     83.0   
1        Paramount Pictures      PG-13    104.0   
2                Click Star          R     82.0   
3  Anchor Bay Entertainment          R    100.0   
4           Music Box Films  Not Rated     91.0   

                                                cast              director  \
0                                                NaN  Lynn Hershman-Leeson   
1  John Gallagher Jr.,John Goodman,Mary Elizabeth...      Dan 