In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Load the movie dataset from a CSV file in the working directory and preview the first five rows.
df = pd.read_csv('final_movies.csv')
df.head()

Unnamed: 0,title,released_year,rated_class,run_time,stars,total_ratings,genre,summary
0,Madame Web,2024.0,PG,1h 56m,3.8,28K,Action|Adventure|Sci-Fi,Cassandra Webb is a New York metropolis parame...
1,Borderlands,2024.0,,1h 42m,,,Action|Adventure|Comedy,The popular video game set on the abandoned fi...
2,Oppenheimer,2023.0,14A,3h,8.4,654K,Biography|Drama|History,The story of American scientist J. Robert Oppe...
3,Dune: Part Two,2024.0,PG,2h 46m,9.1,28K,Action|Adventure|Drama,Paul Atreides unites with Chani and the Fremen...
4,Poor Things,2023.0,18A,2h 21m,8.3,115K,Comedy|Drama|Romance,The incredible tale about the fantastical evol...


In [3]:
# Dataset dimensions as (number of rows, number of columns)
df.shape

(3650, 8)

In [4]:
# Per-column dtype and non-null count, used to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3650 entries, 0 to 3649
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          3650 non-null   object 
 1   released_year  3580 non-null   float64
 2   rated_class    3415 non-null   object 
 3   run_time       3516 non-null   object 
 4   stars          3486 non-null   float64
 5   total_ratings  3486 non-null   object 
 6   genre          3616 non-null   object 
 7   summary        3613 non-null   object 
dtypes: float64(2), object(6)
memory usage: 228.3+ KB


We can see that every column except `title` has some missing values.

In [5]:
# Percentage of missing values in each column
(df.isnull().mean())* 100

title            0.000000
released_year    1.917808
rated_class      6.438356
run_time         3.671233
stars            4.493151
total_ratings    4.493151
genre            0.931507
summary          1.013699
dtype: float64

## Data cleaning and preprocessing

In [10]:
class Utils:
    """
    Parsing helpers shared by the data-cleaning code.
    """

    # Leading run of digits (may match the empty string). Kept so that any
    # code referring to ``Utils.number_pattern`` continues to work.
    number_pattern = re.compile(r'[0-9]*')

    # One "<digits><optional unit letters>" component of a run time,
    # e.g. "1h", "56m", or a bare "30".
    _component_pattern = re.compile(r'([0-9]+)\s*([A-Za-z]*)')

    @staticmethod
    def convert_to_minutes(run_time):
        """
        Convert a run-time string such as ``'1h 56m'`` into total minutes.

        Each numeric component is interpreted by its unit: a unit starting
        with ``h`` counts as hours, one starting with ``m`` counts as minutes.
        A component without any unit falls back to its position (first
        component is hours, later ones are minutes).

        Parameters
        ----------
        run_time : str
            Run time text, e.g. ``'3h'``, ``'45m'``, ``'2h 46m'``.

        Returns
        -------
        int
            Total duration in minutes.

        Raises
        ------
        ValueError
            If no numeric component is found, or a component carries a unit
            other than hours or minutes.
        """
        components = Utils._component_pattern.findall(run_time)
        if not components:
            raise ValueError(f"no duration found in run time: {run_time!r}")

        minutes = 0
        for position, (amount, unit) in enumerate(components):
            kind = unit[:1].lower()
            if not kind:
                # No unit given: interpret by position (hours first, then minutes).
                kind = 'h' if position == 0 else 'm'

            if kind == 'h':
                minutes += int(amount) * 60
            elif kind == 'm':
                # A minutes-only value such as '45m' must not be scaled as hours.
                minutes += int(amount)
            else:
                raise ValueError(
                    f"unsupported unit {unit!r} in run time: {run_time!r}"
                )
        return minutes


class DataCleaningHelper:
    """
    Per-value converters for cleaning raw columns of the movie dataset.

    Each method takes a single cell value and returns its cleaned form, so it
    can be passed directly to ``Series.apply``.
    """

    # Multiplier applied for each magnitude suffix in a rating count.
    _SUFFIX_MULTIPLIERS = {'K': 1000, 'M': 1000000}

    # (exclusive upper bound in minutes, label), checked in ascending order.
    _RUN_TIME_BUCKETS = (
        (90, 'very short'),
        (120, 'short'),
        (150, 'medium'),
        (180, 'long'),
    )

    @staticmethod
    def convert_total_rating(item):
        """
        Convert an abbreviated rating count such as ``'28K'`` or ``'1.2M'`` to an int.

        The numeric part is scaled by the suffix multiplier, so any number of
        decimal digits is handled correctly (``'1.25M'`` -> 1250000).

        Parameters
        ----------
        item : object
            Cell value; expected to be a string, or NaN for missing data.

        Returns
        -------
        int or float
            The count as an integer, or ``np.nan`` when ``item`` is not a string.

        Raises
        ------
        ValueError
            If the string's numeric part cannot be parsed.
        """
        if not isinstance(item, str):
            # Missing values (NaN/None) and any other non-text input.
            return np.nan

        text = item.strip()
        multiplier = DataCleaningHelper._SUFFIX_MULTIPLIERS.get(text[-1:].upper(), 1)
        if multiplier != 1:
            text = text[:-1]
        # round() absorbs binary floating-point noise from the multiplication.
        return int(round(float(text) * multiplier))

    @staticmethod
    def categorize_run_time(item):
        """
        Map a run-time string to a length category.

        The string is converted to minutes with ``Utils.convert_to_minutes`` and
        bucketed as: under 90 'very short', under 120 'short', under 150
        'medium', under 180 'long', otherwise 'extended'.

        Parameters
        ----------
        item : object
            Cell value; expected to be a run-time string, or NaN for missing data.

        Returns
        -------
        str or float
            The category label, or ``np.nan`` when ``item`` is not a string.
        """
        if not isinstance(item, str):
            return np.nan

        minutes = Utils.convert_to_minutes(item)
        for upper_bound, label in DataCleaningHelper._RUN_TIME_BUCKETS:
            if minutes < upper_bound:
                return label
        return 'extended'
    

In [13]:
# df['total_ratings'] = df['total_ratings'].apply(DataCleaningHelper.convert_total_rating)
# df['run_time'] = df['run_time'].apply(DataCleaningHelper.categorize_run_time)

In [None]:
# df.head()

In [14]:
# Number of rows per rating label; value_counts() skips missing values by default
df['rated_class'].value_counts()

rated_class
PG           1049
14A          1024
18A           339
R             303
G             236
13+            67
16+            65
18+            56
PG-13          54
TV-MA          52
Not Rated      38
PA             23
AA             19
TV-14          16
14+            14
14             14
A               9
Unrated         8
TV-PG           7
(Banned)        7
18              4
Approved        4
TV-G            3
TV-Y7           1
RX              1
Passed          1
7+              1
Name: count, dtype: int64