### Imports


In [339]:
import pandas as pd
from scipy.stats import zscore, skew

### Defining important methods

In [340]:
def print_initial_df(df: pd.DataFrame) -> None:
    print("Initial DataFrame:\n")
    print(df.head())  # Print only the first 5 rows


# Function to clean job titles
def clean_job_title(title: str) -> str:
    return title.split('(')[0].split('in office')[0].strip()

def delete_column_df(df: pd.DataFrame, column_name: str) -> pd.DataFrame:

    if column_name in df.columns: 
        return df.drop(columns=[column_name])
    else:
        print(f"⚠️ Warning: Column '{column_name}' not found in DataFrame. Skipping drop.")
        return df

def count_amount_of_rows_in_df(df: pd.DataFrame) -> None:

    num_rows = df.shape[0]
    print(f"The DataFrame has {num_rows} rows.\n")

def count_nulls_in_columns(df: pd.DataFrame) -> None:
    """Prints the number of null values in each column."""
    null_counts = df.isnull().sum()
    print("Number of null values in each column:")
    for column, null_count in null_counts.items():
        print(f"Column: '{column}' has {null_count} null values. \n")

### Initial dataframe

In [341]:
movies_df = pd.read_csv("netflix_list.csv", index_col=0)
print_initial_df(movies_df.head())

Initial DataFrame:

                         title popular_rank certificate  startYear  endYear  \
imdb_id                                                                       
tt4052886              Lucifer            1          15     2016.0      NaN   
tt0993840     Army of the Dead            2          18     2021.0      NaN   
tt7255502  The Kominsky Method            3          18     2018.0   2021.0   
tt0108778              Friends            4         13+     1994.0   2004.0   
tt9251798             Ragnarok            5          18     2020.0      NaN   

           episodes runtime      type  orign_country   language  \
imdb_id                                                           
tt4052886      93.0      42  tvSeries  United States    English   
tt0993840       NaN     148     movie  United States    English   
tt7255502      22.0      30  tvSeries  United States    English   
tt0108778     235.0      22  tvSeries  United States    English   
tt9251798      12.0     

### Make copy of the initial dataset

In [342]:
movies_df_copy = movies_df.copy()
print("Copy of the DataFrame:\n")
print(movies_df_copy.head()) 

Copy of the DataFrame:

                         title popular_rank certificate  startYear  endYear  \
imdb_id                                                                       
tt4052886              Lucifer            1          15     2016.0      NaN   
tt0993840     Army of the Dead            2          18     2021.0      NaN   
tt7255502  The Kominsky Method            3          18     2018.0   2021.0   
tt0108778              Friends            4         13+     1994.0   2004.0   
tt9251798             Ragnarok            5          18     2020.0      NaN   

           episodes runtime      type  orign_country   language  \
imdb_id                                                           
tt4052886      93.0      42  tvSeries  United States    English   
tt0993840       NaN     148     movie  United States    English   
tt7255502      22.0      30  tvSeries  United States    English   
tt0108778     235.0      22  tvSeries  United States    English   
tt9251798      12.0 

### Check percentage of null values in numVotes column
I want to cluster this dataset in series/movies with a high number of votes. Therefore I need to check the percentage of how many null values there are in this column

In [343]:
number_of_null_values_in_num_Votes = movies_df_copy["numVotes"].isnull().sum()
total_rows = len(movies_df_copy)
percentage_null_num_Votes = (number_of_null_values_in_num_Votes / total_rows) * 100

print("The percentage of null values that numVotes column has is " + str(percentage_null_num_Votes) + "%")

The percentage of null values that numVotes column has is 14.383561643835616%


### Check the type of numVotes column
Since only 14% of the values of this column are null I will treat tream them. But first I want to check the type of this column to decide how I am going to replace these values

In [344]:
print("The type of the numVotes column is " + str(movies_df_copy["numVotes"].dtype))

The type of the numVotes column is float64


### Convert column type to integer and delete float values

In [345]:
# Filter the 'numVotes' column to exclude NaN values and identify values with decimals
num_votes_with_decimals = movies_df_copy[movies_df_copy["numVotes"].notna() & (movies_df_copy["numVotes"] % 1 != 0)]

# Count the number of non-null decimal values in 'numVotes'
decimal_count = num_votes_with_decimals.shape[0]

# Count the number of zero values (excluding NaNs) in 'numVotes'. Important to check so later I do not confuse them with Nan values when I convert it to an integer column
zero_count = len(movies_df_copy[movies_df_copy["numVotes"].notna() & (movies_df_copy["numVotes"] == 0)])

print("The amount of decimal values in 'numVotes' is: " + str(decimal_count))
print("The amount of zero values in 'numVotes' before filtering is: " + str(zero_count))

The amount of decimal values in 'numVotes' is: 0
The amount of zero values in 'numVotes' before filtering is: 0


### Change column type of numVotes to integer
Since there are no decimal values in this column, I am going to change the type of the column to integer

In [346]:
# Replace NaN values with 0 and convert the column to integers
movies_df_copy["numVotes"] = movies_df_copy["numVotes"].fillna(0).astype(int)

print(movies_df_copy["numVotes"].head())


imdb_id
tt4052886    250884
tt0993840    110780
tt7255502     28795
tt0108778    861843
tt9251798     26606
Name: numVotes, dtype: int64


### Checking skewness in the numVotes column
Before taking a decision on how I will replace the null values, I want to understand the skewness of the column

In [347]:
# Filter out 0 values before calculating skewness since they are missing values and they do not bring any insight to the skewness
non_zero_votes = movies_df_copy[movies_df_copy['numVotes'] > 0]['numVotes']

# Calculate skewness only for non-zero values
skewness_value = skew(non_zero_votes)
print("Skewness (Ignoring Nan):", skewness_value)

Skewness (Ignoring Nan): 10.037454384886306


### Understanding the skewness result in numVotes column
The numVotes column is highly right-skewed (10.04), meaning most shows have very few votes, while a few popular ones have extremely high votes. Therefore, I decide to fill up the null values of this column using the Random Forest algorithm because it can learn patterns from other features in the dataset.

To use the mean to fill up these null values would not be intelligent because it is sensitive to outliers(This column is highly skewed). The imputed values would probably be biased toward the central tendency, which does not reflect the actual patterns of the data. 

To use the median imputation would make more since it is less sensitive to outliers and it is a better choice when data is skewed. However, it still ignores the relationship between numVotes and other features of the dataset

First thing i need to check is the correlation between every feature and the correlation of every feature with the numVotes column(the target)


In [348]:
# Calculate correlation with the target 'numVotes'
corr_with_target = movies_df_copy['numVotes'].sort_values(ascending=False)

# Display top 5 most correlated features with 'numVotes'
print("Most correlated features with numVotes:")
print(corr_with_target.head())

Most correlated features with numVotes:
imdb_id
tt0120737     1697849
tt0167260     1677017
tt0903747     1523446
tt0167261     1516370
tt0407887     1216541
               ...   
tt3336458           0
tt3083016           0
tt11834150          0
tt12304420          0
tt9245008           0
Name: numVotes, Length: 7008, dtype: int64
