In [96]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
movies = pd.read_csv(r'E:\NED Office\Courses\Fall 2024\PITP - Data Science\Datasets\imdb-top-1000.csv')

In [3]:
movies.shape

(1000, 10)

In [4]:
movies.describe(include='all')

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
count,1000,1000.0,1000.0,1000,1000.0,1000,1000,1000.0,1000.0,843.0
unique,999,100.0,,14,,548,660,,,
top,Drishyam,2014.0,,Drama,,Alfred Hitchcock,Tom Hanks,,,
freq,2,32.0,,289,,14,12,,,
mean,,,122.891,,7.9493,,,273692.9,128278100.0,77.97153
std,,,28.093671,,0.275491,,,327372.7,201111000.0,12.376099
min,,,45.0,,7.6,,,25088.0,1305.0,28.0
25%,,,103.0,,7.7,,,55526.25,5012919.0,70.0
50%,,,119.0,,7.9,,,138548.5,39534320.0,79.0
75%,,,137.0,,8.1,,,374161.2,161599700.0,87.0


# Data preprocessing 

It is a crucial step in the machine learning pipeline, as the quality of the data greatly impacts the performance of models. Preprocessing involves cleaning, transforming, and encoding data to ensure it is suitable for analysis or machine learning.

The key steps in data preprocessing include:
- Handling missing data
- Handling duplicated data
- Feature scaling
- Encoding categorical variables
- Handling outliers

#### Handling Missing Data

Real-world datasets often contain missing values, which can impact model performance. There are several techniques to handle missing data:
- **Dropping Missing Values**: Remove rows or columns with missing data.
- **Imputing Missing Values**: Replace missing values using statistical measures such as mean, median, or mode.

In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series_Title   1000 non-null   object 
 1   Released_Year  1000 non-null   object 
 2   Runtime        1000 non-null   int64  
 3   Genre          1000 non-null   object 
 4   IMDB_Rating    1000 non-null   float64
 5   Director       1000 non-null   object 
 6   Star1          1000 non-null   object 
 7   No_of_Votes    1000 non-null   int64  
 8   Gross          1000 non-null   float64
 9   Metascore      843 non-null    float64
dtypes: float64(3), int64(2), object(5)
memory usage: 78.3+ KB


- .isna() and .isnull(): Both are used to detect missing values (NaNs) in the data and return True for missing values.
- .notna() and .notnull(): Both are used to detect non-missing values and return True for values that exist.

In [6]:
movies.isna()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False,False,False


In [7]:
movies.notna()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,True,True,True,True,True,True
996,True,True,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True,True,True


In [8]:
movies.isnull()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False,False,False


In [9]:
movies.notnull()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,True,True,True,True,True,True
996,True,True,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True,True,True


In [11]:
#Now use it with .sum()

In [13]:
movies.notnull().sum()

Series_Title     1000
Released_Year    1000
Runtime          1000
Genre            1000
IMDB_Rating      1000
Director         1000
Star1            1000
No_of_Votes      1000
Gross            1000
Metascore         843
dtype: int64

In [14]:
movies.dropna()


Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0
...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,Comedy,7.6,Blake Edwards,Audrey Hepburn,166544,679874270.0,76.0
996,Giant,1956,201,Drama,7.6,George Stevens,Elizabeth Taylor,34075,195217415.0,84.0
997,From Here to Eternity,1953,118,Drama,7.6,Fred Zinnemann,Burt Lancaster,43374,30500000.0,85.0
998,Lifeboat,1944,97,Drama,7.6,Alfred Hitchcock,Tallulah Bankhead,26471,852142728.0,78.0


In [15]:
#movies.dropna(inplace=True)
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Series_Title   1000 non-null   object 
 1   Released_Year  1000 non-null   object 
 2   Runtime        1000 non-null   int64  
 3   Genre          1000 non-null   object 
 4   IMDB_Rating    1000 non-null   float64
 5   Director       1000 non-null   object 
 6   Star1          1000 non-null   object 
 7   No_of_Votes    1000 non-null   int64  
 8   Gross          1000 non-null   float64
 9   Metascore      843 non-null    float64
dtypes: float64(3), int64(2), object(5)
memory usage: 78.3+ KB


In [16]:
movies['Metascore']

0       80.0
1      100.0
2       84.0
3       90.0
4       96.0
       ...  
995     76.0
996     84.0
997     85.0
998     78.0
999     93.0
Name: Metascore, Length: 1000, dtype: float64

In [28]:
# Fill the missing values with 0
movies['Metascore'].fillna(0)

0       80.0
1      100.0
2       84.0
3       90.0
4       96.0
       ...  
995     76.0
996     84.0
997     85.0
998     78.0
999     93.0
Name: Metascore, Length: 1000, dtype: float64

In [46]:
# Fill the missing values with the mean of the column
print(movies['Metascore'].mean())
movies['Metascore'].fillna(movies['Metascore'].mean()).count()



77.97153024911032


1000

In [47]:
movies['Metascore'].count()

# need of inplace=True

843

In [48]:
# Fill the missing values with the median of the column
print(movies['Metascore'].median())
movies['Metascore'].fillna(movies['Metascore'].median()).count()

79.0


1000

In [59]:
# Fill the missing values with the mode of the column
print(movies['Metascore'].mode()[0])
movies['Metascore'].fillna(movies['Metascore'].mode()[0]).count()

type(movies['Metascore'])

76.0


pandas.core.series.Series

In [69]:
# Create a pandas series

a = pd.Series([1, 1, 3, 3, 5])
a.mode()

0    1
1    3
dtype: int64

### Handling Duplicated Data

Duplicate data can occur in a dataset due to data collection errors, merging datasets, or other reasons. Duplicates can negatively impact the analysis and machine learning models, as they can introduce bias and distort insights.

The **`.duplicated()`** function can be used with the parameter `keep` to control which duplicates are marked as `True`:
- `keep='first'` (default): Marks all duplicates except the first occurrence as `True`.
- `keep='last'`: Marks all duplicates except the last occurrence as `True`.
- `keep=False`: Marks all occurrences of the duplicate rows as `True`.

In [73]:
movies.duplicated()

# which rows are duplicated
movies[movies.duplicated()]

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore


In [72]:
movies.duplicated().sum()

0

In [84]:
data = {
    'ID': [1, 2, 3, 4, 5, 3],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward', 'Charlie'],
    'Age': [25, 30, 35, 40, 45, 35],
    'Salary': [50000, 60000, 70000, 80000, 90000, 70000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25,50000
1,2,Bob,30,60000
2,3,Charlie,35,70000
3,4,David,40,80000
4,5,Edward,45,90000
5,3,Charlie,35,70000


In [78]:
# Detecting duplicate rows (marking all occurrences as True)
print("\nDetecting Duplicates:")
duplicates = df.duplicated(keep=False)
print(duplicates)


Detecting Duplicates:
0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool


In [80]:
# Displaying duplicate rows
print("\nDuplicate Rows:")
print(df[df.duplicated(keep=False)])


Duplicate Rows:
   ID     Name  Age  Salary
2   3  Charlie   35   70000
5   3  Charlie   35   70000


In [81]:
movies.drop_duplicates()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0
...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,Comedy,7.6,Blake Edwards,Audrey Hepburn,166544,679874270.0,76.0
996,Giant,1956,201,Drama,7.6,George Stevens,Elizabeth Taylor,34075,195217415.0,84.0
997,From Here to Eternity,1953,118,Drama,7.6,Fred Zinnemann,Burt Lancaster,43374,30500000.0,85.0
998,Lifeboat,1944,97,Drama,7.6,Alfred Hitchcock,Tallulah Bankhead,26471,852142728.0,78.0


In [83]:
df_no_duplicates = df.drop_duplicates(keep='first') # Default
df_no_duplicates

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25,50000
1,2,Bob,30,60000
2,3,Charlie,35,70000
3,4,David,40,80000
4,5,Edward,45,90000


In [86]:
# Removing all occurrences of duplicates
df_no_duplicates_all = df.drop_duplicates(keep=False)
print("\nDataset After Removing All Occurrences of Duplicates:")
df_no_duplicates_all


Dataset After Removing All Occurrences of Duplicates:


Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25,50000
1,2,Bob,30,60000
3,4,David,40,80000
4,5,Edward,45,90000


After dropping duplicates, the index of the DataFrame is not reset, which means the original index values remain. To reset the index, you can use `.reset_index(drop=True)`. Whether you should resolve it depends on your analysis needs. If maintaining a sequential index is important, resetting the index is recommended.

In [89]:
df_no_duplicates_all.reset_index(drop=True)

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25,50000
1,2,Bob,30,60000
2,4,David,40,80000
3,5,Edward,45,90000


reset_index(drop=True):
- drop=True discards the old index.
- Without drop=True, the old index is kept as a new column.
- Using drop=True provides a clean DataFrame with a new sequential index.

### Feature Scaling

Feature scaling is a crucial step in data preprocessing that ensures all numerical features contribute equally to the machine learning model. It transforms the values of features so that they share a similar scale, which improves the efficiency and accuracy of many machine learning algorithms.

In [91]:
# Identifying columns that may require feature scaling
numerical_features = movies.select_dtypes(include=['int64', 'float64']).columns
print("\nNumerical Columns:")
print(numerical_features)


Numerical Columns:
Index(['Runtime', 'IMDB_Rating', 'No_of_Votes', 'Gross', 'Metascore'], dtype='object')


In [92]:
# Check if feature scaling is necessary by examining the range of values for each numerical column
for column in numerical_features:
    column_range = movies[column].max() - movies[column].min()
    print(f"Range of '{column}': {column_range}")


Range of 'Runtime': 276
Range of 'IMDB_Rating': 1.700000000000001
Range of 'No_of_Votes': 2318022
Range of 'Gross': 936660920.0
Range of 'Metascore': 72.0


Feature scaling refers to techniques used to standardize or normalize the values of features in a dataset so that they are on a comparable scale. This is particularly important for algorithms sensitive to the scale of data, such as **Support Vector Machines (SVM)**, **K-Nearest Neighbors (KNN)**, and **gradient-based optimization algorithms**

Feature scaling ensures that features do not dominate others simply because of their scale, which can prevent biased results.

feature scaling using **MinMaxScaler** and **StandardScaler** from Scikit-Learn

In [94]:
# Selecting numerical columns to apply feature scaling
features_to_scale = ['Runtime', 'IMDB_Rating', 'No_of_Votes', 'Gross']

In [99]:
movies

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Director,Star1,No_of_Votes,Gross,Metascore
0,The Shawshank Redemption,1994,142,Drama,9.3,Frank Darabont,Tim Robbins,2343110,28341469.0,80.0
1,The Godfather,1972,175,Crime,9.2,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,100.0
2,The Dark Knight,2008,152,Action,9.0,Christopher Nolan,Christian Bale,2303232,534858444.0,84.0
3,The Godfather: Part II,1974,202,Crime,9.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,90.0
4,12 Angry Men,1957,96,Crime,9.0,Sidney Lumet,Henry Fonda,689845,4360000.0,96.0
...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,Comedy,7.6,Blake Edwards,Audrey Hepburn,166544,679874270.0,76.0
996,Giant,1956,201,Drama,7.6,George Stevens,Elizabeth Taylor,34075,195217415.0,84.0
997,From Here to Eternity,1953,118,Drama,7.6,Fred Zinnemann,Burt Lancaster,43374,30500000.0,85.0
998,Lifeboat,1944,97,Drama,7.6,Alfred Hitchcock,Tallulah Bankhead,26471,852142728.0,78.0
