In [1]:
#Dependency importation
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Loading the dataset
df = pd.read_csv("movies.csv")
df

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,
...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,
9995,Arcane,(2021– ),"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,
9996,Heart of Invictus,(2022– ),"\nDocumentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,
9997,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB


In [4]:
df.describe()

Unnamed: 0,RATING,RunTime
count,8179.0,7041.0
mean,6.921176,68.688539
std,1.220232,47.258056
min,1.1,1.0
25%,6.2,36.0
50%,7.1,60.0
75%,7.8,95.0
max,9.9,853.0


In [5]:
# See if there are null values in our dataframe
df.isna().sum()

MOVIES         0
YEAR         644
GENRE         80
RATING      1820
ONE-LINE       0
STARS          0
VOTES       1820
RunTime     2958
Gross       9539
dtype: int64

We can see that our dataframe contains 6 columns having null values.

#### YEAR COLUMN

In [6]:
#defining the function to extract the date 
def extract_years(year_str):
    if pd.isna(year_str):
        return None, None  # Return None for NaN values
    
    # Removing any leading extra characters which is not the year
    new_str = re.sub(r"^[^(]*", "", year_str)
    
    # Extract years from the new string containing no extra character
    years = re.findall(r"\d{4}", new_str)

    if len(years) == 0:
        return None, None
    elif len(years) == 1:
        return years[0], None if '–' in new_str else years[0]
    else:
        return years[0], years[-1]

# Apply the function to the 'YEAR' column and create new columns
df['START_YEAR'], df['END_YEAR'] = zip(*df['YEAR'].apply(extract_years))
df

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,2021,2021
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,,2021,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,,2010,2022
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,,2013,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,,2021,
9995,Arcane,(2021– ),"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,,2021,
9996,Heart of Invictus,(2022– ),"\nDocumentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,,2022,
9997,The Imperfects,(2021– ),"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,,2021,


In [7]:
#We drop the column YEAR
df = df.drop("YEAR", axis=1)

In [8]:
df.isna().sum()

MOVIES           0
GENRE           80
RATING        1820
ONE-LINE         0
STARS            0
VOTES         1820
RunTime       2958
Gross         9539
START_YEAR     748
END_YEAR      3928
dtype: int64

Our new columns START_YEAR and End_Year have Nan values let's try to manage them

In [9]:
#1-START_YEAR
df["START_YEAR"].unique()

array(['2021', '2010', '2013', '2020', '2006', '2019', '2016', '2011',
       '2005', '2008', '2017', '1994', '2014', '2015', '2003', '2009',
       '2018', '1987', '2012', '2007', '2000', '1993', '1999', '2001',
       '1997', '1989', '1975', '1995', '1984', '1998', '1966', '1990',
       '2002', '1976', '1978', '2022', '1982', '1968', '2004', '1996',
       '1971', '1980', '1962', '1991', '1960', '1988', '1969', '1961',
       '1979', '1956', '1983', None, '1986', '1967', '1974', '1992',
       '1958', '1932', '1941', '1950', '1946', '1981', '1952', '1957',
       '1954', '1955', '1948', '1947', '1977', '2023', '1945', '1953',
       '1985', '1973', '1972', '1965', '1944', '1933', '1938'],
      dtype=object)

In [10]:
#Let us replace the Nan values by the most occuring year :  mode
df["START_YEAR"].mode()


0    2020
Name: START_YEAR, dtype: object

In [11]:
df["START_YEAR"].fillna("2020", inplace=True)
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,2021,2021
1,Masters of the Universe: Revelation,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,,2021,
2,The Walking Dead,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,,2010,2022
3,Rick and Morty,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,,2013,
4,Army of Thieves,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,,2021,
9995,Arcane,"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,,2021,
9996,Heart of Invictus,"\nDocumentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,,2022,
9997,The Imperfects,"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,,2021,


In [12]:
df["START_YEAR"].isna().sum()

0

In [13]:
#End Year
df["END_YEAR"].unique()


array(['2021', None, '2022', '2013', '2004', '2020', '2015', '2019',
       '2014', '2018', '2017', '1994', '2007', '1993', '2016', '2012',
       '2001', '1999', '1998', '1975', '2008', '2006', '1996', '2000',
       '2003', '1969', '1991', '1995', '2002', '1976', '1978', '2005',
       '1968', '2010', '2011', '1971', '1989', '1997', '1980', '2009',
       '1962', '1984', '1960', '1988', '1987', '1961', '1979', '1956',
       '1983', '1990', '1986', '1966', '1967', '1974', '1992', '1958',
       '1932', '1941', '1950', '1946', '1952', '1957', '1954', '1982',
       '1955', '1948', '1947', '1977', '2023', '1945', '1953', '1985',
       '1973', '1972', '1965', '1944', '1933', '1938'], dtype=object)

In [14]:
#We will replace the Nan of the end year by the highest year in the list which is 2023
df["END_YEAR"].fillna("2023", inplace = True)
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,2021,2021
1,Masters of the Universe: Revelation,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,,2021,2023
2,The Walking Dead,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,,2010,2022
3,Rick and Morty,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,,2013,2023
4,Army of Thieves,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,,2021,2023
9995,Arcane,"\nAnimation, Action, Adventure",,\nAdd a Plot\n,\n,,,,2021,2023
9996,Heart of Invictus,"\nDocumentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,,2022,2023
9997,The Imperfects,"\nAdventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,,2021,2023


In [15]:
df["END_YEAR"].isna().sum()

0

In [16]:
df.isna().sum()

MOVIES           0
GENRE           80
RATING        1820
ONE-LINE         0
STARS            0
VOTES         1820
RunTime       2958
Gross         9539
START_YEAR       0
END_YEAR         0
dtype: int64

#### COLUMN GENRE

In [17]:
df["GENRE"]

0           \nAction, Horror, Thriller            
1       \nAnimation, Action, Adventure            
2            \nDrama, Horror, Thriller            
3       \nAnimation, Adventure, Comedy            
4              \nAction, Crime, Horror            
                           ...                    
9994       \nAdventure, Drama, Fantasy            
9995    \nAnimation, Action, Adventure            
9996              \nDocumentary, Sport            
9997       \nAdventure, Drama, Fantasy            
9998       \nAdventure, Drama, Fantasy            
Name: GENRE, Length: 9999, dtype: object

In [18]:
#we use the "str.replace" to remove the Writtenby

df['GENRE'] = df['GENRE'].str.replace("\n" , "")
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,,2013,2023
4,Army of Thieves,"Action, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,,2021,2023
9995,Arcane,"Animation, Action, Adventure",,\nAdd a Plot\n,\n,,,,2021,2023
9996,Heart of Invictus,"Documentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,,2021,2023


In [19]:
df["GENRE"].mode()

0    Comedy            
Name: GENRE, dtype: object

In [20]:
df["GENRE"].fillna("comedy", inplace = True)
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,,2013,2023
4,Army of Thieves,"Action, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,,2021,2023
9995,Arcane,"Animation, Action, Adventure",,\nAdd a Plot\n,\n,,,,2021,2023
9996,Heart of Invictus,"Documentary, Sport",,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,,2021,2023


In [21]:
df.isna().sum()

MOVIES           0
GENRE            0
RATING        1820
ONE-LINE         0
STARS            0
VOTES         1820
RunTime       2958
Gross         9539
START_YEAR       0
END_YEAR         0
dtype: int64

#### COLUMN RATING

In [22]:
df["RATING"]

0       6.1
1       5.0
2       8.2
3       9.2
4       NaN
       ... 
9994    NaN
9995    NaN
9996    NaN
9997    NaN
9998    NaN
Name: RATING, Length: 9999, dtype: float64

In [23]:
#We use linear interpolation cause we are dealing with numerical values and we assume the rate of change between the data points is constant
df['RATING'] = df['RATING'].interpolate(method='linear')
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062,121.0,,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870,25.0,,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805,44.0,,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849,23.0,,2013,2023
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,,2021,2023
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,\n,,,,2021,2023
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,,2021,2023


In [24]:
df.isna().sum()

MOVIES           0
GENRE            0
RATING           0
ONE-LINE         0
STARS            0
VOTES         1820
RunTime       2958
Gross         9539
START_YEAR       0
END_YEAR         0
dtype: int64

#### COLUMN VOTE

In [25]:
df["VOTES"]

0        21,062
1        17,870
2       885,805
3       414,849
4           NaN
         ...   
9994        NaN
9995        NaN
9996        NaN
9997        NaN
9998        NaN
Name: VOTES, Length: 9999, dtype: object

In [26]:
#we use the "str.replace" to remove the commas (,)
df['VOTES'] = df['VOTES'].str.replace("," , "")

#We convert our values to float
df['VOTES'] = df['VOTES'].astype(float)
df


Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,,2013,2023
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,,,,2021,2023
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,\n,,,,2021,2023
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,,,,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,,,,2021,2023


In [27]:
#We apply linear interpolation

df['VOTES'] = df['VOTES'].interpolate(method='linear')
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,,2013,2023
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,220353.5,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,433.0,,,2021,2023
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,\n,433.0,,,2021,2023
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,433.0,,,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,433.0,,,2021,2023


In [28]:
df.isna().sum()

MOVIES           0
GENRE            0
RATING           0
ONE-LINE         0
STARS            0
VOTES            0
RunTime       2958
Gross         9539
START_YEAR       0
END_YEAR         0
dtype: int64

#### COLUMN RUNTIME

In [29]:
#Renaming our column
df = df.rename(columns = {"RunTime":"RUNTIME"})
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RUNTIME,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,,2013,2023
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,220353.5,,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,433.0,,,2021,2023
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,\n,433.0,,,2021,2023
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,433.0,,,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,433.0,,,2021,2023


In [30]:
df["RUNTIME"]

0       121.0
1        25.0
2        44.0
3        23.0
4         NaN
        ...  
9994      NaN
9995      NaN
9996      NaN
9997      NaN
9998      NaN
Name: RUNTIME, Length: 9999, dtype: float64

In [31]:
#We replace the Nan values by the mean runtime value
mean = df["RUNTIME"].mean()
mean

68.68853855986366

In [32]:
df["RUNTIME"].fillna(mean, inplace = True)
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RUNTIME,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.000000,,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.000000,,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.000000,,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.000000,,2013,2023
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,220353.5,68.688539,,2021,2021
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,433.0,68.688539,,2021,2023
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,\n,433.0,68.688539,,2021,2023
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,433.0,68.688539,,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,433.0,68.688539,,2021,2023


In [33]:
df.isna().sum()

MOVIES           0
GENRE            0
RATING           0
ONE-LINE         0
STARS            0
VOTES            0
RUNTIME          0
Gross         9539
START_YEAR       0
END_YEAR         0
dtype: int64

#### COLUMN GROSS

In [34]:
#As we can see, there are 9539 Nan values on the GROSS column, which is about 95% of the data,
#So we drop the column

df = df.drop(columns = "Gross")


In [35]:
df.isna().sum()

MOVIES        0
GENRE         0
RATING        0
ONE-LINE      0
STARS         0
VOTES         0
RUNTIME       0
START_YEAR    0
END_YEAR      0
dtype: int64

### Our data do not more contain Nan values

In [36]:
# checking for duplicated rows
df.duplicated().sum()

20

#### There are 20 duplicated rows, so we shall keep only the first occurence of each duplicated row

In [37]:
df = df.drop_duplicates(keep='first')
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RUNTIME,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.000000,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.000000,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.000000,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.000000,2013,2023
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,220353.5,68.688539,2021,2021
...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n \n Stars:\nMorgan Taylor Camp...,433.0,68.688539,2021,2023
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,\n,433.0,68.688539,2021,2023
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,\n Director:\nOrlando von Einsiedel\n| \n ...,433.0,68.688539,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,\n Director:\nJovanka Vuckovic\n| \n Sta...,433.0,68.688539,2021,2023


In [38]:
df.duplicated().sum()

0

#### Our dataset is now clean with no Nan no Duplicated values

Now lets format some columns to be able to easily answer to some question 

e.g:  Who is the director of the film The imperfects? or

    Which starts played in the film Heart of Arcane

#### COLUMN STARS

In [39]:
# Lets remove unneccessay characters 

df['STARS'] = df['STARS'].str.replace("\n" , "")
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RUNTIME,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,Director:Peter Thorwarth| Stars:Peri B...,21062.0,121.000000,2021,2021
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"Stars:Chris Wood, Sarah Michel...",17870.0,25.000000,2021,2023
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"Stars:Andrew Lincoln, Norman R...",885805.0,44.000000,2010,2022
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"Stars:Justin Roiland, Chris Pa...",414849.0,23.000000,2013,2023
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",Director:Matthias Schweighöfer| Stars:...,220353.5,68.688539,2021,2021
...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,"Stars:Morgan Taylor Campbell, ...",433.0,68.688539,2021,2023
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,,433.0,68.688539,2021,2023
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,Director:Orlando von Einsiedel| Star:P...,433.0,68.688539,2022,2023
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,Director:Jovanka Vuckovic| Stars:Morga...,433.0,68.688539,2021,2023


In [40]:
df['STARS']

0           Director:Peter Thorwarth|     Stars:Peri B...
1                       Stars:Chris Wood, Sarah Michel...
2                       Stars:Andrew Lincoln, Norman R...
3                       Stars:Justin Roiland, Chris Pa...
4           Director:Matthias Schweighöfer|     Stars:...
                              ...                        
9994                    Stars:Morgan Taylor Campbell, ...
9995                                                     
9996        Director:Orlando von Einsiedel|     Star:P...
9997        Director:Jovanka Vuckovic|     Stars:Morga...
9998        Director:Jovanka Vuckovic|     Stars:Morga...
Name: STARS, Length: 9979, dtype: object

From the structure we can see that, We can see that, each function is Seperated by a |, so lets extract this function to easy answers to queries



In [41]:
def extract_names(stars):
   
    # Extract directors name
    directors = stars.str.extract('Director:(.*?)\|')
    
    # Extract stars na
    stars = stars.str.extract('Stars:(.*)')
    
    return directors, stars

In [42]:
# Obtaining the names of both the directors and the stars
directors, stars = extract_names(df['STARS'])

# Adding new Columns
df['DIRECTOR'] = directors
df['STAR'] = stars
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RUNTIME,START_YEAR,END_YEAR,DIRECTOR,STAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,Director:Peter Thorwarth| Stars:Peri B...,21062.0,121.000000,2021,2021,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc..."
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"Stars:Chris Wood, Sarah Michel...",17870.0,25.000000,2021,2023,,"Chris Wood, Sarah Michelle Gellar, Lena Headey..."
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"Stars:Andrew Lincoln, Norman R...",885805.0,44.000000,2010,2022,,"Andrew Lincoln, Norman Reedus, Melissa McBride..."
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"Stars:Justin Roiland, Chris Pa...",414849.0,23.000000,2013,2023,,"Justin Roiland, Chris Parnell, Spencer Grammer..."
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",Director:Matthias Schweighöfer| Stars:...,220353.5,68.688539,2021,2021,Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby..."
...,...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,"Stars:Morgan Taylor Campbell, ...",433.0,68.688539,2021,2023,,"Morgan Taylor Campbell, Chris Cope, Iñaki Godo..."
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,,433.0,68.688539,2021,2023,,
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,Director:Orlando von Einsiedel| Star:P...,433.0,68.688539,2022,2023,Orlando von Einsiedel,
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,Director:Jovanka Vuckovic| Stars:Morga...,433.0,68.688539,2021,2023,Jovanka Vuckovic,"Morgan Taylor Campbell, Iñaki Godoy, Rhianna J..."


In [45]:
df = df.drop(columns = "STARS")
df = df.rename(columns = {"STAR": "STARS"})
df

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,VOTES,RUNTIME,START_YEAR,END_YEAR,DIRECTOR,STARS
0,Blood Red Sky,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,21062.0,121.000000,2021,2021,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc..."
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,17870.0,25.000000,2021,2023,,"Chris Wood, Sarah Michelle Gellar, Lena Headey..."
2,The Walking Dead,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,885805.0,44.000000,2010,2022,,"Andrew Lincoln, Norman Reedus, Melissa McBride..."
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,414849.0,23.000000,2013,2023,,"Justin Roiland, Chris Parnell, Spencer Grammer..."
4,Army of Thieves,"Action, Crime, Horror",8.4,"\nA prequel, set before the events of Army of ...",220353.5,68.688539,2021,2021,Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby..."
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,433.0,68.688539,2021,2023,,"Morgan Taylor Campbell, Chris Cope, Iñaki Godo..."
9995,Arcane,"Animation, Action, Adventure",8.8,\nAdd a Plot\n,433.0,68.688539,2021,2023,,
9996,Heart of Invictus,"Documentary, Sport",8.8,\nAdd a Plot\n,433.0,68.688539,2022,2023,Orlando von Einsiedel,
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,\nAdd a Plot\n,433.0,68.688539,2021,2023,Jovanka Vuckovic,"Morgan Taylor Campbell, Iñaki Godoy, Rhianna J..."


In [43]:
df.loc[df["STARS"] == Nan, 

0

We will just allow the Nan Values of these two new rows cause we don't have enough information to fill them.

#### COLUMN ONE-LINE

In [46]:
# Here we just remove characters that are not very neccessary 


df['ONE-LINE'] = df['ONE-LINE'].str.replace("\n" , "")
df


Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,VOTES,RUNTIME,START_YEAR,END_YEAR,DIRECTOR,STARS
0,Blood Red Sky,"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,21062.0,121.000000,2021,2021,Peter Thorwarth,"Peri Baumeister, Carl Anton Koch, Alexander Sc..."
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,17870.0,25.000000,2021,2023,,"Chris Wood, Sarah Michelle Gellar, Lena Headey..."
2,The Walking Dead,"Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,885805.0,44.000000,2010,2022,,"Andrew Lincoln, Norman Reedus, Melissa McBride..."
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,414849.0,23.000000,2013,2023,,"Justin Roiland, Chris Parnell, Spencer Grammer..."
4,Army of Thieves,"Action, Crime, Horror",8.4,"A prequel, set before the events of Army of th...",220353.5,68.688539,2021,2021,Matthias Schweighöfer,"Matthias Schweighöfer, Nathalie Emmanuel, Ruby..."
...,...,...,...,...,...,...,...,...,...,...
9994,The Imperfects,"Adventure, Drama, Fantasy",8.8,Add a Plot,433.0,68.688539,2021,2023,,"Morgan Taylor Campbell, Chris Cope, Iñaki Godo..."
9995,Arcane,"Animation, Action, Adventure",8.8,Add a Plot,433.0,68.688539,2021,2023,,
9996,Heart of Invictus,"Documentary, Sport",8.8,Add a Plot,433.0,68.688539,2022,2023,Orlando von Einsiedel,
9997,The Imperfects,"Adventure, Drama, Fantasy",8.8,Add a Plot,433.0,68.688539,2021,2023,Jovanka Vuckovic,"Morgan Taylor Campbell, Iñaki Godoy, Rhianna J..."
