# Cleaning DC vs. Marvel IMDB Data

## Import libarries and load data

In [368]:
# Import libaries
import re

import pandas as pd
import numpy as np

In [369]:
# Load data
df = pd.read_csv(r'C:\Users\nguye\Documents\GitHub\spark_shared_repo\group_02\Marvel_DC_imdb.csv')

# Drop unnamed column
df = df.drop(columns=['Unnamed: 0'])

# Show un-cleaned data
df

Unnamed: 0,Movie,Year,Genre,RunTime,Rating,Director,Actor,Description,IMDB_Score,Metascore,Votes,USA_Gross,Category
0,Eternals,(2021),"Action,Adventure,Drama",,,ChloéZhao,"AngelinaJolie,GemmaChan,RichardMadden,BarryKeo...","The saga of the Eternals, a race of immortal b...",,,,,Marvel
1,Loki,(2021– ),"Action,Adventure,Fantasy",,,,"TomHiddleston,OwenWilson,SophiaDiMartino,Richa...",A new Marvel chapter with Loki at its center.,,,,,Marvel
2,The Falcon and the Winter Soldier,(2021),"Action,Adventure,Drama",50 min,TV-14,,"AnthonyMackie,SebastianStan,WyattRussell,ErinK...","Following the events of 'Avengers: Endgame,' S...",7.5,,105557,,Marvel
3,WandaVision,(2021),"Action,Comedy,Drama",350 min,TV-PG,,"ElizabethOlsen,PaulBettany,KathrynHahn,Teyonah...",Blends the style of classic sitcoms with the M...,8.1,,174710,,Marvel
4,Spider-Man: No Way Home,(2021),"Action,Adventure,Sci-Fi",,,JonWatts,"AngourieRice,TomHolland,Zendaya,MarisaTomei",A continuation of Spider-Man: Far From Home.,,,,,Marvel
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1685,DC's Legends of Tomorrow,(2016– ),"Action,Adventure,Drama",42 min,TV-14,GregorySmith,"BrandonRouth,CaityLotz,MaisieRichardson-Seller...","Worlds lived, worlds died. Nothing will ever b...",8.5,,2050,,DC
1686,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,TV-PG,CarlSeaton,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...","In the wake of Lex Luthor's return, the show f...",8.3,,1259,,DC
1687,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,TV-PG,AlexisOstrander,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...",Kara comes face to face with Red Daughter and ...,8.1,,1053,,DC
1688,Supergirl,(2015–2021),"Action,Adventure,Drama",42 min,TV-PG,ShannonKohli,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...",Kara and Lena head to Kaznia to hunt down Lex....,7.4,,1036,,DC


# Clean and Format Data

## Fix columns that should be numericals but are strings instead

For example, the column `"RunTime"` has the words "min" in it. 

In [370]:
# Create function to turn strings to float
def str_to_float(cell, remove = ["M", "min", "$", ","]):
    if isinstance(cell, str):
        for r in remove:
            cell = cell.replace(r, "")
        return float(cell)
    else:
        return cell

# Convert to numerical columns
df['RunTime'] = df['RunTime'].apply(str_to_float)
df['USA_Gross'] = df['USA_Gross'].apply(str_to_float)
df['Votes'] = df['Votes'].apply(str_to_float)

# Show data
df


Unnamed: 0,Movie,Year,Genre,RunTime,Rating,Director,Actor,Description,IMDB_Score,Metascore,Votes,USA_Gross,Category
0,Eternals,(2021),"Action,Adventure,Drama",,,ChloéZhao,"AngelinaJolie,GemmaChan,RichardMadden,BarryKeo...","The saga of the Eternals, a race of immortal b...",,,,,Marvel
1,Loki,(2021– ),"Action,Adventure,Fantasy",,,,"TomHiddleston,OwenWilson,SophiaDiMartino,Richa...",A new Marvel chapter with Loki at its center.,,,,,Marvel
2,The Falcon and the Winter Soldier,(2021),"Action,Adventure,Drama",50.0,TV-14,,"AnthonyMackie,SebastianStan,WyattRussell,ErinK...","Following the events of 'Avengers: Endgame,' S...",7.5,,105557.0,,Marvel
3,WandaVision,(2021),"Action,Comedy,Drama",350.0,TV-PG,,"ElizabethOlsen,PaulBettany,KathrynHahn,Teyonah...",Blends the style of classic sitcoms with the M...,8.1,,174710.0,,Marvel
4,Spider-Man: No Way Home,(2021),"Action,Adventure,Sci-Fi",,,JonWatts,"AngourieRice,TomHolland,Zendaya,MarisaTomei",A continuation of Spider-Man: Far From Home.,,,,,Marvel
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1685,DC's Legends of Tomorrow,(2016– ),"Action,Adventure,Drama",42.0,TV-14,GregorySmith,"BrandonRouth,CaityLotz,MaisieRichardson-Seller...","Worlds lived, worlds died. Nothing will ever b...",8.5,,2050.0,,DC
1686,Supergirl,(2015–2021),"Action,Adventure,Drama",42.0,TV-PG,CarlSeaton,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...","In the wake of Lex Luthor's return, the show f...",8.3,,1259.0,,DC
1687,Supergirl,(2015–2021),"Action,Adventure,Drama",42.0,TV-PG,AlexisOstrander,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...",Kara comes face to face with Red Daughter and ...,8.1,,1053.0,,DC
1688,Supergirl,(2015–2021),"Action,Adventure,Drama",42.0,TV-PG,ShannonKohli,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...",Kara and Lena head to Kaznia to hunt down Lex....,7.4,,1036.0,,DC


## Year Data

Year data is a string instead of numerical. We need to extract the start and end dates from the string.

In [371]:
# Get start and end years as int

def get_start_end_year(s):
    if isinstance(s, (float, int)):
        return s, np.nan
    s = s.replace("(", "")
    s = s.replace(")", "")
    years = re.findall(r'\d+', s)
    if len(years) == 1:
        return int(years[0]), np.nan
    elif len(years) == 2:
        return int(years[0]), int(years[1])

df[["Year_Start", "Year_End"]] = pd.DataFrame(df['Year'].apply(get_start_end_year).tolist(), index=df.index)




## Strip leading and trailing spaces from string columns

In [372]:
# Strip leading and trailing spaces 

str_columns = ['Movie', 'Genre', 'Director', 'Actor', 'Description',]

def strip_spaces(cell):
    if isinstance(cell, str):
        return cell.strip()
    else:
        return cell

for col in str_columns:
    df[col] = df[col].apply(strip_spaces)


## Get information about what type of film it is

In [373]:
# Get information it a series or movie (repeat instances)
df[["Year", "Rating"]] = df[["Year", "Rating"]].astype(str)
count_data = df.groupby(['Movie', 'Year'])['Year_Start'].count()

movie_or_series = []

for _, row in df.iterrows():
    movie = row['Movie']
    year = row['Year']

    if "Video Game" in row["Year"]:
        movie_or_series.append('Video Game')
    elif "TV Movie" in row["Year"]:
        movie_or_series.append('TV Movie')
    elif "Video" in row["Year"]:
        movie_or_series.append('Video')
    elif "TV Special" in row["Year"]:
        movie_or_series.append('TV Special')
    elif "TV Short" in row["Year"]:
        movie_or_series.append('Short')
    elif "Short" in row["Genre"] or row["RunTime"] < 10:
        movie_or_series.append('Short')
    elif "Documentary" in row["Genre"]:
        movie_or_series.append('Documentary')
    elif "TV" in row["Rating"]:
        movie_or_series.append('TV Show')
    elif not pd.isna(row["USA_Gross"]):
        movie_or_series.append('Movie')
    else:
        if count_data.loc[movie, year] == 1:
            movie_or_series.append('Direct-to-Video Movie')
        else:
            movie_or_series.append('TV Show')

df["Type"] = movie_or_series




In [374]:
# Get the counts of each type
df.Type.value_counts()

TV Series                1301
Short                     143
Video                      84
Movie                      64
Direct-to-Video Movie      36
Video Game                 22
TV Movie                   16
Documentary                13
TV Special                 11
Name: Type, dtype: int64

# Exclude data that does not meet the following conditions

- Drop rows missing data in the following rows `["Movie", "Year", "IMDB_Score", "Director"]`
- Remove rows where the type is `['TV Special', 'Video Game', 'Video']`

In [375]:
# Drop data missing in columns, movie, year, IMDB_Score 
df = df.dropna(subset=["Movie", "Year", "IMDB_Score", "Director"])
# Only include Movies and TV Shows
df.query("Type.isin(['Movie', 'TV Show'])", engine="python")

# Rename Columns

Rename the "Movie" column "Name"

In [376]:
df = df.rename(columns={"Movie": "Name"})
df = df.set_index('Name')

df

Unnamed: 0_level_0,Year,Genre,RunTime,Rating,Director,Actor,Description,IMDB_Score,Metascore,Votes,USA_Gross,Category,Year_Start,Year_End,Type
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Avengers: Endgame,(2019),"Action,Adventure,Drama",181.0,PG-13,"AnthonyRusso,JoeRusso","RobertDowneyJr.,ChrisEvans,MarkRuffalo,ChrisHe...",After the devastating events of Avengers: Infi...,8.4,78.0,880911.0,858.37,Marvel,2019.0,,Movie
Guardians of the Galaxy,(2014),"Action,Adventure,Comedy",121.0,PG-13,JamesGunn,"ChrisPratt,VinDiesel,BradleyCooper,ZoeSaldana",A group of intergalactic criminals must pull t...,8.0,76.0,1066222.0,333.18,Marvel,2014.0,,Movie
Spider-Man: Far from Home,(2019),"Action,Adventure,Sci-Fi",129.0,PG-13,JonWatts,"TomHolland,SamuelL.Jackson,JakeGyllenhaal,Mari...",Following the events of Avengers: Endgame (201...,7.5,69.0,348047.0,390.53,Marvel,2019.0,,Movie
Thor: Ragnarok,(2017),"Action,Adventure,Comedy",130.0,PG-13,TaikaWaititi,"ChrisHemsworth,TomHiddleston,CateBlanchett,Mar...","Imprisoned on the planet Sakaar, Thor must rac...",7.9,74.0,615860.0,315.06,Marvel,2017.0,,Movie
Avengers: Infinity War,(2018),"Action,Adventure,Sci-Fi",149.0,PG-13,"AnthonyRusso,JoeRusso","RobertDowneyJr.,ChrisHemsworth,MarkRuffalo,Chr...",The Avengers and their allies must be willing ...,8.4,68.0,881638.0,678.82,Marvel,2018.0,,Movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DC's Legends of Tomorrow,(2016– ),"Action,Adventure,Drama",42.0,TV-14,GregorySmith,"BrandonRouth,CaityLotz,MaisieRichardson-Seller...","Worlds lived, worlds died. Nothing will ever b...",8.5,,2050.0,,DC,2016.0,,TV Series
Supergirl,(2015–2021),"Action,Adventure,Drama",42.0,TV-PG,CarlSeaton,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...","In the wake of Lex Luthor's return, the show f...",8.3,,1259.0,,DC,2015.0,2021.0,TV Series
Supergirl,(2015–2021),"Action,Adventure,Drama",42.0,TV-PG,AlexisOstrander,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...",Kara comes face to face with Red Daughter and ...,8.1,,1053.0,,DC,2015.0,2021.0,TV Series
Supergirl,(2015–2021),"Action,Adventure,Drama",42.0,TV-PG,ShannonKohli,"MelissaBenoist,MehcadBrooks,ChylerLeigh,KatieM...",Kara and Lena head to Kaznia to hunt down Lex....,7.4,,1036.0,,DC,2015.0,2021.0,TV Series


# Export Data

In [377]:
df.to_csv(r"Marvel_DC_imdb_cleaned.csv")