In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the IMDb movies CSV into a DataFrame, print its shape, and preview the first row
movies = pd.read_csv(filepath_or_buffer="IMDb movies.csv")
print(movies.shape)
movies.head(n=1)

In [None]:
# Keep only rows that have a value in every column used later in the notebook
# (budget, worldwide gross income, metascore); a NaN in any of them drops the row.
movies = movies.dropna(subset=['budget', 'worlwide_gross_income', 'metascore'])

In [None]:
# Remove "$" and "," from the budget/income columns.
# The patterns are raw strings: with regex=True the dict keys are regular expressions,
# and '\$' in a normal string literal is an invalid escape sequence (DeprecationWarning,
# SyntaxWarning on Python 3.12+). r'\$' is the same regex without the warning.
movies['budget'] = movies['budget'].replace({r'\$': '', ',': ''}, regex=True)
movies['worlwide_gross_income'] = movies['worlwide_gross_income'].replace({r'\$': '', ',': ''}, regex=True)

In [None]:
# Remove non-US currencies from the budget column: those values contain text, so
# pd.to_numeric(..., errors='coerce') turns them into NaN and the rows are filtered out.
movies = movies.loc[pd.to_numeric(movies['budget'], errors='coerce').notna()]

In [None]:
# Cast both currency columns to float in a single astype call
movies = movies.astype({'budget': float, 'worlwide_gross_income': float})

In [None]:
# Round worldwide gross income to two decimal places
movies['worlwide_gross_income'] = movies['worlwide_gross_income'].round(2)

In [None]:
# Revenue % column: (worldwide gross income - budget) / budget * 100, rounded to 2 decimals
movies['revenue_percent'] = (
    movies['worlwide_gross_income']
    .sub(movies['budget'])
    .div(movies['budget'])
    .mul(100)
    .round(2)
)

In [None]:
# Drop the extraneous columns, then use imdb_title_id as the row index
movies = (
    movies
    .drop(columns=[
        'title',
        'date_published',
        'description',
        'budget',
        'usa_gross_income',
        'worlwide_gross_income',
        'reviews_from_users',
        'reviews_from_critics',
    ])
    .set_index('imdb_title_id')
)

In [None]:
# Print the frame's shape and preview its first five rows
print(movies.shape)
movies.head(n=5)

In [None]:
# Save to file.
# imdb_title_id was moved into the index by set_index(...) earlier in the notebook, so the
# index has to be written out: with index=False the ID is silently dropped and the saved
# rows can no longer be traced back to a title. The index name becomes the first CSV header.
# NOTE(review): this adds a leading imdb_title_id column to moviesClean.csv — confirm that
# any downstream reader expects it (e.g. pd.read_csv(..., index_col='imdb_title_id')).
movies.to_csv('moviesClean.csv', index=True)