What is our research question? Potentially: what makes a movie a hit at the box office — that is, what makes a movie successful?

## Define function for getting data from API

In [None]:
import os
import time

import pandas as pd
import requests

# OMDb API key. Prefer the OMDB_API_KEY environment variable so the secret does
# not have to live in the notebook.
# NOTE(review): the literal fallback only keeps existing runs working; it is a
# credential stored in plain text -- rotate it and remove the fallback.
api_key = os.environ.get("OMDB_API_KEY", "6e2fc5e0")

# Base URL for OMDb API (HTTPS so the key in the query string is not sent in clear text)
base_url = "https://www.omdbapi.com/"

def fetch_movie_data(title, api_key, timeout=10):
    """Look up a single movie by title on OMDb.

    Parameters
    ----------
    title : str
        Title sent as the ``t`` query parameter.
    api_key : str
        API key sent as the ``apikey`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict or None
        The decoded JSON payload when the HTTP status is 200 and the payload's
        ``"Response"`` field equals ``"True"``. In every other case the reason
        is printed and ``None`` is returned, so one bad title never aborts a
        long fetch loop.
    """
    params = {"apikey": api_key, "t": title}
    try:
        # Without a timeout a single stalled connection would block forever.
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed for {title!r}: {exc}")
        return None

    if response.status_code != 200:
        print(f"HTTP Error: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        # A body that is not valid JSON surfaces as a ValueError subclass.
        print(f"Error: response for {title!r} was not valid JSON")
        return None

    if data.get("Response") == "True":
        return data
    print(f"Error: {data.get('Error')}")
    return None

## Fetch data from the API and build a DataFrame from it

In [None]:
import pandas as pd
import time

# Upper bound on how many rows of titles.csv are looked up.
MAX_TITLES = 10000
# Pause between two requests, in seconds (rate limiting).
REQUEST_DELAY_SECONDS = 0.1

# Load the titles
# NOTE(review): titles.csv must be in the working directory; the saved output
# of this cell shows a FileNotFoundError when it is missing.
titles_df = pd.read_csv("titles.csv")
# Blank and repeated titles are skipped: a repeated title triggers the very same
# lookup again and would store the same movie twice.
titles = titles_df["title"].head(MAX_TITLES).dropna().drop_duplicates()

# Fetch movie data (an entry is None when the lookup for that title failed)
movie_data = []
for title in titles:
    movie_data.append(fetch_movie_data(title, api_key))
    time.sleep(REQUEST_DELAY_SECONDS)  # Rate limiting


FileNotFoundError: [Errno 2] No such file or directory: 'titles.csv'

In [None]:
# Keep only the successful lookups (failed ones were stored as None).
movie_data_filtered = [data for data in movie_data if data is not None]
movie_df_original = pd.DataFrame(movie_data_filtered)

# Columns the analysis does not use. errors="ignore" because a column only
# exists in the frame when at least one fetched record carried that key, so a
# given download may lack some of them (presumably e.g. "totalSeasons").
movie_df_new = movie_df_original.drop(
    columns=["Writer", "Released", "Poster", "Type", "totalSeasons", "Response", "Website", "Production", "DVD", "imdbID"],
    errors="ignore",
)
movie_df_new

## Genre revenue over time as a percentage of the total


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
import numpy as np

df = movie_df_new.copy()
# Convert release year to numeric
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

# Convert BoxOffice to numeric (removing dollar signs and commas).
# Raw-string pattern: the plain '\$' literal is an invalid escape sequence.
df["BoxOffice"] = df["BoxOffice"].replace({r"\$": "", ",": ""}, regex=True)
df["BoxOffice"] = pd.to_numeric(df["BoxOffice"], errors="coerce")

# Drop missing values
df = df.dropna(subset=["Genre", "BoxOffice", "Year"])

# Filter for reasonable years (avoid weird pre-1940 data)
df = df[df["Year"] >= 1940]

# Split genres into separate rows
df_exploded = df.assign(Genre=df["Genre"].str.split(", ")).explode("Genre")

# Standardize genre formatting (remove spaces, unify capitalization)
df_exploded["Genre"] = df_exploded["Genre"].str.strip().str.title()

# Aggregate revenue per genre
total_revenue_per_genre = df_exploded.groupby("Genre")["BoxOffice"].sum()

# Remove unwanted genres while forcing inclusion of key classics
excluded_genres = {"Sport", "Crime", "Biography", "History", "Fantasy",
                   "Mystery", "Thriller", "Music", "Documentary", "Horror", "Tv Movie", "Adventure"}
included_genres = {"Western", "Film-Noir", "Musical"}  # Ensure these are included

# Top revenue genres minus the exclusions, plus the forced classics
top_genres = total_revenue_per_genre.nlargest(25).index
keep_genres = (set(top_genres) - excluded_genres) | included_genres

# Classics that do not occur in the data get one placeholder row (1940, revenue
# of 1) so they still appear as a column in the chart. All placeholders are
# built in a single frame instead of concatenating inside a loop.
missing_classics = sorted(included_genres - set(df_exploded["Genre"]))
if missing_classics:
    placeholder_rows = pd.DataFrame({"Year": 1940, "Genre": missing_classics, "BoxOffice": 1})
    df_exploded = pd.concat([df_exploded, placeholder_rows], ignore_index=True)

df_exploded = df_exploded[df_exploded["Genre"].isin(keep_genres)]

# Aggregate revenue per genre per year
genre_trends = df_exploded.groupby(["Year", "Genre"])["BoxOffice"].sum().reset_index()

# Pivot for better visualization
genre_pivot = genre_trends.pivot(index="Year", columns="Genre", values="BoxOffice").fillna(0)

# Compute each genre's percentage of total revenue per year
genre_pivot_relative = genre_pivot.div(genre_pivot.sum(axis=1), axis=0) * 100  # Convert to percentage

# Apply rolling average for smooth trends.
# Note: the window counts rows of the pivot (years present in the data), so it
# spans more than `roll` calendar years wherever years are missing.
roll = 10
genre_pivot_relative = genre_pivot_relative.rolling(window=roll, min_periods=1).mean()

# Generate a modified "husl" palette that is slightly brighter but distinct
num_colors = len(genre_pivot_relative.columns)
base_palette = sns.color_palette("husl", n_colors=num_colors)

# Boost saturation by 10% and value by 20% (both capped at 1), converting each
# colour to HSV only once, then back to RGB.
brighter_palette = []
for color in base_palette:
    hue, saturation, value = mcolors.rgb_to_hsv(color)
    brighter_palette.append(
        mcolors.hsv_to_rgb((hue, min(saturation * 1.1, 1), min(value * 1.2, 1)))
    )

# --- Stacked Area Chart to Show Genre Share Over Time ---
fig, ax = plt.subplots(figsize=(14, 7))  # Wider figure for better readability
genre_pivot_relative.plot(kind="area", stacked=True, alpha=0.85, color=brighter_palette, ax=ax)

ax.set_xlabel("Year")
ax.set_ylabel("Percentage of Total Box Office Revenue")
ax.set_title(f"Major Movie Genres Over Time (Relative Market Share, {roll}-Year Rolling Avg)")
ax.grid(axis="y", linestyle="--", linewidth=0.5)

# Move the legend outside the plot area
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, loc="upper left", bbox_to_anchor=(1, 1), fontsize=8, title="Genre", ncol=2)

plt.show()


## Percent of Total Box Office Revenue From Top 5 Movies Over Time

In [None]:
import pandas as pd
import plotly.express as px

# Load dataset
df = pd.read_csv("movies.csv", on_bad_lines="skip", nrows=40000)

# Convert release_date to datetime and extract year
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")
df["release_year"] = df["release_date"].dt.year

# Convert revenue to numeric
df["revenue"] = pd.to_numeric(df["revenue"], errors="coerce")

# Drop missing values, then store the year as an integer: it is a float while
# unparseable dates are still present, which would label the bars "1940.0".
df = (
    df.dropna(subset=["revenue", "release_year"])
      .assign(release_year=lambda frame: frame["release_year"].astype(int))
)

# Filter dataset to only include movies from 1940 onward. Rows without positive
# revenue add nothing to either sum and would turn a year's share into 0/0.
df_1940_present = df[(df["release_year"] >= 1940) & (df["revenue"] > 0)]

# Compute total revenue per year
total_revenue_per_year = df_1940_present.groupby("release_year")["revenue"].sum()

# Identify the top 5 highest-grossing movies for each year: sort once, then
# take the first five rows of every year (replaces the deprecated
# groupby(...).apply(nlargest) pattern, which operated on the grouping column).
top_movies_per_year = (
    df_1940_present.sort_values("revenue", ascending=False, kind="stable")
    .groupby("release_year")
    .head(5)
    .reset_index(drop=True)
)

# Compute percentage of total revenue from top 5 movies
top_movies_revenue_per_year = top_movies_per_year.groupby("release_year")["revenue"].sum()
top_movies_percentage = (top_movies_revenue_per_year / total_revenue_per_year) * 100

# Convert to DataFrame for visualization
top_movies_percentage_df = top_movies_percentage.reset_index()
top_movies_percentage_df.columns = ["release_year", "percentage_of_total_revenue"]

# Create an interactive bar chart showing the percentage of revenue from the top 5 movies each year
fig = px.bar(
    top_movies_percentage_df,
    x="release_year",
    y="percentage_of_total_revenue",
    text="percentage_of_total_revenue",
    labels={"percentage_of_total_revenue": "Percentage of Total Revenue", "release_year": "Year"},
    title="Percentage of Total Box Office Revenue from Top 5 Movies (1940-Present)",
    color="percentage_of_total_revenue"
)

# Update layout for better readability
fig.update_traces(texttemplate="%{text:.1f}%", textposition="outside")
fig.update_layout(xaxis=dict(type="category"), yaxis_tickformat=".1f")

# Show the interactive plot
fig.show()


## Genre Distribution for Top 5 Studios

In [None]:
# Save the cleaned OMDb frame (movie_df_new) to movies1.csv, without the index column.
# NOTE(review): the neighbouring cells read "movies.csv", not this file --
# confirm which file name is intended.
movie_df_new.to_csv("movies1.csv", index=False)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv("movies.csv", on_bad_lines="skip", nrows=10000)

# Drop missing values in key columns
df = df.dropna(subset=["genres", "production_companies"])

# Split genres and production companies into separate lists
df["genre_list"] = df["genres"].str.split(", ")
df["studio_list"] = df["production_companies"].str.split(", ")

# Explode both genres and studios into separate rows
df_exploded = df.explode("genre_list").explode("studio_list")

# Remove Documentary and Western genres
df_exploded = df_exploded[~df_exploded["genre_list"].isin(["Documentary", "Western"])]

# Identify top 5 studios by total movie count. Exploded rows keep the index of
# the movie they came from, so each (movie, studio) pair is counted once;
# counting the exploded rows directly would weight every movie by its number
# of genres.
movie_studio_pairs = (
    df_exploded.rename_axis("movie_row")
    .reset_index()
    .drop_duplicates(subset=["movie_row", "studio_list"])
)
top_studios = movie_studio_pairs["studio_list"].value_counts().nlargest(5).index.tolist()

# Filter dataset to only include movies from these top 5 studios
df_top_studios = df_exploded[df_exploded["studio_list"].isin(top_studios)]

# Compute movie count per genre for top 5 studios
studio_genre_distribution = df_top_studios.groupby(["studio_list", "genre_list"]).size().reset_index(name="movie_count")

# Pivot to have studios as rows and genres as columns
studio_pivot = studio_genre_distribution.pivot(index="studio_list", columns="genre_list", values="movie_count").fillna(0)

# Normalize so each studio's row sums to 1 (convert counts to proportions)
studio_pivot_normalized = studio_pivot.div(studio_pivot.sum(axis=1), axis=0)

# Generate one colour per genre. plt.get_cmap replaces plt.cm.get_cmap, which
# recent matplotlib releases no longer provide.
unique_genres = studio_pivot_normalized.columns
colors = plt.get_cmap("tab20", len(unique_genres))
color_map = {genre: colors(i) for i, genre in enumerate(unique_genres)}

# --- Stacked Bar Chart: Top 5 Studios & Their Genre Distribution (Normalized) ---
studio_pivot_normalized.plot(kind="bar", stacked=True, figsize=(12, 6), color=[color_map[genre] for genre in studio_pivot_normalized.columns])

plt.xlabel("Studio")
plt.ylabel("Proportion of Movies Produced")
plt.title("Normalized Genre Distribution for Top 5 Studios (Excluding Documentary & Western)")
plt.xticks(rotation=45, ha="right")
plt.legend(title="Genre", bbox_to_anchor=(1, 1))
plt.grid(True, linestyle="--", linewidth=0.5)
plt.show()


# Lula's Visualizations

### Extracting ratings from the ratings column

In [None]:
import numpy as np
# Build a separate frame in which the "N/A" placeholders are real NaNs.
# NOTE(review): movie_df_copy was read here before ever being defined, so the
# cell failed on a fresh kernel. Deriving it from movie_df_new is assumed to be
# the intended lineage -- confirm. replace() returns a new frame, so
# movie_df_new itself is left untouched.
movie_df_copy = movie_df_new.replace("N/A", np.nan)

In [None]:
import pandas as pd
import numpy as np

def extract_numeric(value):
    """Convert a rating to a number on a 0-100 scale.

    * ``"num/denom"`` strings (e.g. ``'8.8/10'``, ``'74/100'``) are rescaled
      to 0-100.
    * Percent strings (e.g. ``'87%'``) lose the percent sign.
    * Numbers are returned unchanged as floats: they are treated as already
      converted, so applying the function to a converted column a second time
      does not blank it.
    * Everything else, including malformed strings and a zero denominator,
      yields NaN instead of raising.
    """
    if isinstance(value, bool):
        return np.nan
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)
    if not isinstance(value, str):
        return np.nan

    text = value.strip()
    try:
        if "/" in text:  # Handles '8.8/10' or '74/100'
            num, denom = text.split("/")
            return (float(num) / float(denom)) * 100
        if "%" in text:  # Handles '87%'
            return float(text.strip('%'))
    except (ValueError, ZeroDivisionError):
        # e.g. 'N/A/10', '1/2/3' or '5/0'
        return np.nan
    return np.nan  # Return NaN for unexpected formats

# Explode the "Ratings" column (this assumes "Ratings" contains lists of dictionaries)
df_exploded = movie_df_copy.explode("Ratings").reset_index(drop=True)

# A movie without ratings explodes to a single non-dict placeholder. Swap those
# for empty dicts so json_normalize produces an all-NaN row for them (older
# pandas versions are reported to raise on such entries -- the swap is harmless
# either way).
rating_records = [
    record if isinstance(record, dict) else {}
    for record in df_exploded["Ratings"]
]

# Convert the rating dictionaries into a DataFrame (each key becomes a column)
ratings_df = pd.json_normalize(rating_records)
for required_column in ("Source", "Value"):
    # Guarantees the columns used below exist even if no movie has a rating.
    if required_column not in ratings_df:
        ratings_df[required_column] = np.nan

# Apply the function to convert ratings to a common 0-100 scale
ratings_df["Numeric Value"] = ratings_df["Value"].apply(extract_numeric)

# Drop the old "Ratings" column
df_exploded = df_exploded.drop(columns=["Ratings"])

# Join row by row: both frames share the same 0..n-1 index at this point
df_final = df_exploded.join(ratings_df)

# Pivot the ratings so that each rating source becomes a column
df_pivoted = df_final.pivot_table(
    index=['Title'],  # Keep the movie title as the index
    columns='Source',  # Create columns for each rating source
    values='Numeric Value',  # Populate the cells with the numeric values of ratings
    aggfunc='first'  # If there are multiple ratings, take the first
).reset_index()

# Merge the pivoted ratings columns back into df_final.
# Note: df_final has one row per (movie, rating source), so the result repeats
# a movie once for each of its rating sources.
df_final_with_ratings = pd.merge(df_final, df_pivoted, on="Title", how="left")

# Show the final DataFrame with ratings from multiple sources as columns
df_final_with_ratings


In [None]:
import pandas as pd
import plotly.express as px

# The three per-source rating columns produced by the ratings pivot
rating_columns = ['Internet Movie Database', 'Metacritic', 'Rotten Tomatoes']

# df_final_with_ratings repeats a movie once per rating source; keep a single
# row per title so every movie weighs the same in the genre averages.
df = df_final_with_ratings.drop_duplicates(subset='Title').copy()

# Convert rating columns to numeric, coercing any errors to NaN
for column in rating_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Split the genres in 'Genre' column and explode them to create one genre per row
df['Genre'] = df['Genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])
df_exploded = df.explode('Genre')

# Group by Genre and calculate the average rating for each rating source
avg_ratings_by_genre = df_exploded.groupby('Genre')[rating_columns].mean().reset_index()

# Plot the data using a bar plot. The y columns must be the ones that were
# aggregated above; 'imdbRating' is not a column of avg_ratings_by_genre.
fig = px.bar(avg_ratings_by_genre,
             x='Genre',
             y=rating_columns,
             title="Average Ratings by Genre from Different Rating Agencies",
             labels={'value': 'Average Rating', 'Genre': 'Movie Genre'},
             barmode='group',  # Group bars side by side
             height=600)

# Show the plot
fig.show()


In [None]:
# Count occurrences of each country
import plotly.express as px

# A 'Country' entry can hold several countries as one comma-separated string
# (presumably the same ", " layout as 'Genre' -- confirm). Split them so each
# country is counted on its own instead of every combination becoming a
# separate bar, and leave out the "N/A" placeholder.
countries = movie_df_new['Country'].dropna().str.split(', ').explode().str.strip()
countries = countries[countries != 'N/A']

country_counts = countries.value_counts().reset_index()
country_counts.columns = ['Country', 'Count']

fig = px.bar(country_counts.head(20), x='Country', y='Count',
             title='Most Common Countries of Movie Origin',
             labels={'Country': 'Country of Origin', 'Count': 'Number of Movies'})
fig.show()

In [None]:
# Count occurrences of each language.
# A 'Language' entry can hold several languages as one comma-separated string
# (presumably the same ", " layout as 'Genre' -- confirm). Split them so each
# language is counted on its own, and leave out the "N/A" placeholder.
languages = movie_df_new['Language'].dropna().str.split(', ').explode().str.strip()
languages = languages[languages != 'N/A']

language_counts = languages.value_counts().reset_index()
language_counts.columns = ['Language', 'Count']

fig = px.bar(language_counts.head(20), x='Language', y='Count',
             title='Most Common Languages for Movies',
             labels={'Language': 'Language of Movie', 'Count': 'Number of Movies'})
fig.show()

In [None]:
# Independent copy of the three columns used to relate revenue to awards
boxoffice_awards = movie_df_new.loc[:, ['Title', 'BoxOffice', 'Awards']].copy()
boxoffice_awards

In [None]:
import re
import pandas as pd
import plotly.express as px

# Start from a fresh copy of the columns needed here
boxoffice_awards = movie_df_new.loc[:, ['Title', 'BoxOffice', 'Awards']].copy()

# Parse BoxOffice as a number: cast to text, remove '$' and ',', and turn
# anything that still is not numeric into NaN
boxoffice_awards['BoxOffice'] = (
    boxoffice_awards['BoxOffice']
    .astype(str)
    .str.replace('[$,]', '', regex=True)
    .pipe(pd.to_numeric, errors='coerce')
)

# Function to extract number of wins from Awards column
def extract_wins(awards):
    """Return the number in front of the first "win" in an awards string.

    Non-string input, or a string without a "<digits> win" fragment, gives 0.
    """
    if not isinstance(awards, str):
        return 0
    win_counts = re.findall(r'(\d+) win', awards)
    return int(win_counts[0]) if win_counts else 0

boxoffice_awards['Wins'] = boxoffice_awards['Awards'].apply(extract_wins)

# Define properly matched box office bins and labels
bins = [0, 50_000_000, 200_000_000, 400_000_000, 650_000_000, float('inf')]
labels = ['0-50M', '50M-200M', '200M-400M', '400M-650M', '650M+']

# Categorize movies by box office revenue (rows without a BoxOffice value get no category)
boxoffice_awards['Box Office Category'] = pd.cut(boxoffice_awards['BoxOffice'], bins=bins, labels=labels)

# Group by box office category and calculate total wins & movie count.
# observed=False is spelled out because the grouping column is categorical:
# it keeps empty buckets in the summary and does not depend on the pandas
# default, which is being changed (and currently triggers a FutureWarning).
boxoffice_summary = boxoffice_awards.groupby('Box Office Category', observed=False).agg(
    total_wins=('Wins', 'sum'),
    movie_count=('Title', 'count')
).reset_index()

# Compute average awards per movie for each category (NaN for an empty bucket)
boxoffice_summary['Avg Wins per Movie'] = boxoffice_summary['total_wins'] / boxoffice_summary['movie_count']

# Custom color palette for better differentiation
color_map = {
    '0-50M': 'crimson',
    '50M-200M': 'darkorange',
    '200M-400M': 'gold',
    '400M-650M': 'mediumseagreen',
    '650M+': 'royalblue'
}

# Create bar chart for average awards per movie
fig = px.bar(
    boxoffice_summary,
    x='Box Office Category',
    y='Avg Wins per Movie',
    title="Box Office Categories vs. Average Awards per Movie",
    labels={'Avg Wins per Movie': 'Average Awards per Movie'},
    category_orders={'Box Office Category': labels},
    color='Box Office Category',
    color_discrete_map=color_map,
    text=boxoffice_summary['Avg Wins per Movie'].round(2)  # Round to 2 decimal places
)

# Improve layout and readability
fig.update_traces(marker=dict(line=dict(width=1, color='black')))  # Add black outlines for contrast
fig.update_layout(
    title=dict(font=dict(size=18, family="Arial")),
    xaxis_title="Box Office Category",
    yaxis_title="Avg Awards per Movie",
    xaxis=dict(tickangle=-30),  # Rotate labels for spacing
    font=dict(size=12),
    showlegend=False  # No need for a legend since categories are on the x-axis
)

# Show the improved visualization
fig.show()


# Machine Learning

In [None]:
# Keep only the first listed genre as the movie's primary genre
movie_df_new['Genre'] = movie_df_new['Genre'].str.split(',').str[0]
movie_df_new['Genre'] = movie_df_new['Genre'].str.strip()

# The conversions below cast to str first so the cell can be re-run after the
# columns have already become numeric (.str does not work on numeric columns).
# Raw-string patterns avoid the invalid '\d' escape sequence.
movie_df_new['Runtime'] = movie_df_new['Runtime'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
movie_df_new['imdbVotes'] = pd.to_numeric(movie_df_new['imdbVotes'].astype(str).str.replace(',', '', regex=True), errors='coerce')

# Convert the IMDb rating only while it still holds raw text; a numeric column
# has already been converted and is left alone.
# NOTE(review): this column is not created on movie_df_new by the cells shown
# earlier in the notebook; it presumably comes from a ratings merge that is not
# shown here -- confirm.
if not pd.api.types.is_numeric_dtype(movie_df_new['Internet Movie Database']):
    movie_df_new['Internet Movie Database'] = movie_df_new['Internet Movie Database'].apply(extract_numeric)

movie_df_new['BoxOffice'] = pd.to_numeric(movie_df_new['BoxOffice'].astype(str).str.replace(r'[$,]', '', regex=True), errors='coerce')

In [None]:
# Predicting box office revenue (BoxOffice) from the title text, genre and ratings
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingRegressor
import pandas as pd # Added import for pd.DataFrame

features = ['Title', 'imdbRating', 'Genre', 'Metacritic', 'Rotten Tomatoes']
numeric_features = ['imdbRating', 'Metacritic', 'Rotten Tomatoes']

X, y = movie_df_new[features].copy(), movie_df_new['BoxOffice']
# Make sure the rating features are numbers; anything non-numeric becomes NaN.
X[numeric_features] = X[numeric_features].apply(pd.to_numeric, errors='coerce')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Remove training rows without a target from X_train and y_train together.
# Dropping them from y_train alone leaves the two with different rows.
has_target = y_train.notna()
X_train, y_train = X_train[has_target], y_train[has_target]

vectorizer = TfidfVectorizer()

transformer = make_column_transformer(
    (StandardScaler(), numeric_features),
    (vectorizer, "Title"), # Use the pre-defined vectorizer
    (OneHotEncoder(handle_unknown='ignore', sparse_output=False), ["Genre"]), # Dense one-hot output
    remainder="drop"
)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer # Explicitly import SimpleImputer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

features = ['Title', 'imdbRating', 'Genre', 'Metacritic', 'Rotten Tomatoes']
numeric_features = ['imdbRating', 'Metacritic', 'Rotten Tomatoes']

X, y = movie_df_new[features].copy(), movie_df_new['BoxOffice']
# Make sure the rating features are numbers; anything non-numeric becomes NaN
# and is filled in by the imputer below.
X[numeric_features] = X[numeric_features].apply(pd.to_numeric, errors='coerce')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Drop rows with missing values in 'BoxOffice' from both X_train and y_train
train_data = pd.concat([X_train, y_train], axis=1).dropna(subset=['BoxOffice'])
X_train = train_data[features]
y_train = train_data['BoxOffice']

vectorizer = TfidfVectorizer()

# Incorporate SimpleImputer to handle NaNs in numerical features
transformer = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()),
     numeric_features), # Pipeline for numerical features
    (vectorizer, "Title"),
    (OneHotEncoder(handle_unknown='ignore', sparse_output=False), ["Genre"]),
    remainder="drop"
)

# Fixed seed so the fitted forest, and therefore y_preds, is reproducible
model = make_pipeline(transformer, RandomForestRegressor(random_state=42))
model.fit(X_train, y_train)
# NOTE(review): X_test still contains rows whose BoxOffice is missing; filter
# on y_test.notna() before scoring these predictions.
y_preds = model.predict(X_test)

In [None]:
def clean_year(year_str):
    """Return the (first) year contained in a value as an int, or NaN.

    * Strings: the first run of digits is used, so a range such as
      ``"2010–2015"`` gives 2010 rather than the digits of both years glued
      together.
    * Finite numbers are returned as ints, so applying the function to an
      already cleaned column keeps the years instead of turning them into NaN.
    * Anything else (``None``, NaN, strings without digits) gives NaN.
    """
    if isinstance(year_str, str):
        first_digits = re.search(r'\d+', year_str)
        return int(first_digits.group()) if first_digits else np.nan
    if isinstance(year_str, bool):
        return np.nan
    if isinstance(year_str, (int, float, np.integer, np.floating)) and np.isfinite(year_str):
        return int(year_str)
    return np.nan

# Run clean_year over every entry of the 'Year' column and store the result back
movie_df_new['Year'] = movie_df_new['Year'].map(clean_year)


In [None]:
from itertools import combinations
candidate_features = ['Runtime', 'Metascore', 'imdbRating', 'imdbVotes', 'Internet Movie Database', 'Metacritic', 'Rotten Tomatoes', 'Wins', 'Year']
# NOTE(review): some of these columns (e.g. 'Wins' and the per-source ratings)
# are not added to movie_df_new by the cells shown earlier in the notebook --
# confirm they exist before running.

# Materialise the 4-feature subsets once and preview a few instead of printing
# all of them.
four_feature_subsets = list(combinations(candidate_features, 4))
print(f"{len(four_feature_subsets)} subsets of size 4; first few:")
for subset in four_feature_subsets[:5]:
    print(subset)

# Each subset is a tuple; convert it to a list to select columns from the
# DataFrame. The last subset is picked explicitly rather than through the
# variable left over from the loop.
movie_df_new[list(four_feature_subsets[-1])].head()

('Runtime', 'Metascore', 'imdbRating', 'imdbVotes')
('Runtime', 'Metascore', 'imdbRating', 'Internet Movie Database')
('Runtime', 'Metascore', 'imdbRating', 'Metacritic')
('Runtime', 'Metascore', 'imdbRating', 'Rotten Tomatoes')
('Runtime', 'Metascore', 'imdbRating', 'Wins')
('Runtime', 'Metascore', 'imdbRating', 'Year')
('Runtime', 'Metascore', 'imdbVotes', 'Internet Movie Database')
('Runtime', 'Metascore', 'imdbVotes', 'Metacritic')
('Runtime', 'Metascore', 'imdbVotes', 'Rotten Tomatoes')
('Runtime', 'Metascore', 'imdbVotes', 'Wins')
('Runtime', 'Metascore', 'imdbVotes', 'Year')
('Runtime', 'Metascore', 'Internet Movie Database', 'Metacritic')
('Runtime', 'Metascore', 'Internet Movie Database', 'Rotten Tomatoes')
('Runtime', 'Metascore', 'Internet Movie Database', 'Wins')
('Runtime', 'Metascore', 'Internet Movie Database', 'Year')
('Runtime', 'Metascore', 'Metacritic', 'Rotten Tomatoes')
('Runtime', 'Metascore', 'Metacritic', 'Wins')
('Runtime', 'Metascore', 'Metacritic', 'Year')
('

Unnamed: 0,Metacritic,Rotten Tomatoes,Wins,Year
0,74.0,87.0,159,2010
1,74.0,73.0,44,2014
2,84.0,94.0,164,2008
3,83.0,81.0,91,2009
4,69.0,91.0,39,2012
...,...,...,...,...
990,29.0,13.0,3,2001
991,42.0,31.0,10,2009
992,52.0,69.0,10,2002
993,41.0,51.0,1,1998


In [None]:
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score


# Imputation and scaling live inside the pipeline, so cross_val_score refits
# them on the training part of every fold; imputing the full table beforehand
# would leak information from the validation rows. The forest is seeded so the
# ranking below is reproducible.
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestRegressor(random_state=42)
)

# Rows without a target cannot be scored. Select them once, outside the loops,
# so features and target always come from the same rows.
has_target = movie_df_new["BoxOffice"].notna()
y_known = movie_df_new.loc[has_target, "BoxOffice"]
# Non-numeric feature entries become NaN and are handled by the imputer.
X_known = movie_df_new.loc[has_target, candidate_features].apply(pd.to_numeric, errors="coerce")

rmse = {}
for size in range(1, 7):
    # Iterate over all feature combinations of the given size
    for features in combinations(candidate_features, size):
        feature_string = ",".join(features)
        # Estimate the test error of a model restricted to these features
        neg_mse_per_fold = cross_val_score(
            pipeline,
            X=X_known[list(features)],
            y=y_known,
            scoring="neg_mean_squared_error",
            cv=4
        )
        rmse[feature_string] = np.sqrt(-neg_mse_per_fold.mean())

# Sort the RMSEs (lowest error first).
rmses_sorted = pd.Series(rmse).sort_values()
rmses_sorted

Unnamed: 0,0
"imdbRating,imdbVotes,Internet Movie Database,Metacritic,Rotten Tomatoes,Wins",1.098925e+08
"Metascore,imdbRating,imdbVotes,Internet Movie Database,Rotten Tomatoes,Wins",1.105464e+08
"Metascore,imdbRating,imdbVotes,Metacritic,Rotten Tomatoes,Wins",1.107948e+08
"imdbVotes,Internet Movie Database,Metacritic,Rotten Tomatoes,Wins",1.108516e+08
"Runtime,imdbVotes,Internet Movie Database,Metacritic,Rotten Tomatoes,Wins",1.110958e+08
...,...
"imdbRating,Internet Movie Database,Rotten Tomatoes",1.482785e+08
"Internet Movie Database,Year",1.483625e+08
"imdbRating,Internet Movie Database,Year",1.484125e+08
"imdbRating,Year",1.484316e+08
