In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# Load data set

In [None]:
# Locate the raw CSV relative to the notebook's working directory.
path = Path.cwd() / '..' / 'files' / 'csv' / 'james_bond_data.csv'
# convert_dtypes() moves every column to the best-fitting nullable pandas dtype.
james_bond_data = pd.read_csv(path).convert_dtypes()
# A cell renders only its final expression, so the preview has to be the last
# line; showing the first rows is enough to check that the load worked.
james_bond_data.head()

Heading | Meaning
--------|--------
Release | The release date of the movie
Movie | The title of the movie
Bond | The actor playing the title role
Bond_Car_MFG | The manufacturer of James Bond’s car
US_Gross | The movie’s gross US earnings
World_Gross | The movie’s gross worldwide earnings
Budget ($ 000s) | The movie’s budget, in thousands of US dollars
Film_Length | The running time of the movie
Avg_User_IMDB | The average user rating from IMDb
Avg_User_Rtn_Tom | The average user rating from Rotten Tomatoes
Martinis | The number of martinis that Bond drank in the movie
Kills_Bond | The number of enemies killed by Bond in the movie


In [None]:
# Structure of data set: (rows, columns) first, then the per-column report
# (dtype, non-null count) including the true "deep" memory footprint.
print(james_bond_data.shape)
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
james_bond_data.info(memory_usage='deep')

In [None]:
# Map every raw CSV heading to a descriptive snake_case column name.
new_column_names = dict(
    [
        ("Release", "release_date"),
        ("Movie", "movie_title"),
        ("Bond", "bond_actor"),
        ("Bond_Car_MFG", "car_manufacturer"),
        ("US_Gross", "income_usa"),
        ("World_Gross", "income_world"),
        ("Budget ($ 000s)", "movie_budget"),
        ("Film_Length", "film_length"),
        ("Avg_User_IMDB", "imdb"),
        ("Avg_User_Rtn_Tom", "rotten_tomatoes"),
        ("Martinis", "martinis_consumed"),
        ("Kills_Bond", "bond_kills"),
    ]
)

# rename() returns a new frame; the raw james_bond_data is left untouched.
data = james_bond_data.rename(columns=new_column_names)
print(data.columns)
data


# Dealing With Missing Data

In [None]:
# Boolean Series: True for each row holding at least one missing value.
mask = data.isna().any(axis=1)
mask

In [None]:
# Show only the rows in which some column is missing.
data[data.isna().any(axis=1)]

In [None]:
# One-row frame, indexed at 10, carrying the two rating values to fill in.
foo = pd.DataFrame({"imdb": [7.1], "rotten_tomatoes": [6.8]}, index=[10])
foo

In [None]:
# Rebuild from the raw frame, then let combine_first() fill any missing
# imdb / rotten_tomatoes value at index 10 from the one-row patch frame.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
)
# Print whichever rows still contain a missing value after the patch.
print(data[data.isna().any(axis=1)])
data

# Correcting Invalid Data Types

In [None]:
# Preview the columns that the following cells clean up: "$" and "," are
# stripped from the money columns and the "mins" suffix from film_length.
data.loc[:, ["income_usa", "income_world", "movie_budget", "film_length"]].head()

In [None]:
# Trial run on one money column: strip "$" and "," and then cast the result
# to the nullable Float64 dtype.
foo3 = data["movie_budget"].replace("[$,]", "", regex=True)
foo3 = foo3.astype("Float64")
foo3

In [None]:
# Rebuild the frame from the raw data, now also converting the three money
# columns from formatted text to nullable Float64.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64"),
    )
)

In [None]:
# Print the per-column dtype / non-null report for the current `data` frame.
data.info()

In [None]:
# Peek at the first three film_length values.
data.loc[:, "film_length"].head(3)

In [None]:
# Trial run: drop the trailing "mins" unit, then cast to nullable Int64.
foo = data["film_length"].str.removesuffix("mins")
foo = foo.astype("Int64")
foo.head(3)

In [None]:
# Rebuild the frame from the raw data; this version adds the film_length
# conversion to the money-column clean-up.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64"),
        # Drop the "mins" suffix and store the length as a nullable integer.
        film_length=lambda df: df["film_length"].str.removesuffix("mins").astype("Int64"),
    )
)

In [None]:
# Peek at the first three release_date values.
data.loc[:, "release_date"].head(3)

In [None]:
# Trial run: parse the "Month, Year" text as datetimes, then extract the year
# as a nullable Int64 (the basis for a new release_year column).
foo2 = pd.to_datetime(data["release_date"], format="%B, %Y")
foo2 = foo2.dt.year.astype("Int64")
foo2.head(3)

In [None]:
# Rebuild the frame from the raw data; this version also parses release_date
# and derives a release_year column from it.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64"),
        # Drop the "mins" suffix and store the length as a nullable integer.
        film_length=lambda df: df["film_length"].str.removesuffix("mins").astype("Int64"),
        # "%B, %Y" matches a full month name, a comma and a four-digit year.
        release_date=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y"),
        release_year=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y").dt.year.astype("Int64"),
    )
)

In [None]:
# Print the per-column dtype / non-null report for the current `data` frame.
data.info()

# Fixing Inconsistencies in Data

In [None]:
# Trial run: the budget column is recorded in thousands of dollars, so it is
# multiplied by 1000 to bring it in line with the income columns.
foo = data["movie_budget"].replace("[$,]", "", regex=True).astype("Float64").mul(1000)
foo.head(3)

In [None]:
# Rebuild the frame from the raw data; this version also scales movie_budget
# from thousands of dollars to dollars.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        # The budget is given in thousands, hence the factor of 1000.
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64") * 1000,
        # Drop the "mins" suffix and store the length as a nullable integer.
        film_length=lambda df: df["film_length"].str.removesuffix("mins").astype("Int64"),
        # "%B, %Y" matches a full month name, a comma and a four-digit year.
        release_date=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y"),
        release_year=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y").dt.year.astype("Int64"),
    )
)

In [None]:
# Count each distinct actor name; misspelled variants show up as extra entries.
data.loc[:, "bond_actor"].value_counts()

In [None]:
# Trial run: normalise the two misspelled actor-name fragments, one at a time.
foo = data["bond_actor"].str.replace("Shawn", "Sean")
foo = foo.str.replace("MOORE", "Moore")

In [None]:
# Rebuild the frame from the raw data; this version also corrects the actor
# name spellings.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        # The budget is given in thousands, hence the factor of 1000.
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64") * 1000,
        # Drop the "mins" suffix and store the length as a nullable integer.
        film_length=lambda df: df["film_length"].str.removesuffix("mins").astype("Int64"),
        # "%B, %Y" matches a full month name, a comma and a four-digit year.
        release_date=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y"),
        release_year=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y").dt.year.astype("Int64"),
        # Substring fixes for the misspelled actor names.
        bond_actor=lambda df: df["bond_actor"].str.replace("Shawn", "Sean").str.replace("MOORE", "Moore"),
    )
)

In [None]:
# Count each distinct car manufacturer to spot spelling variants.
data.loc[:, "car_manufacturer"].value_counts()

In [None]:
# Rebuild the frame from the raw data; this version also corrects the
# "Astin" misspelling in car_manufacturer.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        # The budget is given in thousands, hence the factor of 1000.
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64") * 1000,
        # Drop the "mins" suffix and store the length as a nullable integer.
        film_length=lambda df: df["film_length"].str.removesuffix("mins").astype("Int64"),
        # "%B, %Y" matches a full month name, a comma and a four-digit year.
        release_date=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y"),
        release_year=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y").dt.year.astype("Int64"),
        # Substring fixes for the misspelled actor and manufacturer names.
        bond_actor=lambda df: df["bond_actor"].str.replace("Shawn", "Sean").str.replace("MOORE", "Moore"),
        car_manufacturer=lambda df: df["car_manufacturer"].str.replace("Astin", "Aston"),
    )
)

In [None]:
# Recount the manufacturers on the rebuilt frame.
data.loc[:, "car_manufacturer"].value_counts()

In [None]:
# Checking for invalid outliers: summary statistics (count, mean, min, max,
# quartiles) make implausible values in these two columns easy to spot.
data.loc[:, ["film_length", "martinis_consumed"]].describe()

In [None]:
# Rebuild the frame from the raw data; this version also replaces the two
# outlier values (film_length 1200 -> 120, martinis_consumed -6 -> 6).
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        # The budget is given in thousands, hence the factor of 1000.
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64") * 1000,
        # Drop the "mins" suffix, cast to nullable integer, then swap 1200 for 120.
        film_length=lambda df: df["film_length"].str.removesuffix("mins").astype("Int64").replace(1200, 120),
        # "%B, %Y" matches a full month name, a comma and a four-digit year.
        release_date=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y"),
        release_year=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y").dt.year.astype("Int64"),
        # Substring fixes for the misspelled actor and manufacturer names.
        bond_actor=lambda df: df["bond_actor"].str.replace("Shawn", "Sean").str.replace("MOORE", "Moore"),
        car_manufacturer=lambda df: df["car_manufacturer"].str.replace("Astin", "Aston"),
        # Swap the value -6 for 6.
        martinis_consumed=lambda df: df["martinis_consumed"].replace(-6, 6),
    )
)

# Removing Duplicate Data

In [None]:
# Boolean Series flagging duplicated rows; keep=False marks every member of a
# duplicated group rather than only the later repeats.
mask = data.duplicated(keep=False)
mask

In [None]:
# Show every row that has an exact duplicate elsewhere in the frame.
data[data.duplicated(keep=False)]

In [None]:
# Final cleansing pipeline: every earlier fix plus removal of duplicated rows.
data = (
    james_bond_data
    .rename(columns=new_column_names)
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Strip "$" and "," before casting to a number.
        income_usa=lambda df: df["income_usa"].replace("[$,]", "", regex=True).astype("Float64"),
        income_world=lambda df: df["income_world"].replace("[$,]", "", regex=True).astype("Float64"),
        # The budget is given in thousands, hence the factor of 1000.
        movie_budget=lambda df: df["movie_budget"].replace("[$,]", "", regex=True).astype("Float64") * 1000,
        # Drop the "mins" suffix, cast to nullable integer, then swap 1200 for 120.
        film_length=lambda df: df["film_length"].str.removesuffix("mins").astype("Int64").replace(1200, 120),
        # "%B, %Y" matches a full month name, a comma and a four-digit year.
        release_date=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y"),
        release_year=lambda df: pd.to_datetime(df["release_date"], format="%B, %Y").dt.year.astype("Int64"),
        # Substring fixes for the misspelled actor and manufacturer names.
        bond_actor=lambda df: df["bond_actor"].str.replace("Shawn", "Sean").str.replace("MOORE", "Moore"),
        car_manufacturer=lambda df: df["car_manufacturer"].str.replace("Astin", "Aston"),
        # Swap the value -6 for 6.
        martinis_consumed=lambda df: df["martinis_consumed"].replace(-6, 6),
    )
    # ignore_index=True renumbers the surviving rows 0..n-1.
    .drop_duplicates(ignore_index=True)
)

# Storing Your Cleansed Data

In [None]:
# Write the cleansed frame next to the raw file; index=False keeps the row
# index out of the CSV.
path = Path.cwd().joinpath('..', 'files', 'csv', 'james_bond_data_cleansed.csv')
data.to_csv(path, index=False)

# Performing Data Analysis Using Python

In [None]:
# Q1: Regression analysis to see if the Rotten Tomatoes and IMDb rating sets are related.
import pandas as pd
import matplotlib.pyplot as plt

# Reload the cleansed file so the analysis starts from the stored result.
path = Path.cwd() / '..' / 'files' / 'csv' /'james_bond_data_cleansed.csv'
# A CSV stores no dtype information: without parse_dates the release_date
# column would come back as plain text instead of the datetime it was saved from.
data = pd.read_csv(path, parse_dates=["release_date"]).convert_dtypes()

In [None]:
# Print the per-column dtype / non-null report for the reloaded `data` frame.
data.info()

In [None]:
# Scatter the two rating sources against each other, IMDb on the x axis.
fig, ax = plt.subplots()
ax.scatter(data["imdb"], data["rotten_tomatoes"])
ax.set(
    title="Scatter Plot of Ratings",
    xlabel="Average IMDb Rating",
    ylabel="Average Rotten Tomatoes Rating",
)
fig.show()

In [None]:
from sklearn.linear_model import LinearRegression

# The feature must be two-dimensional (a one-column frame); the target is a
# plain Series.
x = data[["imdb"]]
y = data["rotten_tomatoes"]

# fit() returns the estimator itself, so construction and fitting can be chained.
regr = LinearRegression().fit(x, y)

# Text labels reused by the annotated plot: goodness of fit and line equation.
r_squared = f"R-Squared: {regr.score(x, y):.2f}"
best_fit = f"y = {regr.coef_[0]:.4f}x{regr.intercept_:+.4f}"
y_pred = regr.predict(x)

print(r_squared)
print(best_fit)
print('y_pred: ', y_pred)
print('x: ', x)


In [None]:
# Same scatter as before, now with the fitted line and its statistics drawn on.
fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x, y_pred, color="red")
# Annotations are positioned in data coordinates.
ax.text(7.25, 5.5, r_squared, fontsize=10)
ax.text(7.25, 7, best_fit, fontsize=10)
ax.set(
    title="Scatter Plot of Ratings",
    xlabel="Average IMDb Rating",
    ylabel="Average Rotten Tomatoes Rating",
)
fig.show()

In [None]:
# Q2: Are there any insights to be gleaned from analyzing the lengths of the movies?
# Prepare the dataset: count the films in 7 equal-width length bins, then put
# the bins in ascending order instead of ordering by count.
length = data["film_length"].value_counts(bins=7)
length = length.sort_index()
print(type(length))
length

In [None]:
# Bar chart of the binned film lengths, drawn on an explicitly created Axes.
fig, ax = plt.subplots()
length.plot(
    kind="bar",
    ax=ax,
    title="Film Length Distribution",
    xlabel="Time Range (mins)",
    ylabel="Count",
)
fig.show()

In [None]:
# Summary statistics for the running time.
data.loc[:, "film_length"].aggregate(["min", "max", "mean", "std"])

In [None]:
# Q3: Is there a relationship between the number of enemies James Bond has
# killed and the user ratings of the movie in which they were killed?
fig, ax = plt.subplots()
ax.scatter(data["imdb"], data["bond_kills"])
ax.set(
    title="Scatter Plot of Kills vs Ratings",
    xlabel="Average IMDb Rating",
    ylabel="Kills by Bond",
)
fig.show()

# Communicating Your Findings

In [None]:
# Narrative note for this section. It is a bare string expression, so running
# the cell simply echoes the text back as the cell's output.
'''communicate your findings to other interested parties. 
After all, they’re not For Your Eyes Only. 
You could do this using a report or presentation. 
You’ll likely discuss your data sources and analysis methodology before stating your conclusions. 
Having the data and methodology behind your conclusions gives them authority. 
'''

# Conclusion

In [None]:
# The importance of a data analysis workflow
# The purpose of the main stages in a data analysis workflow
# Common techniques for cleansing data
# How to use some common data analysis methods to meet objectives
# How to display the results of a data analysis graphically.