In [3]:
# --------------------------CODE BY MANDI DISHA--------------------------
import psycopg2
import pandas as pd
from psycopg2 import OperationalError

def connect_db():
    """Connect to the PostgreSQL database and return the connection and cursor."""
    try:
        conn = psycopg2.connect(
            host="localhost",
            database="postgres",  # Connected to the 'postgres' database
            user="postgres",
            port="5432",
            password="mandi"
        )
        print("Connection successful!")
        return conn, conn.cursor()
    except OperationalError as e:
        print(f"Connection failed: {e}")
        return None, None

def fetch_movie_data(cur):
    """Run the SQL query and return the results as a Pandas DataFrame."""
    query = """
    SELECT
        m.rating, 
        COUNT(s.title) AS movie_count,
        ROUND(AVG(s.production_budget), 2) AS avg_production_budget,  
        ROUND(AVG(m.userscore), 2) AS avg_userscore,  
        ROUND(AVG(s.worldwide_box_office), 2) AS avg_box_office,
        ROUND((AVG(s.worldwide_box_office) - AVG(s.production_budget)) / AVG(s.production_budget) * 100, 2) AS avg_profit_percentage
    FROM
        sales_v20 s
    JOIN
        movies m
    ON
        s.title = m.title
    AND
        s.release_year = EXTRACT(YEAR FROM m.reldate)
    WHERE
        s.production_budget IS NOT NULL
    AND
        m.userscore IS NOT NULL 
    GROUP BY
        m.rating  
    ORDER BY
        avg_production_budget DESC, 
        avg_userscore DESC,  
        avg_box_office DESC;
    """
    cur.execute(query)
    # Fetch all rows from the query
    rows = cur.fetchall()
    
    # Get column names
    colnames = [desc[0] for desc in cur.description]

    # Return the data as a pandas DataFrame
    return pd.DataFrame(rows, columns=colnames)

def close_db(conn, cur):
    """Close the database connection and cursor."""
    cur.close()
    conn.close()

# Main function to encapsulate everything
def main():
    conn, cur = connect_db()
    if conn and cur:
        movie_data = fetch_movie_data(cur)
        print("Data from the query:")
        print(movie_data.head())  # Display the first few rows

        # Perform some analysis
        # Example: Describe the data
        print("\nSummary statistics:")
        print(movie_data.describe())

        # Example: Sort by average profit percentage
        sorted_data = movie_data.sort_values(by="avg_profit_percentage", ascending=False)
        print("\nTop ratings by profit percentage:")
        print(sorted_data.head())

        # Close the database connection
        close_db(conn, cur)

if __name__ == "__main__":
    main()


#  [END CODE BY MANDI DISHA]





Connection failed: FATAL:  password authentication failed for user "postgres"



In [6]:
#Done by Anastasiya 
#This code connects to a PostgreSQL database, cleans up movie data 
#by removing records with missing values (e.g., for production budget or worldwide box office), 
#and then runs an SQL query to analyze movie performance metrics. 
#The query retrieves the average production budget, metascore, 
#and worldwide box office grouped by the movie's release month. 

import psycopg2
import pandas as pd
from psycopg2 import OperationalError

def connect_db():
    """Connect to the PostgreSQL database and return the connection and cursor."""
    try:
        conn = psycopg2.connect(
            host="localhost",
            database="postgres",  # Connected to the 'postgres' database
            user="postgres",
            port="5432",
            password="Foundation.23"
        )
        print("Connection successful!")
        return conn, conn.cursor()
    except OperationalError as e:
        print(f"Connection failed: {e}")
        return None, None

def fetch_movie_data(cur):
    """Run the SQL query and return the results as a Pandas DataFrame."""
    query = """
    SELECT 
        EXTRACT(MONTH FROM m.reldate) AS release_month,  -- Extracting the month from the release date
        ROUND(AVG(s.production_budget), 2) AS avg_production_budget,  -- Average production budget
        ROUND(AVG(m.metascore), 2) AS avg_metascore,  -- Average metascore
        ROUND(AVG(s.worldwide_box_office), 2) AS avg_box_office  -- Average worldwide box office
    FROM
        sales s
    INNER JOIN
        movies m
    ON
        s.title = m.title
    AND
        s.release_year = EXTRACT(YEAR FROM m.reldate)
    WHERE
        s.production_budget IS NOT NULL  -- Only calculate for movies with known budgets
    AND
        m.metascore IS NOT NULL  -- Only calculate for movies with known metascores
    GROUP BY
        EXTRACT(MONTH FROM m.reldate)  -- Grouping by the month
    ORDER BY
        avg_production_budget DESC,  -- Then by production budget (descending)
        avg_metascore,
        release_month;
    """
    cur.execute(query)
    
    # Fetch all rows from the query
    rows = cur.fetchall()
    
    # Get column names
    colnames = [desc[0] for desc in cur.description]

    # Return the data as a pandas DataFrame
    return pd.DataFrame(rows, columns=colnames)

def clean_data(cur):
    """Delete rows with NULL values in production_budget or worldwide_box_office."""
    delete_query = """
    DELETE FROM sales
    WHERE production_budget IS NULL
    OR worldwide_box_office IS NULL;
    """
    cur.execute(delete_query)


def close_db(conn, cur):
    """Close the database connection and cursor."""
    cur.close()
    conn.close()

# Main function to encapsulate everything
def main():
    conn, cur = connect_db()
    if conn and cur:
        # Clean the data first by removing null/empty values
        clean_data(cur)

        # Fetch the movie data after cleaning
        movie_data = fetch_movie_data(cur)
        print("Data from the query:")
        print(movie_data.head())  # Display the first few rows

        # Perform some analysis
        # Example: Describe the data
        print("\nSummary statistics:")
        print(movie_data.describe())

        # Example: Sort by 'avg_production_budget' (since 'avg_profit_percentage' doesn't exist)
        sorted_data = movie_data.sort_values(by="avg_production_budget", ascending=False)
        print("\nTop movies by production budget:")
        print(sorted_data.head())

        # Commit any changes (in case of DELETE) and close the database connection
        conn.commit()
        close_db(conn, cur)

# Fix the main function entry point
if __name__ == "__main__":
    main()

Connection successful!
Data from the query:
  release_month avg_production_budget avg_metascore avg_box_office
0             5           63597304.48         56.15   205076656.27
1             6           59944112.86         56.54   195103356.17
2             7           55723245.93         55.85   177124325.11
3            11           53939179.75         59.77   160756398.24
4            12           51481538.46         60.52   163654119.23

Summary statistics:
       release_month avg_production_budget avg_metascore avg_box_office
count             12                    12            12             12
unique            12                    12            12             12
top                5           63597304.48         56.15   205076656.27
freq               1                     1             1              1

Top movies by production budget:
  release_month avg_production_budget avg_metascore avg_box_office
0             5           63597304.48         56.15   205076656.27
1    

In [None]:
import psycopg2                        # Library for connecting to PostgreSQL
import pandas as pd                    # Pandas for data manipulation
import statsmodels.api as sm           # For statistical models like OLS regression
import matplotlib.pyplot as plt        # For plotting
import seaborn as sns                  # For advanced visualization
import numpy as np                     # For numerical operations
from psycopg2 import OperationalError   # For handling database connection errors

try:
    # Establish a connection to the PostgreSQL database
    conn = psycopg2.connect(
        host="localhost",               # Hostname or IP of the database
        database="postgres",            # Database name
        user="postgres",                # Username to access the database
        password="doriss"               # Password for the user
    )
    print("Connection successful!")

    # Create a cursor object to execute SQL queries
    cur = conn.cursor()

    # SQL query to create a joined table sales_movies2 from sales2 and movies2
    query = """
    CREATE TABLE IF NOT EXISTS sales_movies2 AS
    SELECT
        b.title,                               -- Column from the first table (sales)
        b.genre AS genre_box_office,           -- Genre from the first table, renamed for clarity
        b.worldwide_box_office,                -- Worldwide box office from the first table
        b.production_budget,                   -- Production budget from the first table
        b.url AS url_box_office,               -- URL from the first table, renamed for clarity
        m.genre AS genre_movies_info,          -- Genre from the second table
        m.RelDate AS release_date_movies_info  -- Release date from the second table, renamed for clarity
    FROM
        sales2 b                               -- First table (sales data)
    INNER JOIN
        movies2 m                              -- Second table (movies info)
    ON
        b.title = m.title                      -- Match on title
    AND
        b.release_year = EXTRACT(YEAR FROM m.RelDate) -- Match on year of release
    WHERE
        m.genre LIKE '%Romance%' OR m.genre LIKE '%Action%' OR m.genre LIKE '%Comedy%' OR m.genre LIKE '%Drama%';  -- Include multiple genres
    """

    # Execute the SQL query to create the joined table
    cur.execute(query)
    conn.commit()
    print("Table 'sales_movies2' created successfully.")

    # SQL query to retrieve relevant columns for regression analysis from the new table
    query = """
    SELECT
        title,
        genre_box_office,   -- Retrieve the genre from the box office table
        production_budget,
        worldwide_box_office
    FROM
        sales_movies2
    WHERE
        production_budget IS NOT NULL
        AND worldwide_box_office IS NOT NULL;
    """

    # Execute the SQL query
    cur.execute(query)

    # Fetch all rows from the query
    rows = cur.fetchall()

    # Extract column names from the cursor description
    colnames = [desc[0] for desc in cur.description]

    # Load the query result into a pandas DataFrame
    df = pd.DataFrame(rows, columns=colnames)

    # Convert numeric columns to proper format, handling any invalid data as NaN
    df['production_budget'] = pd.to_numeric(df['production_budget'], errors='coerce')
    df['worldwide_box_office'] = pd.to_numeric(df['worldwide_box_office'], errors='coerce')

    # Display the first few rows of the DataFrame to verify data integrity
    print(df.head())

    # Handle missing or infinite values by replacing them with NaN and dropping those rows
    df_clean = df.replace([np.inf, -np.inf], np.nan).dropna(
        subset=['production_budget', 'worldwide_box_office']
    )

    # Plot multiple regression lines based on different genres using Seaborn's lmplot
    sns.lmplot(x='production_budget', 
               y='worldwide_box_office', 
               hue='genre_box_office',     # Use genre_box_office for differentiation by genre
               data=df_clean, 
               height=7, 
               aspect=1.5, 
               ci=None,                     # Disable confidence interval
               palette='Set1',               # Use a color palette for differentiation
               scatter_kws={'s': 50})        # Size of the scatter points

    # Customize the plot with title and labels
    plt.title('Regression Analysis: Production Budget vs Worldwide Box Office by Genre (Box Office)')
    plt.xlabel('Production Budget (in USD)')
    plt.ylabel('Worldwide Box Office (in USD)')
    
    # Show the plot
    plt.show()

    # Close the database cursor and connection after completion
    cur.close()
    conn.close()

except OperationalError as e:
    # Handle any errors that occur during the database connection
    print(f"The error '{e}' occurred")

In [None]:
import psycopg2                        # Library for connecting to PostgreSQL
import pandas as pd                    # Pandas for data manipulation
import statsmodels.api as sm           # For statistical models like OLS regression
import matplotlib.pyplot as plt        # For plotting
import seaborn as sns                  # For advanced visualization
import numpy as np                     # For numerical operations
from psycopg2 import OperationalError   # For handling database connection errors

try:
    # Establish a connection to the PostgreSQL database
    conn = psycopg2.connect(
        host="localhost",               # Hostname or IP of the database
        database="postgres",            # Database name
        user="postgres",                # Username to access the database
        password="doriss"               # Password for the user
    )
    print("Connection successful!")

    # Create a cursor object to execute SQL queries
    cur = conn.cursor()

    # SQL query to create a joined table sales_movies2 from sales2 and movies2
    query = """
    CREATE TABLE IF NOT EXISTS sales_movies2 AS
    SELECT
        b.title,                               -- Column from the first table (sales)
        b.genre AS genre_box_office,           -- Genre from the first table, renamed for clarity
        b.worldwide_box_office,                -- Worldwide box office from the first table
        b.production_budget,                   -- Production budget from the first table
        b.url AS url_box_office,               -- URL from the first table, renamed for clarity
        m.genre AS genre_movies_info,          -- Genre from the second table
        m.RelDate AS release_date_movies_info  -- Release date from the second table, renamed for clarity
    FROM
        sales2 b                               -- First table (sales data)
    INNER JOIN
        movies2 m                              -- Second table (movies info)
    ON
        b.title = m.title                      -- Match on title
    AND
        b.release_year = EXTRACT(YEAR FROM m.RelDate) -- Match on year of release
    WHERE
        m.genre LIKE '%Romance%' OR m.genre LIKE '%Action%' OR m.genre LIKE '%Comedy%' OR m.genre LIKE '%Drama%';  -- Include multiple genres
    """

    # Execute the SQL query to create the joined table
    cur.execute(query)
    conn.commit()
    print("Table 'sales_movies2' created successfully.")

    # SQL query to retrieve relevant columns for regression analysis from the new table
    query = """
    SELECT
        title,
        genre_box_office,   -- Retrieve the genre from the box office table
        production_budget,
        worldwide_box_office
    FROM
        sales_movies2
    WHERE
        production_budget IS NOT NULL
        AND worldwide_box_office IS NOT NULL;
    """

    # Execute the SQL query
    cur.execute(query)

    # Fetch all rows from the query
    rows = cur.fetchall()

    # Extract column names from the cursor description
    colnames = [desc[0] for desc in cur.description]

    # Load the query result into a pandas DataFrame
    df = pd.DataFrame(rows, columns=colnames)

    # Convert numeric columns to proper format, handling any invalid data as NaN
    df['production_budget'] = pd.to_numeric(df['production_budget'], errors='coerce')
    df['worldwide_box_office'] = pd.to_numeric(df['worldwide_box_office'], errors='coerce')

    # Display the first few rows of the DataFrame to verify data integrity
    print(df.head())

    # Handle missing or infinite values by replacing them with NaN and dropping those rows
    df_clean = df.replace([np.inf, -np.inf], np.nan).dropna(
        subset=['production_budget', 'worldwide_box_office']
    )

    # Plot multiple regression lines based on different genres using Seaborn's lmplot
    sns.lmplot(x='production_budget', 
               y='worldwide_box_office', 
               hue='genre_box_office',     # Use genre_box_office for differentiation by genre
               data=df_clean, 
               height=7, 
               aspect=1.5, 
               ci=None,                     # Disable confidence interval
               palette='Set1',               # Use a color palette for differentiation
               scatter_kws={'s': 50})        # Size of the scatter points

    # Customize the plot with title and labels
    plt.title('Regression Analysis: Production Budget vs Worldwide Box Office by Genre (Box Office)')
    plt.xlabel('Production Budget (in USD)')
    plt.ylabel('Worldwide Box Office (in USD)')
    
    # Show the plot
    plt.show()

    # Close the database cursor and connection after completion
    cur.close()
    conn.close()

except OperationalError as e:
    # Handle any errors that occur during the database connection
    print(f"The error '{e}' occurred")