# Extract Table Purge Scores (Fixed)

This notebook connects to the MySQL database and extracts purge score metrics from the `v_table_purge_scores` view.

In [1]:
import getpass
import os
import warnings

import mysql.connector
import pandas as pd
warnings.filterwarnings('ignore')

In [2]:
# Database connection parameters.
# Every value can be overridden through an environment variable so that no
# secret has to be stored in the notebook. Host, user, and database fall back
# to the values this notebook previously hardcoded.
_db_password = os.environ.get('THIRDEYE_DB_PASSWORD')
if _db_password is None:
    # No password in the environment: prompt for it interactively instead of
    # keeping one in the notebook source.
    _db_password = getpass.getpass('MySQL password for thirdeye: ')

DB_CONFIG = {
    'host': os.environ.get('THIRDEYE_DB_HOST', 'localhost'),
    'user': os.environ.get('THIRDEYE_DB_USER', 'thirdeye_user'),
    'password': _db_password,
    'database': os.environ.get('THIRDEYE_DB_NAME', 'thirdeye'),
}

# Create direct MySQL connection
connection = mysql.connector.connect(**DB_CONFIG)
print("Database connection established")

Database connection established


In [5]:
# Purge-score extraction: select the five individual score columns plus
# purge_score from the view, with no filtering or ordering applied.
query = """
SELECT 
    size_score,
    access_staleness_score,
    usage_frequency_score,
    refresh_waste_score,
    user_engagement_score,
    purge_score
FROM v_table_purge_scores
"""

# Run the query over the open connection and load the result set into a DataFrame.
df = pd.read_sql(sql=query, con=connection)

# Report how much data came back.
print(f"Extracted {df.shape[0]} records")
print(f"DataFrame shape: {df.shape}")

Extracted 161971 records
DataFrame shape: (161971, 6)


In [6]:
# Display basic info about the DataFrame
print("DataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
print("\nFirst 5 rows:")
# Left as the last expression so the notebook renders it as a rich table.
df.head()

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161971 entries, 0 to 161970
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   size_score              161971 non-null  int64  
 1   access_staleness_score  161971 non-null  int64  
 2   usage_frequency_score   161971 non-null  int64  
 3   refresh_waste_score     161971 non-null  int64  
 4   user_engagement_score   161971 non-null  int64  
 5   purge_score             161971 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 7.4 MB
None

First 5 rows:


Unnamed: 0,size_score,access_staleness_score,usage_frequency_score,refresh_waste_score,user_engagement_score,purge_score
0,10,4,7,7,8,7.2
1,10,1,10,7,10,7.0
2,10,1,10,7,10,7.0
3,10,1,10,7,10,7.0
4,10,1,10,7,10,7.0


In [7]:
# Summarise every column (count, mean, std, min, quartiles, max).
print("Summary Statistics:")
summary_stats = df.describe()
# Bare name as the final expression so the table gets the rich notebook display.
summary_stats

Summary Statistics:


Unnamed: 0,size_score,access_staleness_score,usage_frequency_score,refresh_waste_score,user_engagement_score,purge_score
count,161971.0,161971.0,161971.0,161971.0,161971.0,161971.0
mean,2.121942,1.585271,9.119639,6.850912,9.505609,4.219379
std,0.993461,1.188815,1.925265,0.757714,1.063942,0.375665
min,1.0,1.0,1.0,3.0,1.0,2.2
25%,2.0,1.0,10.0,7.0,10.0,3.9
50%,2.0,1.0,10.0,7.0,10.0,4.2
75%,2.0,1.0,10.0,7.0,10.0,4.2
max,10.0,4.0,10.0,7.0,10.0,7.2


In [None]:
# Count missing values column by column, then add them up for an overall total.
null_counts = df.isnull().sum()
print("Null values per column:")
print(null_counts)
print(f"\nTotal null values: {null_counts.sum()}")

In [None]:
# Close the database connection.
# The DataFrame `df` was already loaded by an earlier cell, so the cells that
# only inspect `df` do not need the connection; re-running the extraction
# cell after this point requires re-running the connection cell first.
connection.close()
print("Database connection closed")