In [None]:
import pandas as pd
import sqlite3

In [None]:
# Load the two raw tables; the connection is passed to pandas as a SQLite URI string.
db_path = 'survey.db3'
db_uri = f"sqlite:///{db_path}"

# Support tickets: fetch every row and report the row count.
query = "SELECT * FROM support;"
support_df = pd.read_sql_query(sql=query, con=db_uri)
print(len(support_df))

# Survey responses: same pattern against the second table.
query = "SELECT * FROM survey;"
survey_df = pd.read_sql_query(sql=query, con=db_uri)
print(len(survey_df))

### Question 2: Transform (5 marks)  

#### Data Schema for the `support` Table  

| Column Name     | Data Type  | Description  |
|----------------|-----------|---------------------------------------------------------|
| `id`           | Discrete  | The unique identifier of the support ticket. |
| `customer_id`  | Discrete  | The unique identifier of the customer. |
| `category`     | Nominal   | The category of the support request, can be one of **Feedback, Billing Enquiry, Bug, Installation Problem, Other**.  |
| `status`       | Nominal   | The current status of the support ticket, one of **Open, In Progress, or Resolved**. |
| `creation_date` | Discrete  | The date the ticket was created. |
| `response_time` | Discrete  | The number of days taken to respond to the support ticket. |
| `resolution_time` | Continuous | The number of hours taken to resolve the support ticket, rounded to 2 decimal places.  |

- Perform **data cleaning** on the `support` table.  
- Make sure that the data types are correct as specified in the above table.
- Replace missing values according to the specified criteria in the below table.
- You can use the SQLite **COALESCE** function or you can use any Python function.

#### **Data Cleaning Requirements**  

| Column Name       | Transformation Rule |
|------------------|---------------------------------------------------------|
| `customer_id`    | Replace missing/incorrect values with **0** if any. |
| `category`       | Replace missing/incorrect values with **"Other"** if any. |
| `status`         | Replace missing/incorrect values with **"Resolved"** if any. |
| `response_time`  | Replace missing/incorrect values with **0** if any.|
---

In [None]:
# Clean the `support` frame according to the schema and the cleaning-rules table.
# Every step re-derives its column from whatever is currently stored, so the cell
# can be re-run safely (the previous version raised on a second run because the
# `.str` accessor was applied to an already-numeric `resolution_time`).

# --- category: whitelist from the schema table, fallback "Other" -------------
# The schema spells it "Billing Enquiry"; the earlier list had "Billing enquiry",
# which silently turned every correctly spelled row into "Other".
cat_list = ['Feedback', 'Billing Enquiry', 'Bug', 'Installation Problem', 'Other']
# Match case-insensitively and ignore surrounding blanks, then store the schema
# spelling. NaN becomes a non-matching string, maps to NaN, and is filled below.
cat_lookup = {name.lower(): name for name in cat_list}
category_key = support_df['category'].astype(str).str.strip().str.lower()
support_df['category'] = category_key.map(cat_lookup).fillna('Other')

# --- status: whitelist, fallback "Resolved" ---------------------------------
# `isin` is False for NaN, so missing values are replaced as well.
status_list = ['Open', 'In Progress', 'Resolved']
support_df['status'] = support_df['status'].where(
    support_df['status'].isin(status_list), 'Resolved'
)

# --- customer_id / response_time: integer columns, fallback 0 ---------------
# `errors='coerce'` turns non-numeric entries into NaN so that missing and
# incorrect values are both replaced by 0 before the integer cast (a plain
# `astype(int)` raises as soon as a NaN is present).
# NOTE(review): assumes customer_id is numeric, as the "Discrete" type and the
# "replace with 0" rule suggest; confirm against the data.
for int_col in ['customer_id', 'response_time']:
    support_df[int_col] = (
        pd.to_numeric(support_df[int_col], errors='coerce').fillna(0).astype(int)
    )

# --- resolution_time: float hours, 2 decimal places -------------------------
# Drop an optional "hours"/"hour" unit, then parse. No replacement rule is given
# for this column, so unparseable or missing entries stay NaN.
resolution_text = support_df['resolution_time'].astype(str).str.strip()
resolution_text = resolution_text.str.replace(r'(?i)\s*hours?\b', '', regex=True)
support_df['resolution_time'] = pd.to_numeric(resolution_text, errors='coerce').round(2)

# Invariants promised by the cleaning rules.
assert support_df['category'].isin(cat_list).all()
assert support_df['status'].isin(status_list).all()

support_df.dtypes

### Question 3: Transform (5 marks)

- Write an **SQL Query** to calculate the minimum and maximum response time for each category of support ticket. 
- Fix the missing/incorrect values as described in the above table. 
- Your output should include the columns `category`, `min_response` and `max_response`. 
- Values should be rounded to two decimal places where appropriate. 
- Use the `pd.read_sql()` function to fetch the data and return as a `pandas` DataFrame.
---

In [None]:
db_path = 'survey.db3'

# Minimum and maximum response time per ticket category, computed in SQL.
#
# The CTE applies the cleaning rules before aggregating:
#   * category is matched case-insensitively against the schema values and
#     stored in the schema spelling; NULL, '-' and any other unknown value fall
#     through to 'Other' (the earlier query only caught NULL and '-').
#   * response_time is COALESCEd to 0 so missing values take part in MIN/MAX
#     (SQLite aggregates skip NULLs), then CAST to INTEGER, which also maps
#     non-numeric text to 0.
# Cleaning inside a CTE lets the outer query group by the cleaned `category`
# directly; no alias workaround or pandas-side column rename is needed.
# response_time is a whole number of days, so no decimal rounding is applied.
query = """
    WITH cleaned AS (
        SELECT
            CASE LOWER(TRIM(category))
                WHEN 'feedback'             THEN 'Feedback'
                WHEN 'billing enquiry'      THEN 'Billing Enquiry'
                WHEN 'bug'                  THEN 'Bug'
                WHEN 'installation problem' THEN 'Installation Problem'
                ELSE 'Other'
            END AS category,
            CAST(COALESCE(response_time, 0) AS INTEGER) AS response_time
        FROM support
    )
    SELECT
        category,
        MIN(response_time) AS min_response,
        MAX(response_time) AS max_response
    FROM cleaned
    GROUP BY category
    ORDER BY category;
    """

# The task asks for `pd.read_sql`; for a query string it returns the same frame
# as `pd.read_sql_query`.
cat_df = pd.read_sql(query, f"sqlite:///{db_path}")
cat_df

In [None]:
# Cross-check in pandas: per-category min/max of response_time on the cleaned frame.
# Named aggregation yields the final column names directly.
response_df = (
    support_df
    .groupby('category')['response_time']
    .agg(min_response='min', max_response='max')
    .reset_index()
)
response_df

In [None]:
# Write the per-category summary to disk without the positional index column.
output_csv = 'respond_time.csv'
cat_df.to_csv(output_csv, index=False)