# Welcome to Colab Enterprise <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">

Connect to a Runtime and begin!

<div class="markdown-google-sans">

## **Getting started**
</div>

The document you are reading is not a static web page, but an interactive environment called a **Colab notebook** that lets you write and execute code.

For example, the code cell below uses **numpy** to generate some random data, and uses **matplotlib** to visualize it. To edit the code, just click the cell and start editing.

In [128]:
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [129]:
from google.cloud import bigquery
project_id = 'ml-real-estate-363920'
client = bigquery.Client(project=project_id)

In [130]:
dataset_ref = client.dataset("imdb", project="bigquery-public-data")
table_ref = dataset_ref.table("reviews")
table = client.get_table(table_ref)

query = """
SELECT date, state, positive_tests_viral, total_tests_viral, death
FROM `bigquery-public-data.covid19_tracking.state_testing_and_outcomes`
WHERE date = '2021-01-01' AND positive_tests_viral IS NOT NULL AND total_tests_viral IS NOT NULL AND death IS NOT NULL
ORDER BY state
LIMIT 1000
"""
query_job = client.query(query)  # API request

In [131]:
results = query_job.result()  # Waits for job to complete.

import pandas as pd
df = pd.DataFrame([dict(row) for row in results])
print(df)

          date state  positive_tests_viral  total_tests_viral  death
0   2021-01-01    AK                 54763            1275750    206
1   2021-01-01    CT                225168            4593460   5995
2   2021-01-01    FL               1692583           14041159  21990
3   2021-01-01    GA                549842            5401054  10958
4   2021-01-01    IA                260753            2228904   3898
5   2021-01-01    IN                589048            5730043   8371
6   2021-01-01    KY                182341            3148606   2623
7   2021-01-01    MA                437155           10944699  12423
8   2021-01-01    MD                340048            5761534   5942
9   2021-01-01    ME                 24975            1095386    347
10  2021-01-01    MI                618299            8202753  13018
11  2021-01-01    MO                440219            3665980   5540
12  2021-01-01    NE                191386            1755820   1651
13  2021-01-01    OH              

### Creating Target Data

**High Death Rate:**
Typically above 5%.
These are usually associated with highly lethal viruses in specific outbreaks. For example, certain hemorrhagic fevers like Ebola can have death rates exceeding 50% in some outbreaks.

**Medium Death Rate:**
Ranges from about 1% to 5%.
This category might be observed in more severe outbreaks of certain influenza strains or in other diseases that are serious but not as uniformly lethal as those in the high category.

**Low Death Rate:**
Generally below 1%.
This would include many common viral infections, like seasonal influenza in most years, which typically have a low mortality rate, especially in populations with access to medical care.


In [132]:
df['death_rate'] = df['death'] / df['positive_tests_viral']
df['death_rate'] = df['death_rate'].apply(lambda x: 'high' if x > 0.05 else ('medium' if x > 0.01 else 'low'))
df

Unnamed: 0,date,state,positive_tests_viral,total_tests_viral,death,death_rate
0,2021-01-01,AK,54763,1275750,206,low
1,2021-01-01,CT,225168,4593460,5995,medium
2,2021-01-01,FL,1692583,14041159,21990,medium
3,2021-01-01,GA,549842,5401054,10958,medium
4,2021-01-01,IA,260753,2228904,3898,medium
5,2021-01-01,IN,589048,5730043,8371,medium
6,2021-01-01,KY,182341,3148606,2623,medium
7,2021-01-01,MA,437155,10944699,12423,medium
8,2021-01-01,MD,340048,5761534,5942,medium
9,2021-01-01,ME,24975,1095386,347,medium


In [133]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Feature Selection
X = df[['positive_tests_viral','total_tests_viral', 'death']]
y = df['death_rate']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalization
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=2)  # You can change the number of neighbors
knn.fit(X_train, y_train)

# Model Evaluation
y_pred = knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[0 0 0]
 [0 1 1]
 [1 0 2]]
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         0
         low       1.00      0.50      0.67         2
      medium       0.67      0.67      0.67         3

    accuracy                           0.60         5
   macro avg       0.56      0.39      0.44         5
weighted avg       0.80      0.60      0.67         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [134]:
# Assuming you have a trained model named `knn` and normalized test data `X_test_normalized`
predictions = knn.predict(X_test)

# Assuming you know the column names
column_names = ['positive_tests_viral', 'total_tests_viral', 'death'] # Replace with your actual column names
X_test_df = pd.DataFrame(scaler.inverse_transform(X_test), columns=column_names)

X_test_df['Predictions'] = predictions
print(X_test_df)

merged_df = df.merge(X_test_df, on='positive_tests_viral', how='left')

   positive_tests_viral  total_tests_viral   death Predictions
0              156276.0          2652670.0  1477.0         low
1               24975.0          1095386.0   347.0        high
2               54763.0          1275750.0   206.0      medium
3              340048.0          5761534.0  5942.0      medium
4              367855.0          3148938.0  5296.0      medium


In [127]:
predicted_df = merged_df.drop(columns=['total_tests_viral_y', 'death_y'])
predicted_df = predicted_df.rename(columns={'total_tests_viral_x': 'total_tests_viral'}, inplace=False)
predicted_df = predicted_df.rename(columns={'death_x': 'death'}, inplace=False)
predicted_df.dropna()

Unnamed: 0,date,state,positive_tests_viral,total_tests_viral,death,death_rate,Predictions
0,2021-01-01,AK,54763,1275750,206,low,medium
8,2021-01-01,MD,340048,5761534,5942,medium,medium
9,2021-01-01,ME,24975,1095386,347,medium,medium
15,2021-01-01,OR,156276,2652670,1477,low,low
17,2021-01-01,SC,367855,3148938,5296,medium,medium
