<a href="https://www.kaggle.com/code/nomanrafi28/covid-19?scriptVersionId=210660584" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer


In [None]:
# Load the COVID-19 CSV from the attached Kaggle input dataset.
DATA_PATH = '/kaggle/input/covid19-dataset/covid_data.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Render the freshly loaded DataFrame for a first visual check.
df

In [None]:
# (rows, columns) of the dataset as loaded.
df.shape

In [None]:
# Report how many distinct values each text (object-dtype) column holds.
# A missing value, if present, is counted as one of the distinct values.
for col_name in df.columns:
    column = df[col_name]
    if column.dtypes != 'object':
        continue
    cat_column = len(column.unique())
    print(f"{col_name} has {cat_column}-unique values")

In [None]:
# Dimensions again; df has not been modified since it was loaded.
df.shape

In [None]:
# List the column labels of the loaded dataset.
df.columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. The call parentheses matter: a bare `df.describe` only displays the
# bound method object and computes nothing.
df.describe()

In [None]:
# Print the per-column dtype and non-null count summary of the raw data.
df.info()

In [None]:
# Walk the dtype table and print the number of distinct values found in each
# object-dtype column.
for col_name, col_dtype in df.dtypes.items():
    if col_dtype == 'object':
        cat_column = df[col_name].unique().size
        print(f"{col_name} has {cat_column} -- unique values")

In [None]:
# Still the original dimensions; no rows or columns have been dropped yet.
df.shape

In [None]:
# Number of missing values in each column, before any cleaning.
df.isnull().sum()

In [None]:
# Keep only the columns that are at least half populated.
# NOTE: `thresh` is the minimum number of NON-null values a column needs in
# order to be kept, so despite its name `missing_threshold` is a
# required-present count, not a limit on missing values.
# `thresh` is documented as an int, so use the integer ceiling of len(df) / 2;
# rounding up gives the same cut-off as comparing counts against 0.5 * len(df).
missing_threshold = (len(df) + 1) // 2
df = df.dropna(thresh=missing_threshold, axis=1)

In [None]:
# Dimensions after dropping the sparsely populated columns.
df.shape

In [None]:
# Median-impute every float64/int64 column: fit the imputer on those columns
# and write the filled array back into the same column positions.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='median')
numeric_filled = imputer.fit_transform(df[numerical_cols])
df.loc[:, numerical_cols] = numeric_filled

In [None]:
# Re-check dimensions after numeric imputation (values were filled into
# existing columns, so the shape is expected to be unchanged).
df.shape

In [None]:
# Re-inspect dtypes and non-null counts now that the numeric gaps are filled.
df.info()

In [None]:
# Dimensions again; nothing has transformed df since the previous check.
df.shape

In [None]:
# Missing values per column after numeric imputation; only columns that are
# not float64/int64 can still report gaps here.
df.isnull().sum()

In [None]:
# Fill gaps in the text (object-dtype) columns with each column's most
# frequent value. 'date' is deliberately excluded: mode-imputing it would stamp
# the most common date onto rows that have no date at all, so the later date
# parsing step could no longer drop rows that lack a usable date.
categorical_cols = df.select_dtypes(include=['object']).columns.drop('date', errors='ignore')

# SimpleImputer rejects an empty column selection, so skip when nothing is left.
if not categorical_cols.empty:
    imputer = SimpleImputer(strategy='most_frequent')
    df.loc[:, categorical_cols] = imputer.fit_transform(df[categorical_cols])

In [None]:
# Work on an independent copy so the assignments below never hit a view
# (avoids SettingWithCopyWarning).
df = df.copy()

if 'date' not in df.columns:
    print("The 'date' column is not present in the dataset.")
else:
    # Parse the dates; anything unparseable becomes NaT and its row is removed.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date']).copy()

    # Derive the calendar features, then discard the raw 'date' column.
    date_parts = df['date'].dt
    df = df.assign(
        month=date_parts.month,
        day=date_parts.day,
        day_of_week=date_parts.dayofweek,
    ).drop(columns=['date'])

# Show the first rows of the processed dataset.
print(df.head())


In [None]:
# Show the DataFrame after the date handling step.
df

In [None]:
# Dimensions after the date step: if 'date' existed, rows with unparseable
# dates were dropped and 'date' was replaced by month/day/day_of_week.
df.shape

In [None]:
# Missing values per column after imputation and date handling.
df.isnull().sum()

In [None]:
# Replace the text labels in the listed categorical columns with integer codes.
# One encoder object is refit per column, so after the loop it only remembers
# the classes of the last column processed.
label_encoder = LabelEncoder()
for col in ('continent', 'location', 'iso_code'):  # categorical columns to encode
    encoded_labels = label_encoder.fit_transform(df[col])
    df[col] = encoded_labels

In [None]:
# Show the DataFrame after label encoding; 'continent', 'location' and
# 'iso_code' now hold encoder codes instead of text.
df

In [None]:
# Missing values per column after label encoding, checked right before scaling.
df.isnull().sum()

In [None]:
# Standardize every column and write the scaled values back under the same
# column labels. This includes the label-encoded and calendar columns.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)
df[df.columns] = scaled_values

In [None]:
# Show the standardized DataFrame.
df

In [None]:
# Repeat display of the standardized DataFrame.
df 

In [None]:
# Count the gaps per column, then show only the columns that still have any.
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Distinct-value report for any object-dtype column that is still present at
# this point in the notebook.
for col_name in df:
    is_text_column = df[col_name].dtypes == 'object'
    if is_text_column:
        cat_column = len(df[col_name].unique())
        print(f"{col_name} has {cat_column} -- unique values")

In [None]:
# Column labels of the processed (encoded and scaled) frame.
df.columns

In [None]:
# Draw the null mask as a heatmap: one cell per (row, column) entry, coloured
# by whether the value is missing.
null_mask = df.isnull()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Data Heatmap')
plt.show()

In [None]:
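# Plot total cases over time
# NOTE(review): disabled. By this point the preprocessing above has dropped
# 'date' from df and standardized every column, so this plot would need the
# original, unscaled data to be re-enabled.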
# plt.figure(figsize=(12, 6))
# df.groupby('date')['total_cases'].sum().plot()
# plt.title('Global Total COVID-19 Cases Over Time')
# plt.xlabel('Date')
# plt.ylabel('Total Cases')
# plt.show()

In [None]:
# Plot total deaths over time
# plt.figure(figsize=(12, 6))
# df.groupby('date')['total_deaths'].sum().plot(color='red')
# plt.title('Global Total COVID-19 Deaths Over Time')
# plt.xlabel('Date')
# plt.ylabel('Total Deaths')
# plt.show()

In [None]:
# df['date'] = pd.to_datetime(df['date'])
# df['year'] = df['date'].dt.year
# df['month'] = df['date'].dt.month

In [None]:
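# Testing Capacity vs Cases
# NOTE(review): disabled. The call below plots 'total_cases' on x and passes
# no data= argument, yet the title/x-label talk about tests; confirm the
# intended column before re-enabling.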
# plt.figure(figsize=(10, 6))
# sns.scatterplot(x='total_cases', y='new_cases', hue='location')
# plt.title('Testing Capacity vs New COVID-19 Cases')
# plt.xlabel('Total Tests')
# plt.ylabel('New Cases')
# plt.legend(loc='upper right')
# plt.show()