In [None]:
# Import the drive module from google.colab
from google.colab import drive

# Attach Google Drive to the Colab filesystem so its files appear under /content/drive
drive.mount(mountpoint='/content/drive')

# After mounting, you can access your Drive files under /content/drive/MyDrive


In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the unemployment dataset from the mounted Google Drive folder.
# Update csv_path if the file lives somewhere else in your Drive.
csv_path = '/content/drive/MyDrive/cipherbyte/unemployment/Unemployment_Rate_upto_11_2020 - Unemployment_Rate_upto_11_2020.csv'
df = pd.read_csv(csv_path)

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Print column names to verify
print("Column names:", df.columns)

# Ensure there are no leading or trailing whitespaces in the column names
df.columns = df.columns.str.strip()

# Check that the columns the analysis relies on exist, renaming known
# alternative spellings when they do not.
required_columns = {'Date', 'Unemployment_Rate'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    print(f"Missing columns: {missing_columns}")
    print(f"Available columns: {df.columns}")
    # Known alternative names -> canonical names; extend as needed.
    column_aliases = {
        'Estimated Unemployment Rate (%)': 'Unemployment_Rate',
        'date': 'Date',
    }
    # Only rename when the canonical column is actually missing; renaming
    # unconditionally could produce two columns with the same name.
    renames = {
        source: target
        for source, target in column_aliases.items()
        if target in missing_columns and source in df.columns
    }
    df = df.rename(columns=renames)

# Verify the columns after renaming
print("Columns after renaming (if any):", df.columns)

# Fail early with a clear message if the required columns are still absent
if 'Unemployment_Rate' not in df.columns or 'Date' not in df.columns:
    raise ValueError("Required columns are missing after renaming. Please check the dataset.")

# Check for missing values
print("\nMissing values in the dataset:")
print(df.isnull().sum())

# Drop rows that contain a missing value in any column
df = df.dropna()

# Convert the 'Date' column to datetime.
# NOTE(review): this dataset appears to store dates as day-month-year text
# (e.g. " 31-01-2020") with padding whitespace - confirm against the CSV.
# Stripping the padding and passing dayfirst=True keeps values such as
# "05-06-2020" from being read month-first.
df['Date'] = pd.to_datetime(df['Date'].astype(str).str.strip(), dayfirst=True)

# Draw the unemployment trend for the full date range: one overall figure,
# plus a per-country figure when the dataset has a 'Country' column.
trend_plots = [(None, 'Unemployment Rate Over Time')]
if 'Country' in df.columns:
    trend_plots.append(('Country', 'Unemployment Rate Over Time by Country'))

for hue_column, plot_title in trend_plots:
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=df, x='Date', y='Unemployment_Rate', hue=hue_column)
    plt.title(plot_title)
    plt.xlabel('Date')
    plt.ylabel('Unemployment Rate (%)')
    # A legend is only meaningful when the lines are split by a hue column
    if hue_column is not None:
        plt.legend(title=hue_column)
    plt.show()

# Display summary statistics
print("\nSummary statistics of the unemployment rate:")
print(df['Unemployment_Rate'].describe())

# Select every numeric column for the correlation matrix. 'number' covers all
# integer and float widths, not just float64/int64.
numeric_columns = df.select_dtypes(include='number').columns

# Compute the correlation matrix once and reuse it for both the printout
# and the heatmap.
correlation_matrix = df[numeric_columns].corr()
print("\nCorrelation matrix:")
print(correlation_matrix)

# A heatmap is only informative when there is more than one numeric column
if len(numeric_columns) > 1:
    plt.figure(figsize=(10, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

# Restrict the analysis to the Covid-19 period and repeat the trend plots.
covid_start = '2020-03-01'  # Example cut-off date, adjust as needed
in_covid_period = df['Date'] >= covid_start
df_covid = df.loc[in_covid_period]

# One overall figure, plus a per-country figure when 'Country' is available
covid_plots = [(None, 'Unemployment Rate During Covid-19')]
if 'Country' in df.columns:
    covid_plots.append(('Country', 'Unemployment Rate During Covid-19 by Country'))

for hue_column, plot_title in covid_plots:
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=df_covid, x='Date', y='Unemployment_Rate', hue=hue_column)
    plt.title(plot_title)
    plt.xlabel('Date')
    plt.ylabel('Unemployment Rate (%)')
    # A legend is only meaningful when the lines are split by a hue column
    if hue_column is not None:
        plt.legend(title=hue_column)
    plt.show()
