# Level 1 Tasks — Cognifyz Data Science Internship
This notebook performs **all Level 1 tasks** using the provided dataset.
Tasks include:
1. Data Exploration and Preprocessing
2. Descriptive Analysis
3. Geospatial Analysis


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import folium
from pathlib import Path

# Input CSV location. Note that the file name contains a space before
# ".csv"; it has to match the file on disk exactly.
DATA_PATH = r"/mnt/data/Dataset .csv"

# Folder that receives the generated artefacts (cleaned CSV, HTML map).
# Missing parents are created and an already-existing folder is reused.
out_dir = Path("outputs")
out_dir.mkdir(exist_ok=True, parents=True)


## Load Dataset

In [None]:
# Read the raw CSV into the working DataFrame used by the later cells.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first five rows (rendered as this cell's output).
df.head(n=5)


## Task 1: Data Exploration and Preprocessing

In [None]:
# --- Overview ---------------------------------------------------------------
print("Dataset shape:", df.shape)

# Number of null entries in each column
missing = df.isnull().sum()
print("\nMissing values per column:\n", missing)

# --- Cleaning ---------------------------------------------------------------
# Fill absent cuisine entries with a placeholder label. `.get` with a default
# of 0 makes this a no-op when the column does not exist or has no gaps.
cuisines_missing = missing.get('Cuisines', 0)
if cuisines_missing > 0:
    df['Cuisines'] = df['Cuisines'].fillna("Unknown")


def _to_flag(value):
    """Return 1 for an affirmative token (yes / y / true / 1, any case), else 0."""
    return int(str(value).strip().lower() in ('yes', 'y', 'true', '1'))


# Add a 0/1 companion column ("<name> (bin)") for each yes/no column present.
for flag_col in ('Has Table booking', 'Has Online delivery'):
    if flag_col not in df.columns:
        continue
    df[f"{flag_col} (bin)"] = df[flag_col].map(_to_flag)

# Persist the cleaned data without the index column.
df.to_csv(out_dir / "dataset_cleaned_level1.csv", index=False)

# --- Rating distribution ----------------------------------------------------
# Histogram of the non-null aggregate ratings, 20 bins.
ratings = df['Aggregate rating'].dropna()
plt.figure(figsize=(8, 5))
plt.hist(ratings, bins=20)
plt.xlabel('Aggregate rating')
plt.ylabel('Count')
plt.title('Distribution of Aggregate Rating')
plt.grid(True)
plt.show()


## Task 2: Descriptive Analysis

In [None]:
# Summary statistics produced by DataFrame.describe()
desc = df.describe()
display(desc)

# 20 most frequent values for each categorical column that is present
categorical_cols = ['Country Code', 'City', 'Cuisines']
for present_col in (name for name in categorical_cols if name in df.columns):
    display(df[present_col].value_counts().head(20))


def _split_cuisines(text):
    """Split a comma-separated cuisine string into whitespace-trimmed names."""
    return [part.strip() for part in text.split(',')]


# Top cuisines: one list of cuisine names per restaurant, then one row per
# (restaurant, cuisine) pair so individual cuisines can be counted.
df['Cuisines_list'] = df['Cuisines'].astype(str).apply(_split_cuisines)
cuisines_exploded = df.explode('Cuisines_list')
cuisine_counts = cuisines_exploded['Cuisines_list'].value_counts()
display(cuisine_counts.head(20))

# Top cities by number of restaurants
city_counts = df['City'].value_counts()
display(city_counts.head(20))


## Task 3: Geospatial Analysis

In [None]:
# Map of restaurants: keep only the rows that have both coordinates.
map_df = df.dropna(subset=['Latitude', 'Longitude'])

# Centre the map on the mean coordinate. With no usable rows the mean is NaN,
# which is not a valid map location, so fall back to (0, 0) and still
# produce an (empty) map instead of failing.
if map_df.empty:
    map_center = [0.0, 0.0]
else:
    map_center = [map_df['Latitude'].mean(), map_df['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=2)

# The name column is optional; markers get an empty popup when it is absent.
if 'Restaurant Name' in map_df.columns:
    names = map_df['Restaurant Name']
else:
    names = [''] * len(map_df)

# Add one small circle per restaurant. Rows whose coordinates folium refuses
# are skipped (best effort), but they are counted and reported rather than
# silently dropped, and unrelated errors are no longer hidden.
skipped = 0
for lat, lon, name in zip(map_df['Latitude'], map_df['Longitude'], names):
    try:
        folium.CircleMarker(
            location=[lat, lon],
            radius=2,
            # parse_html=True escapes the text, so names taken from the CSV
            # are shown literally and cannot inject markup into the page.
            popup=folium.Popup(str(name), parse_html=True),
            fill=True,
        ).add_to(m)
    except (TypeError, ValueError):
        # NOTE(review): these are the error types folium's location
        # validation is expected to raise for malformed coordinates — confirm
        # against the installed folium version.
        skipped += 1

if skipped:
    print(f"Skipped {skipped} restaurant(s) with invalid coordinates")

m.save(str(out_dir / "restaurants_map.html"))

# Scatter plot longitude vs latitude
plt.figure(figsize=(8, 6))
plt.scatter(map_df['Longitude'], map_df['Latitude'], s=6, alpha=0.6)
plt.title('Restaurant Locations (Longitude vs Latitude)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

# Mean rating by city, highest first (all rows of df, not only mapped ones)
city_rating = df.groupby('City')['Aggregate rating'].mean().sort_values(ascending=False)
display(city_rating.head(20))
