In [None]:
# Author: Riya Narwal
# Goal: Clean and analyze SMILES data for drug discovery.
# Week 1: EDA & Preprocessing on SMILES Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available (this call is issued on
# every run of the script), then keep the English stopwords as a set.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# 1. Load Data
# Read the dataset into `df`, which the steps below operate on.
try:
    df = pd.read_csv('DDH_Data_with_Properties.csv')
except FileNotFoundError:
    print("❌ Error: File not found. Check the filename/path.")
    # Abort with a non-zero status so a calling shell/pipeline can detect the
    # failure. The bare `exit()` helper is injected by the `site` module for
    # interactive sessions, is not guaranteed to exist (e.g. `python -S`),
    # and would report success (status 0).
    raise SystemExit(1) from None
else:
    # Only reached when read_csv() succeeded.
    print("✅ Dataset loaded successfully!")

# 2. Basic Info
print("\nDataset Info:\n")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()

# 3. Shape
print(f"\nDataset shape: {df.shape}")

# 4. Check for duplicates
# Boolean mask flagging each row that exactly repeats an earlier row.
duplicates = df.duplicated()
duplicate_count = duplicates.sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")
if duplicate_count:
    # Keep only the rows that were not flagged, i.e. first occurrences.
    df = df[~duplicates]
    print("Duplicates removed.")
    print(f"New shape: {df.shape}")

# 5. Check for missing values
null_counts = df.isnull().sum()
print("\nMissing values per column:\n", null_counts)
# Discard every row that has a null in any column; filling the gaps with
# fillna() would be the alternative.
df = df.dropna(axis=0, how='any')
print(f"New shape after removing nulls: {df.shape}")

# 6. Check for inconsistent data (basic)
available_columns = df.columns
print("\nColumn names:\n", available_columns)
# The SMILES strings are assumed to live in a column named 'SMILES'.
if 'SMILES' not in available_columns:
    print("No SMILES column found — update the column name accordingly.")
else:
    smiles_preview = df['SMILES'].head()
    print("\nExample SMILES strings:\n", smiles_preview)
