<a href="https://colab.research.google.com/github/m-zayed5722/Miscellaneous-Projects/blob/main/GenAI_DS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 5) Binary classification + evaluation metrics

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay, classification_report
)

# Load the dataset as a single DataFrame (feature columns + "target")
data = load_breast_cancer(as_frame=True)
df = data.frame

print("Columns:", df.columns.tolist())
print("\nClass distribution:")
print(df["target"].value_counts())

X = df.drop(columns=["target"])
y = df["target"]

# Hold out 20% for testing; stratify so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features (important for logistic regression).
# The scaler is fitted on the training split only and then reused on the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
clf = LogisticRegression(max_iter=500)
clf.fit(X_train_scaled, y_train)

# Evaluate on the held-out test set
y_pred = clf.predict(X_test_scaled)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=data.target_names)
disp.plot()
plt.title("Confusion Matrix - Breast Cancer")
plt.show()

# Cross-validation score.
# Scaling must happen *inside* each fold: wrapping the scaler and the model in
# a pipeline makes cross_val_score refit the scaler on every training fold.
# Transforming all of X up front with the scaler fitted above would let
# held-out rows influence the scaling and tie the CV result to the earlier
# train/test split.
cv_pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("CV Accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())


In [None]:
# 6) K-Means clustering on Iris dataset + PCA visualization

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Fetch Iris as one DataFrame holding the feature columns and "target"
iris = load_iris(as_frame=True)
df = iris.frame

# Separate the features from the true labels (labels are used only for the
# rough sanity check at the end; the clustering never sees them)
X = df.drop(columns=["target"])
y = df["target"]

# Put every feature on a comparable scale before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the scaled features onto 2 principal components, for plotting only
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# K-Means with 3 clusters (Iris has 3 classes); clustering is done on the
# scaled features, not on the PCA projection
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)

# Scatter the samples in PCA space, coloured by assigned cluster
fig, ax = plt.subplots(figsize=(6, 5))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, alpha=0.7)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Iris - KMeans Clusters (PCA 2D)")
fig.colorbar(scatter, ax=ax, label="Cluster ID")
plt.show()

# Cross-tabulate true labels against cluster IDs (rough check)
confusion = pd.crosstab(
    index=y,
    columns=clusters,
    rownames=["True"],
    colnames=["Cluster"],
)
print("\nCluster vs True Label:")
print(confusion)


In [None]:
# 7) Simple time series forecasting with ARIMA on synthetic data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA

# Build a reproducible synthetic daily series: trend + seasonality + noise
np.random.seed(42)
n_days = 365
time_index = pd.date_range(start="2023-01-01", periods=n_days, freq="D")

# Linear ramp from 10 to 20 over the whole period
trend = np.linspace(10, 20, n_days)
# Sine wave with a 7-day period (weekly seasonality), amplitude 2
seasonality = 2 * np.sin(2 * np.pi * time_index.dayofyear / 7)
# Gaussian noise with unit standard deviation
noise = np.random.normal(scale=1.0, size=n_days)

y = trend + seasonality + noise
ts = pd.Series(y, index=time_index)

# Show the raw series
ax = ts.plot(figsize=(10, 4), title="Synthetic Daily Series")
ax.set_xlabel("Date")
ax.set_ylabel("Value")
plt.show()

# Chronological split: everything except the last 30 days trains the model,
# the final 30 days are held out for testing
train, test = ts.iloc[:-30], ts.iloc[-30:]

# Fit an ARIMA(p, d, q) model with a simple, untuned order
model = ARIMA(train, order=(5, 1, 0))
model_fit = model.fit()
print(model_fit.summary())

# Forecast as many steps ahead as there are test points
forecast = model_fit.forecast(steps=len(test))

# Overlay the forecast on the training and test data
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(train.index, train, label="Train")
ax.plot(test.index, test, label="Test", linestyle="--")
ax.plot(test.index, forecast, label="Forecast")
ax.legend()
ax.set_title("ARIMA Forecast vs Actual")
plt.show()

# Mean absolute error between the forecast and the held-out values
mae = np.abs(forecast - test).mean()
print(f"Test MAE: {mae:.3f}")


In [None]:
# 8) Text generation using a small GPT-2 model

# pip install transformers torch sentencepiece (if not already installed)

from transformers import pipeline, set_seed

# Create generator pipeline
generator = pipeline(
    "text-generation",
    model="gpt2"  # you can try "gpt2-medium" if your GPU/CPU can handle it
)

# Seed once so the sampled completions below are reproducible across runs
set_seed(42)

prompts = [
    "As a data scientist, my main responsibilities are",
    "The future of generative AI in healthcare is",
]

# Budget for *newly generated* tokens per completion. `max_new_tokens` is used
# instead of `max_length` because `max_length` also counts the prompt tokens,
# and newer transformers releases may warn about (or override) it when the
# pipeline already carries a default `max_new_tokens` -- verify against the
# installed version.
MAX_NEW_TOKENS = 70

for p in prompts:
    print("=" * 80)
    print("PROMPT:", p)
    outputs = generator(
        p,
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=2,
        do_sample=True,   # sample instead of greedy decoding
        top_k=50,         # restrict sampling to the 50 most likely tokens
        top_p=0.95,       # ...and to the smallest set covering 95% probability
        # GPT-2 defines no pad token; passing EOS explicitly avoids the
        # "Setting `pad_token_id` to `eos_token_id`" warning on every call.
        pad_token_id=generator.tokenizer.eos_token_id,
    )
    for i, out in enumerate(outputs, start=1):
        print(f"\nCompletion {i}:\n{out['generated_text']}\n")


In [None]:
# 9) Zero-shot classification using BART MNLI

# pip install transformers torch sentencepiece

from transformers import pipeline

# Zero-shot classification pipeline using the facebook/bart-large-mnli checkpoint
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Labels the classifier chooses between; no task-specific training is done here
candidate_labels = [
    "bug report",
    "feature request",
    "praise",
    "billing issue",
]

# Example messages to classify
texts = [
    "The app keeps crashing when I try to upload a file.",
    "Could you add support for dark mode and multiple profiles?",
    "You guys are awesome, the new update is super fast!",
]

for text in texts:
    # Classify one message at a time against all candidate labels
    result = classifier(text, candidate_labels)
    print("\nTEXT:", text)

    # Pair each returned label with its score and print one line per label
    label_scores = zip(result["labels"], result["scores"])
    for label, score in label_scores:
        print(f"  {label:15s} -> {score:.3f}")
